Overview
02 Accessing Text Corpora and Lexical Resources
# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# "Natural Language Processing with Python" (《Python自然语言处理》), 02: Accessing Text Corpora and Lexical Resources
# pnlp02.py
# 2.1 Accessing text corpora
# The Gutenberg Corpus
import nltk
gtb = nltk.corpus.gutenberg.fileids()
print(gtb)
'''
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt',
'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt',
'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt',
'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt',
'shakespeare-macbeth.txt', 'whitman-leaves.txt']
'''
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
print(len(emma)) # 192427
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.concordance("surprize")  # concordance() prints its matches itself and returns None, so no print() is needed
'''
Displaying 25 of 37 matches:
er father , was sometimes taken by surprize at his being still able to pity `
...
g engaged !" Emma even jumped with surprize ;-- and , horror - struck , exclai
'''
from nltk.corpus import gutenberg
print(gutenberg.fileids())
'''
['austen-emma.txt', 'austen-persuasion.txt', ..., 'whitman-leaves.txt']
'''
# Average word length, average sentence length, and the average number of times each word appears per text
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)
'''
4 24 26 austen-emma.txt
4 26 16 austen-persuasion.txt
4 28 22 austen-sense.txt
4 33 79 bible-kjv.txt
4 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 17 12 burgess-busterbrown.txt
4 20 12 carroll-alice.txt
4 20 11 chesterton-ball.txt
4 22 11 chesterton-brown.txt
4 18 10 chesterton-thursday.txt
4 20 24 edgeworth-parents.txt
4 25 15 melville-moby_dick.txt
4 52 10 milton-paradise.txt
4 11 8 shakespeare-caesar.txt
4 12 7 shakespeare-hamlet.txt
4 12 6 shakespeare-macbeth.txt
4 36 12 whitman-leaves.txt
'''
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
print(macbeth_sentences)
# [['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ...]
print(macbeth_sentences[1037])
# ['Good', 'night', ',', 'and', 'better', 'health', 'Attend', 'his', 'Maiesty']
longest_len = max([len(s) for s in macbeth_sentences])
l1 = [s for s in macbeth_sentences if (len(s) == longest_len)]
print(l1)
'''
[['Doubtfull', 'it', 'stood', ',', 'As', 'two', 'spent', 'Swimmers', ',', 'that', 'doe', 'cling',
...
'Head', 'vpon', 'our', 'Battlements']]
'''
# Web and chat text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
'''
firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ...
grail.txt SCENE 1: [wind] [clop clop clop]
KING ARTHUR: Whoa there!
[clop ...
overheard.txt White guy: So, do you have any plans for this evening?
Asian girl ...
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ...
singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ...
wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb ...
'''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom)
'''[['now', 'im', 'left', 'with', 'this', 'gay', 'name'], [':P'], ...]'''
# The Brown Corpus
from nltk.corpus import brown
print(brown.categories())
'''
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore',
'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
'''
print(brown.words(categories='news'))
'''['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]'''
print(brown.words(fileids=['cg22']))
'''['Does', 'our', 'society', 'have', 'a', 'runaway', ',', ...]'''
print(brown.sents(categories=['news', 'editorial', 'reviews']))
'''
[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's",
'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities',
'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the',
'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election',
',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''",
'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]
'''
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m])
'''
can: 94
could: 87
may: 93
might: 38
must: 53
will: 389
'''
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)  # tabulate() prints the table itself and returns None
'''
                  can could  may might must will
           news    93    86   66    38   50  389
       religion    82    59   78    12   54   71
        hobbies   268    58  131    22   83  264
science_fiction    16    49    4    12    8   16
        romance    74   193   11    51   45   43
          humor    16    30    8     8    9   13
'''
# The Reuters Corpus
from nltk.corpus import reuters
print(reuters.fileids())
'''
['test/14826', 'test/14828', 'test/14829', ..., 'training/9994', 'training/9995']
'''
print(reuters.categories())
'''
['acq', 'alum', 'barley', 'bop', ..., 'wpi', 'yen', 'zinc']
'''
print(reuters.categories('training/9865'))
# ['barley', 'corn', 'grain', 'wheat']
print(reuters.categories(['training/9865', 'training/9880']))
# ['barley', 'corn', 'grain', 'money-fx', 'wheat']
print(reuters.fileids('barley'))
'''
['test/15618', 'test/15649', 'test/15676', ..., 'training/9865', 'training/9958']
'''
print(reuters.fileids(['barley', 'corn']))
'''
['test/14832', 'test/14858', 'test/15033', ..., 'training/9958', 'training/9989']
'''
print(reuters.words('training/9865')[:14])
'''
['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', 'BIDS', 'DETAILED', 'French', 'operators', 'have',
'requested', 'licences', 'to', 'export']
'''
print(reuters.words(['training/9865', 'training/9880']))
'''['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', ...]'''
print(reuters.words(categories='barley'))
'''['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', ...]'''
print(reuters.words(categories=['barley', 'corn']))
'''['THAI', 'TRADE', 'DEFICIT', 'WIDENS', 'IN', 'FIRST', ...]'''
# The Inaugural Address Corpus
from nltk.corpus import inaugural
print(inaugural.fileids())
'''['1789-Washington.txt', '1793-Washington.txt', ..., '2005-Bush.txt', '2009-Obama.txt']'''
res = [fileid[:4] for fileid in inaugural.fileids()]
print(res)
'''['1789', '1793', '1797', ..., '2005', '2009']'''
cfd = nltk.ConditionalFreqDist(
(target, fileid[:4])
for fileid in inaugural.fileids()
for w in inaugural.words(fileid)
for target in ['america', 'citizen']
if w.lower().startswith(target)
)
cfd.plot()
# Annotated text corpora
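# A minimal sketch of reading an annotated (part-of-speech tagged) corpus with the standard NLTK API:
print(nltk.corpus.brown.tagged_words()[:3])
# e.g. [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')]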
# Other corpora (in other languages)
print(nltk.corpus.cess_esp.words())
'''['El', 'grupo', 'estatal', 'Electricité_de_France', ...]'''
print(nltk.corpus.floresta.words())
'''['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...]'''
print(nltk.corpus.indian.words('hindi.pos'))
'''['पूर्ण', 'प्रतिबंध', 'हटाओ', ':', 'इराक', 'संयुक्त', ...]'''
print(nltk.corpus.udhr.fileids())
'''['Abkhaz-Cyrillic+Abkh', 'Abkhaz-UTF8', ..., 'Zhuang-Latin1', 'Zulu-Latin1']'''
print(nltk.corpus.udhr.words('Javanese-Latin1')[11:])
'''['Saben', 'umat', 'manungsa', 'lair', 'kanthi', 'hak', ...]'''
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
(lang, len(word))
for lang in languages
for word in udhr.words(lang + '-Latin1')
)
cfd.plot(cumulative=True)
# Text corpus structure
# help(nltk.corpus.reader)
raw = gutenberg.raw("burgess-busterbrown.txt")
print(raw[1:20]) # The Adventures of B
words = gutenberg.words("burgess-busterbrown.txt")
print(words[1:20])
'''['The', 'Adventures', 'of', 'Buster', ..., 'Bear']'''
sents = gutenberg.sents("burgess-busterbrown.txt")
print(sents[1:20])
'''[['I'], ['BUSTER', 'BEAR', 'GOES', 'FISHING'], ..., 'for', 'breakfast', '.']]'''
# Loading your own corpus
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'D:/tmp/tensorflow/data'
wordlists = PlaintextCorpusReader(corpus_root, r'my.*\.txt')  # the second argument is a regular expression over fileids
print(wordlists.fileids())          # empty unless matching files exist under corpus_root
print(wordlists.readme())           # requires a README file in corpus_root
print(wordlists.words('mya.txt'))   # requires a file named mya.txt
# 2.2 Conditional frequency distributions: ConditionalFreqDist
# Conditions and events
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said']
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County')]
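# A (condition, event) pair list like the one above can be fed directly to ConditionalFreqDist;
# a tiny illustrative check (the repr is NLTK's usual FreqDist summary):
print(nltk.ConditionalFreqDist(pairs)['news'])
# <FreqDist with 3 samples and 3 outcomes>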
# Counting words by genre
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre)
)
genre_word = [(genre, word)
for genre in ['news', 'romance']
for word in brown.words(categories=genre)]
print(len(genre_word)) # 170576
print(genre_word[:4]) # [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]
print(genre_word[-4:]) # [('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')]
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd) # <ConditionalFreqDist with 2 conditions>
print(cfd.conditions()) # ['news', 'romance']
print(cfd['news']) # <FreqDist with 14394 samples and 100554 outcomes>
print(cfd['romance']) # <FreqDist with 8452 samples and 70022 outcomes>
print(list(cfd['romance'])) # ['They', 'neither', 'liked', ..., 'expect', 'episode']
print(cfd['romance']['could']) # 193
# Plotting and tabulating distributions
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
(target, fileid[:4])
for fileid in inaugural.fileids()
for w in inaugural.words(fileid)
for target in ['america', 'citizen']
if w.lower().startswith(target)
)
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
(lang, len(word))
for lang in languages
for word in udhr.words(lang + '-Latin1')
)
cfd.tabulate(conditions=['English', 'German_Deutsch'], samples = range(10), cumulative=True)
'''
                  0    1    2    3    4    5    6    7    8    9
       English    0  185  525  883  997 1166 1283 1440 1558 1638
German_Deutsch    0  171  263  614  717  894 1013 1110 1213 1275
'''
# Generating random text with bigrams
sent = ['In', 'the', 'beginning', 'God', 'Created', 'the', 'heaven', 'and', 'the', 'earth', '.']
print(nltk.bigrams(sent)) # <generator object bigrams at 0x00000219653297D8>
print(list(nltk.bigrams(sent)))
'''
[('In', 'the'), ('the', 'beginning'), ('beginning', 'God'), ('God', 'Created'), ('Created', 'the'),
('the', 'heaven'), ('heaven', 'and'), ('and', 'the'), ('the', 'earth'), ('earth', '.')]
'''
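# A short sketch of the random-text generation the heading above refers to, in the spirit of the
# book's generate_model(); it assumes the Genesis corpus fileid 'english-kjv.txt' is installed:
def generate_model(cfdist, word, num=15):
    # emit the current word, then move to its most frequent successor
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

genesis_words = nltk.corpus.genesis.words('english-kjv.txt')
cfd_bigrams = nltk.ConditionalFreqDist(nltk.bigrams(genesis_words))
generate_model(cfd_bigrams, 'living')
# e.g. living creature that he said , and the land of the land of the land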
# 2.3 More on Python: reusing code
# Creating programs with a text editor
# Functions
# Modules
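# A small illustration of functions and modules, along the lines of the book's plural() example;
# the module name textproc.py below is just a placeholder:
def plural(word):
    # naive English pluralization rules, for illustration only
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'

print(plural('fairy'))  # fairies
print(plural('woman'))  # women
# Saved in its own file (say textproc.py), the function could be reused elsewhere with:
# from textproc import plural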
# 2.4 Lexical resources
# Wordlist corpora
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)
res = unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))
print(res)
'''['abbeyland', 'abhorred', 'abilities', ..., 'yielded', 'youngest']'''
res = unusual_words(nltk.corpus.nps_chat.words())
print(res)
'''['aaaaaaaaaaaaaaaaa', 'aaahhhh', 'abortions', ..., 'zzzzzzzing', 'zzzzzzzz']'''
from nltk.corpus import stopwords
print(stopwords.words('english'))
'''['i', 'me', 'my', 'myself', 'we', ..., 'won', 'wouldn']'''
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content)/len(text)
print(content_fraction(nltk.corpus.reuters.words())) # 0.735240435097661
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
wordlist = nltk.corpus.words.words()
res = [w for w in wordlist if len(w) >= 6
and obligatory in w
and nltk.FreqDist(w) <= puzzle_letters]
print(res)
'''['glover', 'gorlin', 'govern', 'grovel', 'ignore', ..., 'violer', 'virole']'''
names = nltk.corpus.names
print(names.fileids()) # ['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
res = [w for w in male_names if w in female_names]
print(res)
'''['Abbey', 'Abbie', 'Abby', ..., 'Winnie', 'Winny', 'Wynn']'''
# A pronouncing dictionary
entries = nltk.corpus.cmudict.entries()
print(len(entries)) # 133737
for entry in entries[39943:39951]:
    print(entry)
'''
('explorer', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0'])
('explorers', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0', 'Z'])
('explores', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'Z'])
('exploring', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'IH0', 'NG'])
('explosion', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N'])
('explosions', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N', 'Z'])
('explosive', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V'])
('explosively', ['EH2', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V', 'L', 'IY0'])
'''
for word, pron in entries:
    if len(pron) == 3:
        ph1, ph2, ph3 = pron
        if ph1 == 'P' and ph3 == 'T':
            print(word, ph2)
'''
pait EY1
pat AE1
...
put UH1
putt AH1
'''
syllable = ['N', 'IH0', 'K', 'S']  # note the zero in 'IH0'; with the letter O nothing matches and the result is []
res = [word for word, pron in entries if pron[-4:] == syllable]
print(res)
# e.g. ["atlantic's", 'audiotronics', 'avionics', ...]
res = [w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']
print(res)
'''['autumn', 'column', 'condemn', 'damn', 'goddamn', 'hymn', 'solemn']'''
res = sorted(set(w[:2] for w, pron in entries if pron[0] == 'N' and w[0] != 'n'))
print(res)
'''['gn', 'kn', 'mn', 'pn']'''
def stress(pron):
    return [char for phone in pron for char in phone if char.isdigit()]
res = [w for w, pron in entries if stress(pron) == ['0', '1', '0', '2', '0']]
print(res)
'''['abbreviated', 'abbreviated', 'abbreviating', ..., 'vocabulary', 'voluntarism']'''
res = [w for w, pron in entries if stress(pron) == ['0', '2', '0', '1', '0']]
print(res)
'''['abbreviation', 'abbreviations', 'abomination', ..., 'wakabayashi', 'yekaterinburg']'''
p3 = [(pron[0] + '-' + pron[2], word)
for (word, pron) in entries
if pron[0] == 'P' and len(pron) == 3]
cfd = nltk.ConditionalFreqDist(p3)
for template in cfd.conditions():
    if len(cfd[template]) > 10:
        words = cfd[template].keys()
        wordlist = ' '.join(words)
        print(template, wordlist[:70] + "...")
'''
P-P paap paape pap pape papp paup peep pep pip pipe pipp poop pop pope pop...
P-R paar pair par pare parr pear peer pier poor poore por pore porr pour...
P-K pac pack paek paik pak pake paque peak peake pech peck peek perc perk ...
P-S pace pass pasts peace pearse pease perce pers perse pesce piece piss p...
P-L pahl pail paille pal pale pall paul paule paull peal peale pearl pearl...
P-N paign pain paine pan pane pawn payne peine pen penh penn pin pine pinn...
P-Z pais paiz pao's pas pause paws pays paz peas pease pei's perz pez pies...
P-T pait pat pate patt peart peat peet peete pert pet pete pett piet piett...
P-CH patch pautsch peach perch petsch petsche piche piech pietsch pitch pit...
P-UW1 peru peugh pew plew plue prew pru prue prugh pshew pugh...
'''
prondict = nltk.corpus.cmudict.dict()
print(prondict['fire']) # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]
# print(prondict['blog']) # KeyError: 'blog'
prondict['blog'] = [['B', 'L', 'AA1', 'G']]
print(prondict['blog']) # [['B', 'L', 'AA1', 'G']]
text = ['natural', 'language', 'processing']
res = [ph for w in text for ph in prondict[w][0]]
print(res)
'''
['N', 'AE1', 'CH', 'ER0', 'AH0', 'L', 'L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH', 'P',
'R', 'AA1', 'S', 'EH0', 'S', 'IH0', 'NG']
'''
# Comparative wordlists
from nltk.corpus import swadesh
print(swadesh.fileids())
'''
['be', 'bg', 'bs', 'ca', 'cs', 'cu', 'de', 'en', 'es', 'fr', 'hr', 'it', 'la', 'mk', 'nl',
'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sw', 'uk']
'''
print(swadesh.words('en'))
'''['I', 'you (singular), thou', ..., 'if', 'because', 'name']'''
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)
'''[('je', 'I'), ('tu, vous', 'you (singular), thou'), ..., ('parce que', 'because'), ('nom', 'name')]'''
translate = dict(fr2en)
print(translate['chien']) # dog
print(translate['jeter']) # throw
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
print(translate['Hund']) # dog
print(translate['perro']) # dog
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
'''
('say', 'sagen', 'zeggen', 'decir', 'dire', 'dizer', 'dicere')
('sing', 'singen', 'zingen', 'cantar', 'chanter', 'cantar', 'canere')
('play', 'spielen', 'spelen', 'jugar', 'jouer', 'jogar, brincar', 'ludere')
('float', 'schweben', 'zweven', 'flotar', 'flotter', 'flutuar, boiar', 'fluctuare')
'''
# Lexicon tools: Toolbox and Shoebox
from nltk.corpus import toolbox
print(toolbox.entries('rotokas.dic'))
'''
[('kaa', [('ps', 'V'), ('pt', 'A'), ..., ('tkp', 'laplap'), ('dt', '28/Jul/2004')])]
'''
# 2.5 WordNet
# Senses and synonyms
from nltk.corpus import wordnet as wn
print(wn.synsets('motorcar')) # [Synset('car.n.01')]
print(wn.synset('car.n.01').lemma_names()) # ['car', 'auto', 'automobile', 'machine', 'motorcar']
print(wn.synset('car.n.01').definition()) # a motor vehicle with four wheels; usually propelled by an internal combustion engine
print(wn.synset('car.n.01').examples()) # ['he needs a car to get to work']
print(wn.synset('car.n.01').lemmas())
'''
[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'),
Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
'''
print(wn.lemma('car.n.01.automobile')) # Lemma('car.n.01.automobile')
print(wn.lemma('car.n.01.automobile').synset()) # Synset('car.n.01')
print(wn.lemma('car.n.01.automobile').name()) # automobile
print(wn.synsets('car'))
'''[Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]'''
for synset in wn.synsets('car'):
    print(synset.lemma_names())
'''
['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']
'''
print(wn.lemmas('car'))
'''[Lemma('car.n.01.car'), Lemma('car.n.02.car'), Lemma('car.n.03.car'),
Lemma('car.n.04.car'), Lemma('cable_car.n.01.car')]'''
# The WordNet hierarchy
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print(types_of_motorcar[26])
# one hyponym synset, e.g. Synset('ambulance.n.01'); the index ordering depends on the WordNet version
res = sorted([lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()])
print(res)
'''['Model_T', 'S.U.V.', 'SUV', 'Stanley_Steamer', ..., 'used-car', 'waggon', 'wagon']'''
print(motorcar.hypernyms()) # [Synset('motor_vehicle.n.01')]
paths = motorcar.hypernym_paths()
print(len(paths)) # 2
res = [synset.name() for synset in paths[0]]
print(res)
'''
['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01', 'instrumentality.n.03',
'container.n.01', 'wheeled_vehicle.n.01', 'self-propelled_vehicle.n.01', 'motor_vehicle.n.01', 'car.n.01']
'''
res = [synset.name() for synset in paths[1]]
print(res)
'''
['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'artifact.n.01', 'instrumentality.n.03',
'conveyance.n.03', 'vehicle.n.01', 'wheeled_vehicle.n.01', 'self-propelled_vehicle.n.01',
'motor_vehicle.n.01', 'car.n.01']
'''
print(motorcar.root_hypernyms()) # [Synset('entity.n.01')]
# More lexical relations
print(wn.synset('tree.n.01').part_meronyms())
'''[Synset('burl.n.02'), Synset('crown.n.07'), Synset('limb.n.02'), Synset('stump.n.01'), Synset('trunk.n.01')]'''
print(wn.synset('tree.n.01').substance_meronyms())
'''[Synset('heartwood.n.01'), Synset('sapwood.n.01')]'''
print(wn.synset('tree.n.01').member_holonyms())
'''[Synset('forest.n.01')]'''
for synset in wn.synsets('mint', wn.NOUN):
    print(synset.name() + ':', synset.definition())
'''
batch.n.02: (often followed by `of') a large number or amount or extent
mint.n.02: any north temperate plant of the genus Mentha with aromatic leaves and small mauve flowers
mint.n.03: any member of the mint family of plants
mint.n.04: the leaves of a mint plant used fresh or candied
mint.n.05: a candy that is flavored with a mint oil
mint.n.06: a plant where money is coined by authority of the government
'''
print(wn.synset('mint.n.04').part_holonyms()) # [Synset('mint.n.02')]
print(wn.synset('mint.n.04').substance_holonyms()) # [Synset('mint.n.05')]
print(wn.synset('walk.v.01').entailments()) # [Synset('step.v.01')]
print(wn.synset('eat.v.01').entailments()) # [Synset('chew.v.01'), Synset('swallow.v.01')]
print(wn.synset('tease.v.03').entailments()) # [Synset('arouse.v.07'), Synset('disappoint.v.01')]
print(wn.lemma('supply.n.02.supply').antonyms()) # [Lemma('demand.n.02.demand')]
print(wn.lemma('rush.v.01.rush').antonyms()) # [Lemma('linger.v.04.linger')]
print(wn.lemma('horizontal.a.01.horizontal').antonyms())
'''[Lemma('inclined.a.02.inclined'), Lemma('vertical.a.01.vertical')]'''
print(wn.lemma('staccato.r.01.staccato').antonyms()) # [Lemma('legato.r.01.legato')]
# Semantic similarity
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')
print(right.lowest_common_hypernyms(minke)) # [Synset('baleen_whale.n.01')]
print(right.lowest_common_hypernyms(orca)) # [Synset('whale.n.02')]
print(right.lowest_common_hypernyms(tortoise)) # [Synset('vertebrate.n.01')]
print(right.lowest_common_hypernyms(novel)) # [Synset('entity.n.01')]
print(wn.synset('baleen_whale.n.01').min_depth()) # 14
print(wn.synset('whale.n.02').min_depth()) # 13
print(wn.synset('vertebrate.n.01').min_depth()) # 8
print(wn.synset('entity.n.01').min_depth()) # 0
print(right.path_similarity(minke)) # 0.25
print(right.path_similarity(orca)) # 0.16666666666666666
print(right.path_similarity(tortoise)) # 0.07692307692307693
print(right.path_similarity(novel)) # 0.043478260869565216