第三章 处理原始文本
1 从网络和硬盘访问文本
#<<罪与罚>>的英文翻译 未作测试??
from urllib.request import urlopen
#分词 未作测试??
raw.find('PART I')#取得字符串索引值
raw.rfind("End of Project Gutenberg's Crime")
raw.find("PART I")
处理html 未作测试??
处理rss订阅 未作测试
import feedparser
>>> f=open('document.txt')
Traceback (most recent call last):
File "<input>", line 1, in <module>
FileNotFoundError: [Errno 2] No such file or directory: 'document.txt'
>>> import os
>>> os.listdir('.')
['.idea', 'One', 'Two']
>>> f=open('document.txt')
>>> f.read()
'this is my time\nTime files like an arrow.\nFruit files like a banana.\n'
>>> f=open('document.txt','r')
>>> for line in f:
...     print(line.strip())#删除行尾换行符
this is my time
Time files like an arrow.
Fruit files like a banana.
>>> path=nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
>>> raw=open(path,'r').read()
从PDF,MS WORD 和其他二进制文件中提取文本
>>> s=input('Enter some text')
Enter some text>? On an exceptionally hot eveing early in july
>>> print('You typed',len(nltk.word_tokenize(s)),'words')
You typed 8 words
>>> type(raw)
<class 'str'>
>>> tokens=nltk.word_tokenize(raw)
>>> type(tokens)
<class 'list'>
>>> words=[w.lower() for w in tokens]
>>> type(words)
<class 'list'>
>>> vocab=sorted(set(words))
>>> type(vocab)
<class 'list'>
>>> vocab.append('blog')
>>> raw.append('blog')
Traceback (most recent call last):
File "<input>", line 1, in <module>
AttributeError: 'str' object has no attribute 'append'
#字符串+字符串 链表+链表 链表不能加字符串
>>> query='Who knows?'
>>> query+beatles
Traceback (most recent call last):
File "<input>", line 1, in <module>
TypeError: Can't convert 'list' object to str implicitly
2 字符串:最底层的文本处理
# Three equivalent ways to write string literals: single quotes, double
# quotes (so the apostrophe needs no escaping), and single quotes with the
# apostrophe backslash-escaped.
monty = 'Monty python'
print(monty)
circus = "Monty python's Flying Circus"
print(circus)
circus = 'Monty python\'s Flying Circus'
print(circus)
>>> couplet="Shall I compare thee to a Summer's day?"\
... "Thou are more lovely and more temperate:"
>>> print(couplet)
Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:
>>> couplet=("Shall I compare thee to a Summer's day?"
... "Thou are more lovely and more temperate:")
>>> print(couplet)
Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:
>>> couplet='''Shall I compare thee to a Summer's day?
... Thou are more lovely and more temperate:'''
>>> print(couplet)
Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:
>>> couplet="""Shall I compare thee to a Summer's day?
... Thou are more lovely and more temperate:"""
>>> print(couplet)
Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:
>>> 'very'+'very'+'very'
>>> 'very'*3
>>> monty='Monty python'
>>> monty[0]
>>> monty[-1]
#不要在行尾输出换行符 ??
>>> import nltk
>>> from nltk.corpus import gutenberg
>>> fdist=nltk.FreqDist(ch.lower()for ch in raw if ch.isalpha())
>>> fdist.keys()
dict_keys(['s', 'z', 'r', 'h', 'a', 'i','n', 'b', 't', 'j', 'o', 'e', 'c', 'm', 'x', 'y', 'g', 'd', 'q', 'v', 'w', 'f','k', 'p', 'u', 'l'])
>>> monty='Monty python'
>>> monty[6:10]
>>> monty[-12:-7]
>>> monty[:5]
>>> monty[6:]
>>> phrase='And now for something completely different'
>>> if 'thing' in phrase:
... print("found 'thing")
found 'thing
>>> monty.find('python')
>>> help(str)
Help on class str in module builtins:
class str(object)
| str(object='') -> str
| str(bytes_or_buffer[,encoding[, errors]]) -> str
| Create a new string object from the given object. If encoding or
| errors is specified, then the object must expose a data buffer
| that will be decoded using the given encoding and error handler.
| Otherwise, returns the result of object.__str__() (if defined)
| or repr(object).
| encoding defaults to sys.getdefaultencoding().
| errors defaults to 'strict'.
| Methods defined here:
| __add__(self, value, /) #私有方法
| Return self+value.
| __contains__(self, key, /)
| Return key in self.
| __eq__(self, value, /)
| Return self==value.
| __format__(...)
| S.__format__(format_spec) -> str
| Return a formatted version of S as described by format_spec.
| __ge__(self, value, /)
| Return self>=value.
| __getattribute__(self, name, /)
| Return getattr(self, name).
| __getitem__(self, key, /)
| Return self[key].
| __getnewargs__(...)
| __gt__(self, value, /)
| Return self>value.
| __hash__(self, /)
| Return hash(self).
| __iter__(self, /)
| Implement iter(self).
| __le__(self, value, /)
| Return self<=value.
| __len__(self, /)
| Return len(self).
| __lt__(self, value, /)
| Return self<value.
| __mod__(self, value, /)
| Return self%value.
| __mul__(self, value, /)
| Return self*value.n
| __ne__(self, value, /)
| Return self!=value.
| __new__(*args, **kwargs) from builtins.type
| Create and return a new object. See help(type) for accurate signature.
| __repr__(self, /)
| Return repr(self).
| __rmod__(self, value, /)
| Return value%self.
| __rmul__(self, value, /)
| Return self*value.
| __sizeof__(...)
| S.__sizeof__() -> size of S in memory, in bytes
| __str__(self, /)
| Return str(self).
| capitalize(...)
| S.capitalize() -> str
| Return a capitalized version of S, i.e. make the first character
| have upper case and the rest lower case.
| casefold(...)
| S.casefold() -> str
| Return a version of S suitable for caseless comparisons.
| center(...)
| S.center(width[, fillchar]) -> str
| Return S centered in a string of length width. Padding is
| done using the specified fill character (default is a space)
| count(...) #子字符串出现的次数
| S.count(sub[, start[, end]]) -> int
| Return the number of non-overlapping occurrences of substring sub in
| string S[start:end]. Optional arguments start and end are
| interpreted as in slice notation.
| encode(...)
| S.encode(encoding='utf-8', errors='strict') -> bytes
| Encode S using the codec registered for encoding. Default encoding
| is 'utf-8'. errors may be given to set a different error
| handling scheme. Default is 'strict' meaning that encoding errors raise
| a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
| 'xmlcharrefreplace' as well as any other name registered with
| codecs.register_error that can handle UnicodeEncodeErrors.
| endswith(...) #是否以指定字符串结尾
| S.endswith(suffix[, start[, end]]) -> bool
| Return True if S ends with the specified suffix, False otherwise.
| With optional start, test S beginning at that position.
| With optional end, stop comparing S at that position.
| suffix can also be a tuple of strings to try.
| expandtabs(...)
| S.expandtabs(tabsize=8) -> str
| Return a copy of S where all tab characters are expanded using spaces.
| If tabsize is not given, a tab size of 8 characters is assumed.
| find(...) #查找子字串的第一个索引
| S.find(sub[, start[, end]]) -> int
| Return the lowest index in S where substring sub is found,
| such that sub is contained within S[start:end]. Optional
| arguments start and end are interpreted as in slice notation.
| Return -1 on failure.
| format(...) #格式化字串
| S.format(*args, **kwargs) -> str
| Return a formatted version of S, using substitutions from args and kwargs.
| The substitutions are identified by braces ('{' and '}').
| format_map(...)
| S.format_map(mapping) -> str
| Return a formatted version of S, using substitutions from mapping.
| The substitutions are identified by braces ('{' and '}').
| index(...)
| S.index(sub[, start[, end]]) -> int
| Like S.find() but raise ValueError when the substring is not found.
| isalnum(...) #是否全为字母或数字
| S.isalnum() -> bool
| Return True if all characters in S are alphanumeric
| and there is at least one character in S, False otherwise.
| isalpha(...) #是否为字母
| S.isalpha() -> bool
| Return True if all characters in S are alphabetic
| and there is at least one character in S, False otherwise.
| isdecimal(...)
| S.isdecimal() -> bool
| Return True if there are only decimal characters in S,
| False otherwise.
| isdigit(...)
| S.isdigit() -> bool
| Return True if all characters in S are digits
| and there is at least one character in S, False otherwise.
| isidentifier(...)
| S.isidentifier() -> bool
| Return True if S is a valid identifier according
| to the language definition.
| Use keyword.iskeyword() to test for reserved identifiers
| such as "def" and "class".
| islower(...) #是否小写
| S.islower() -> bool
| Return True if all cased characters in S are lowercase and there is
| at least one cased character in S, False otherwise.
| isnumeric(...)
| S.isnumeric() -> bool
| Return True if there are only numeric characters in S,
| False otherwise.
| isprintable(...)
| S.isprintable() -> bool
| Return True if all characters in S are considered
| printable in repr() or S is empty, False otherwise.
| isspace(...)
| S.isspace() -> bool
| Return True if all characters in S are whitespace
| and there is at least one character in S, False otherwise.
| istitle(...)
| S.istitle() -> bool
| Return True if S is a titlecased string and there is at least one
| character in S, i.e. upper- and titlecase characters may only
| follow uncased characters and lowercase characters only cased ones.
| Return False otherwise.
| isupper(...)#是不是大写
| S.isupper() -> bool
| Return True if all cased characters in S are uppercase and there is
| at least one cased character in S, False otherwise.
| join(...) #连接字符串
| S.join(iterable) -> str
| Return a string which is the concatenation of the strings in the
| iterable. The separator between elements is S.
| ljust(...)
| S.ljust(width[, fillchar]) -> str
| Return S left-justified in a Unicode string of length width. Padding is
| done using the specified fill character (default is a space).
| lower(...)
| S.lower() -> str
| Return a copy of the string S converted to lowercase.
| lstrip(...)
| S.lstrip([chars]) -> str
| Return a copy of the string S with leading whitespace removed.
| If chars is given and not None, remove characters in chars instead.
| partition(...)
| S.partition(sep) -> (head, sep, tail)
| Search for the separator sep in S, and return the part before it,
| the separator itself, and the part after it. If the separator is not
| found, return S and two empty strings.
| replace(...) #替换
| S.replace(old, new[, count]) -> str
| Return a copy of S with all occurrences of substring
| old replaced by new. If the optional argument count is
| given, only the first count occurrences are replaced.
| rfind(...) #反向查找
| S.rfind(sub[, start[, end]]) -> int
| Return the highest index in S where substring sub is found,
| such that sub is contained within S[start:end]. Optional
| arguments start and end are interpreted as in slice notation.
| Return -1 on failure.
| rindex(...)
| S.rindex(sub[, start[, end]]) -> int
| Like S.rfind() but raise ValueError when the substring is not found.
| rjust(...)
| S.rjust(width[, fillchar]) -> str
| Return S right-justified in a string of length width. Padding is
| done using the specified fill character (default is a space).
| rpartition(...)
| S.rpartition(sep) -> (head, sep, tail)
| Search for the separator sep in S, starting at the end of S, and return
| the part before it, the separator itself, and the part after it. If the
| separator is not found, return two empty strings and S.
| rsplit(...)
| S.rsplit(sep=None, maxsplit=-1) -> list of strings
| Return a list of the words in S, using sep as the
| delimiter string, starting at the end of the string and
| working to the front. If maxsplit is given, at most maxsplit
| splits are done. If sep is not specified, any whitespace string
| is a separator.
| rstrip(...)
| S.rstrip([chars]) -> str
| Return a copy of the string S with trailing whitespace removed.
| If chars is given and not None, remove characters in chars instead.
| split(...)
| S.split(sep=None, maxsplit=-1) -> list of strings
| Return a list of the words in S, using sep as the
| delimiter string. If maxsplit is given, at most maxsplit
| splits are done. If sep is not specified or is None, any
| whitespace string is a separator and empty strings are
| removed from the result.
| splitlines(...) #按行分割成字符串链表
| S.splitlines([keepends]) -> list of strings
| Return a list of the lines in S, breaking at line boundaries.
| Line breaks are not included in the resulting list unless keepends
| is given and true.
| startswith(...)
| S.startswith(prefix[, start[, end]]) -> bool
| Return True if S starts with the specified prefix, False otherwise.
| With optional start, test S beginning at that position.
| With optional end, stop comparing S at that position.
| prefix can also be a tuple of strings to try.
| strip(...) #返回去除首尾空白字符后的副本
| S.strip([chars]) -> str
| Return a copy of the string S with leading and trailing
| whitespace removed.
| If chars is given and not None, remove characters in chars instead.
| swapcase(...)
| S.swapcase() -> str
| Return a copy of S with uppercase characters converted to lowercase
| and vice versa.
| title(...)
| S.title() -> str
| Return a titlecased version of S, i.e. words start with title case
| characters, all remaining cased characters have lower case.
| translate(...)
| S.translate(table) -> str
| Return a copy of the string S in which each character has been mapped
| through the given translation table. The table must implement
| lookup/indexing via __getitem__, for instance a dictionary or list,
| mapping Unicode ordinals to Unicode ordinals, strings, or None. If
| this operation raises LookupError, the character is left untouched.
| Characters mapped to None are deleted.
| upper(...)
| S.upper() -> str
| Return a copy of S converted to uppercase.
| zfill(...)
| S.zfill(width) -> str
| Pad a numeric string S with zeros on the left, to fill a field
| of the specified width. The string S is never truncated.
| ----------------------------------------------------------------------
| Static methods defined here:
| maketrans(x,y=None, z=None, /)
| Return a translation table usable for str.translate().
| If there is only one argument, it must be a dictionary mapping Unicode
| ordinals (integers) or characters to Unicode ordinals, strings or None.
| Character keys will be then converted to ordinals.
| If there are two arguments, they must be strings of equal length, and
| in the resulting dictionary, each character in x will be mapped to the
| character at the same position in y. If there is a third argument, it
| must be a string, whose characters will be mapped to None in the result.
3 使用unicode进行文字处理
>>> import codecs
>>> path=nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
>>> f=codecs.open(path,encoding='latin2') #latin-2,也称为iso-8859-2
>>> for line in f:
...     line=line.strip()
...     print(line.encode('unicode_escape'))
b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'
b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'
b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'
b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'
b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'
>>> ord('a')
>>> a=u'\u0061'
>>> a
>>> print(a)
>>> nacute=u'\u0144'
>>> nacute
>>> nacute_utf=nacute.encode('utf8')
>>> print(repr(nacute_utf))
>>> import unicodedata
>>> line=lines[2]
b'Niemc\\xf3w pod koniec II wojny\\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'
>>> for c in line:
...     if ord(c)>127:
...         print('%r U+%04x %s' % (c.encode('utf-8'),ord(c),unicodedata.name(c)))
>>> line.find(u'zosta\u0142y')
>>> line=line.lower()
b'niemc\\xf3w pod koniec ii wojny\\u015bwiatowej na dolny \\u015bl\\u0105sk, zosta\\u0142y\\n'
>>> import re
>>> m=re.search(u'\u015b\w*',line)
>>> m.group()
>>> nltk.word_tokenize(line)
['niemców', 'pod', 'koniec','ii', 'wojny', 'światowej', 'na', 'dolny', 'śląsk', ',','zostały']
# -*- coding: <coding> -*-
Coding可以是latin-1 big5 utf-8
4 使用正则表达式检测词组搭配
>>> import nltk
>>> wordlist=[w for w in nltk.corpus.words.words('en') if w.islower()]
>>> [w for w in wordlist if re.search('ed$',w)]
['abaissed', 'abandoned', 'abased','abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed',
'younghearted', 'zagged', 'zed', 'zeed','zigzagged', 'zonated', 'zoned']
>>> [w for w in wordlist if re.search('^..j..t..$',w)]
['abjectly', 'adjuster', 'dejected','dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter','rejector', 'unjilted', 'unjolted', 'unjustly']
>>> [w for w in wordlist if re.search('^[ghi][mno][jlk][def]$',w)]
['gold', 'golf', 'hold', 'hole']
>>> chat_words=sorted(set(w for w in nltk.corpus.nps_chat.words()))
>>> [w for w in chat_words if re.search('^m+i+n+e+$',w)]
['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee','miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']
>>> [w for w in chat_words if re.search('^[ha]+$',w)]
['a', 'aaaaaaaaaaaaaaaaa', 'aaahhhh', 'ah','ahah', 'ahahah', 'ahh', 'ahhahahaha', 'ahhh', 'ahhhh', 'ahhhhhh','ahhhhhhhhhhhhhh', 'h', 'ha', 'haaa', 'hah', 'haha', 'hahaaa', 'hahah','hahaha', 'hahahaa', 'hahahah', 'hahahaha', 'hahahahaaa', 'hahahahahaha','hahahahahahaha', 'hahahahahahahahahahahahahahahaha', 'hahahhahah','hahhahahaha']
>>> [w for w in wsj if re.search('^[0-9]+\.[0-9]+$',w)]
['0.0085', '0.05', '0.1', '0.16', '0.2','0.25', '0.28', '0.3', '0.4', '0.5', '0.50', '0.54', '0.56', '0.60', '0.7',
'9.8', '9.82', '9.9', '92.9', '93.3','93.9', '94.2', '94.8', '95.09', '96.4', '98.3', '99.1', '99.3']
>>> [w for w in wsj if re.search('^[A-Z]+\$$',w)]
['C$', 'US$']
>>> [w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$',w)]
['10-day', '10-lap', '10-year','100-share', '12-point', '12-year', '14-hour', '15-day', '150-point','190-point', '20-point', '20-stock', '21-month', '237-seat', '240-page','27-year', '30-day', '30-point', '30-share', '30-year', '300-day', '36-day','36-store', '42-year', '50-state', '500-stock', '52-week', '69-point','84-month', '87-store', '90-day']
>>> [w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$',w)]
['black-and-white', 'bread-and-butter','father-in-law', 'machine-gun-toting', 'savings-and-loan']
>>> [w for w in wsj if re.search('(ed|ing)$',w)]
['62%-owned', 'Absorbed', 'According','Adopting', 'Advanced', 'Advancing', 'Alfred', 'Allied',
'yielded', 'yielding', 'yttrium-containing','zoomed']
5 正则表达式的有益应用
>>> re.findall(r'[aeiou]',word)
['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i','e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']
>>> fd=nltk.FreqDist(vs for word in wsj
...                  for vs in re.findall(r'[aeiou]{2,}',word))
>>> fd.items()
dict_items([('eei', 2), ('aia', 1),('aiia', 1), ('au', 106), ('ao', 6), ('eo', 39), ('ioa', 1), ('ia', 253),('uu', 1), ('ui', 95), ('oa', 59), ('iai', 1), ('ueui', 1), ('ae', 11), ('ei',86), ('ai', 261), ('eou', 5), ('ou', 329), ('ee', 217), ('uo', 8), ('iou', 27),('ie', 331), ('uie', 3), ('iu', 14), ('aii', 1), ('iao', 1), ('eu', 18),('ooi', 1), ('ue', 105), ('oui', 6), ('oei', 1), ('ieu', 3), ('oi', 65), ('io',549), ('uou', 5), ('ea', 476), ('oo', 174), ('ua', 109), ('eau', 10), ('oe',15), ('eea', 1), ('aa', 3), ('uee', 4)])
def compress(word):
    """Return *word* reduced to the pieces matched by the ``regexp`` pattern.

    Every non-overlapping match of ``regexp`` in *word* is collected with
    ``re.findall`` and the matches are concatenated in order.

    NOTE(review): ``regexp`` is read from the enclosing (module/session)
    scope and is not defined anywhere in these notes. The sample output in
    the notes has every vowel removed, which suggests a pattern such as
    r'[^AEIOUaeiou]' -- confirm before running.
    """
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
>>> print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))
nvrsl Dclrtn f Hmn Rghts Prmbl Whrs rcgntnf th nhrnt dgnty nd f th ql
nd nlnbl rghts f ll mmbrs f th hmn fmly sth fndtn f frdm , jstc nd pc
n th wrld , Whrs dsrgrd nd cntmpt fr hmnrghts hv rsltd n brbrs cts
whch hv trgd th cnscnc f mnknd , nd th dvntf wrld n whch hmn bngs
shll njy frdm f spch nd
>>> cvs=[cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]',w)]
>>> cfd=nltk.ConditionalFreqDist(cvs)
>>> cfd.tabulate()
a e i o u
k 418 148 94 420 173
p 83 31 105 34 51
r 187 63 84 89 79
s 0 0 100 2 1
t 47 8 0 148 37
v 93 27 105 48 49
def stem(word):
    """Strip the first matching suffix from *word* and return the remainder.

    Suffixes are tried in the listed order, so an earlier entry wins over a
    later one (e.g. 'ies' is tried before 'es' and 's'). If no suffix
    matches, *word* is returned unchanged.
    """
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
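>>> re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processing')
[('process', 'ing')]
>>> re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processes')
[('process', 'es')]
>>> re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$','processes')
[('processe', 's')]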
def stem(word):
    """Return *word* with one optional trailing suffix removed, using a regex.

    The stem group is non-greedy (``.*?``) so the longest suffix that fits is
    split off ('processes' -> 'process', not 'processe'), and the suffix
    group is optional so words without a listed suffix come back unchanged.
    """
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    # The pattern always matches once, yielding a single (stem, suffix) pair.
    root, _suffix = re.findall(regexp, word)[0]
    return root
>>> raw="""DENNIS:Listen,strange women lying in ponds distributing swords
... is no bassis for a system ofgovernment.Supreme executive power derives from
... a mandate form masses,not from somefarcical aquatic ceremony."""
>>> tokens=nltk.word_tokenize(raw)
>>> [stem(t) for t in tokens]
['DENNIS', ':', 'Listen', ',', 'strange','women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'bassi', 'for', 'a','system', 'of', 'government.Supreme', 'execut', 'power', 'deriv', 'from', 'a','mandate', 'form', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic','ceremony',’.’]
#找出所有a * man
>>> from nltk.corpus import gutenberg,nps_chat
>>> moby=nltk.Text(gutenberg.words('melville-moby_dick.txt'))
monied; nervous; dangerous; white; white;white; pious; queer; good;
mature; white; Cape; great; wise; wise;butterless; white; fiendish;
pale; furious; better; certain; complete;dismasted; younger; brave;
brave; brave; brave
you rule bro; telling you bro; u twiztedbro
lol lol lol; lmao lol lol; lol lol lol; lala la la la; la la la; la
la la; lovely lol lol love; lol lol lol.;la la la; la la la
#在词料库中搜索x and other ys
>>> from nltk.corpus import brown
>>> hobbies_learned.findall(r'<\w*> <and> <other> <\w*s>')
speed and other activities; water and otherliquids; tomb and other
landmarks; Statues and other monuments;pearls and other jewels;
charts and other items; roads and otherfeatures; figures and other
objects; military and other areas; demandsand other factors;
abstracts and other compilations; iron andother metals
6 规范化文本
>>> import nltk
>>> raw="""DENNIS:Listen,strange women lying in ponds distributing swords
... is no bassis for a system ofgovernment.Supreme executive power derives from
... a mandate form masses,not from somefarcical aquatic ceremony."""
>>> porter=nltk.PorterStemmer()
>>> lancaster=nltk.LancasterStemmer()
>>> [porter.stem(t) for t in tokens]
['DENNI', ':', 'Listen', ',', 'strang','women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bassi', 'for','a', 'system', 'of', 'government.Suprem', 'execut', 'power', 'deriv', 'from','a', 'mandat', 'form', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat','ceremoni', '.']
>>> [lancaster.stem(t) for t in tokens]
['den', ':', 'list', ',', 'strange', 'wom','lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bass', 'for', 'a','system', 'of', 'government.supreme', 'execut', 'pow', 'der', 'from', 'a','mand', 'form', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony','.']
>>> import nltk
class IndexedText(object):
    """Token sequence with a stem-keyed index supporting concordance lookup.

    Args:
        stemmer: object exposing ``stem(word) -> str``.
        text: sliceable sequence of string tokens.
    """

    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        # Map each lower-cased stem to the positions of the tokens carrying it.
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        """Print every occurrence of *word*'s stem with surrounding context.

        Each output line holds the left context right-aligned in *width*
        columns, a space, then the matched token and its right context
        left-aligned in *width* columns.
        """
        key = self._stem(word)
        wc = int(width / 4)  # number of context tokens taken on each side
        for i in self._index[key]:
            # Clamp at 0: a negative start would wrap to the end of the text
            # and yield an empty left context for hits near the beginning.
            lcontext = ' '.join(self._text[max(0, i - wc):i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print(ldisplay, rdisplay)

    def _stem(self, word):
        """Return the lower-cased stem of *word* as given by the stemmer."""
        return self._stemmer.stem(word).lower()
# Build an IndexedText over the words of 'grail.txt' from NLTK's webtext
# corpus, using the Porter stemmer to compute the index keys.
porter = nltk.PorterStemmer()
grail =nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
r king ! DENNIS : Listen , strange womenlying in ponds distributing swords is no
beata very brave retreat . ROBIN : All lies ! MINSTREL : [ singing ] Bravest of
Nay . Nay . Come . Come . You may lie here . Oh , but you are wounded!
doctors immediately ! No , no , please !Lie down . [ clap clap ] PIGLET : Well
ere is much danger , for beyond the cavelies the Gorge of Eternal Peril , which
you . Oh ... TIM : To the north there lies a cave -- the cave ofCaerbannog --
h it and lived ! Bones of full fifty menlie strewn about its lair . So , brave k
not stop our fight ' til each one of youlies dead , and the Holy Grail returns t
词形归并 (词形归并器)
>>> import nltk
>>> raw="""DENNIS:Listen,strange women lying in ponds distributing swords
... is no bassis for a system ofgovernment.Supreme executive power derives from
... a mandate form masses,not from somefarcical aquatic ceremony."""
>>> tokens=nltk.word_tokenize(raw)
>>> wnl=nltk.WordNetLemmatizer()
>>> [wnl.lemmatize(t) for t in tokens]
['DENNIS', ':', 'Listen', ',', 'strange','woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'bassis','for', 'a', 'system', 'of', 'government.Supreme', 'executive', 'power','derives', 'from', 'a', 'mandate', 'form', 'mass', ',', 'not', 'from', 'some','farcical', 'aquatic', 'ceremony', '.']
7 用正则表达式为文本分词
>>> import nltk
>>> import re
>>> raw="""'WhenI'm a Duchess,' she said herself,(not in a very hopefultone
... though),'I won't have any peper in mykitchen AT ALL.Soup does very
... well tithout--Maybe it's always peperthat makes people hot-tempered,'..."""
>>> re.split(r' ',raw)
["'When", "I'm", 'a',"Duchess,'", 'she', 'said', 'herself,(not', 'in', 'a', 'very',"hopefultone\nthough),'I", "won't", 'have', 'any', 'peper','in', 'my', 'kitchen', 'AT', 'ALL.Soup', 'does', 'very\nwell','tithout--Maybe', "it's", 'always', 'peper', 'that', 'makes','people', "hot-tempered,'..."]
#使用'[ \t\n]+',可以匹配一个或多个空格,制表符或换行
>>> re.split(r'[ \t\n]+',raw)
["'When", "I'm", 'a',"Duchess,'", 'she', 'said', 'herself,(not', 'in', 'a', 'very','hopefultone', "though),'I", "won't", 'have', 'any','peper', 'in', 'my', 'kitchen', 'AT', 'ALL.Soup', 'does', 'very', 'well','tithout--Maybe', "it's", 'always', 'peper', 'that', 'makes','people', "hot-tempered,'..."]
>>> re.split(r'\W+',raw)
['', 'When', 'I', 'm', 'a', 'Duchess','she', 'said', 'herself', 'not', 'in', 'a', 'very', 'hopefultone', 'though','I', 'won', 't', 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL','Soup', 'does', 'very', 'well', 'tithout', 'Maybe', 'it', 's', 'always','peper', 'that', 'makes', 'people', 'hot', 'tempered', '']
>>> re.findall(r'\w+|\S\w*',raw)
["'When", 'I', "'m",'a', 'Duchess', ',', "'", 'she', 'said', 'herself', ',', '(not','in', 'a', 'very', 'hopefultone', 'though', ')', ',', "'I", 'won',"'t", 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL','.Soup', 'does', 'very', 'well', 'tithout', '-', '-Maybe', 'it',"'s", 'always', 'peper', 'that', 'makes', 'people', 'hot','-tempered', ',', "'", '.', '.', '.']
["'", 'When', "I'm",'a', 'Duchess', ',', "'", 'she', 'said', 'herself', ',', '(', 'not','in', 'a', 'very', 'hopefultone', 'though', ')', ',', "'", 'I',"won't", 'have', 'any', 'peper', 'in', 'my', 'kitchen', 'AT', 'ALL','.', 'Soup', 'does', 'very', 'well', 'tithout', '--', 'Maybe',"it's", 'always', 'peper', 'that', 'makes', 'people', 'hot-t','empered', ',', "'", '...']
>>> text='That U.S.A. poster-print costs $12.40..,'
>>> pattern=r'''(?x)
... ([A-Z]\.)+
... | \w+(-\w+)*
... | \$?\d+(\.\d+)?%?
... | \.\.\.
... | [][.,;"'?():-_`]
... '''
8 分割
>>> import pprint
>>> sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
>>> pp.pprint(sents[171:181])
[ 'In the wild events which were to follow this girl had no\n'
'part at all; he never saw her again until all his tale was over.',
'And yet, in some indescribable way, she kept recurring like a\n'
'motive in music through all his mad adventures afterwards, and the\n'
'glory of her strange hair ran like a red thread through those dark\n'
'and ill-drawn tapestries of the night.',
'For what followed was so\n'
'improbable, that it might well have been a dream.',
'When Syme went out into the starlit street, he found it for the\n'
'moment empty.',
'Then he realised (in some odd way) that the silence\n'
'was rather a living silence than a dead one.',
'Directly outside the\n'
'door stood a street lamp, whose gleam gilded the leaves of the tree\n'
'that bent out over the fence behind him.',
'About a foot from the\n'
'lamp-post stood a figure almost as rigid and motionless as the\n'
'lamp-post itself.',
'The tall hat and long frock coat were black; the\n'
'face, in an abrupt shadow, was almost as dark.',
'Only a fringe of\n'
'fiery hair against the light, and also something aggressive in the\n'
'attitude, proclaimed that it was the poet Gregory.',
'He had something\n'
'of the look of a masked bravo waiting sword in hand for his foe.']
def segment(text, segs):
    """Split *text* into words according to the boundary string *segs*.

    ``segs[i] == '1'`` marks a word boundary immediately after ``text[i]``;
    any other character means no boundary there. The remainder of *text*
    after the last marked boundary is always emitted as the final word.

    Returns:
        list of str: the segmented words, in order.
    """
    words = []
    last = 0  # start index of the word currently being accumulated
    for i, flag in enumerate(segs):
        if flag == '1':
            words.append(text[last:i + 1])
            last = i + 1
    words.append(text[last:])
    return words
>>> segment(text,seg1)
['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']
>>> segment(text,seg2)
['do', 'you', 'see', 'the', 'kitty', 'see','the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']
def evaluate(text, segs):
    """Score a segmentation of *text*; smaller is better.

    The score is the number of words produced by ``segment(text, segs)``
    plus the size of the lexicon, measured as the length of the distinct
    words joined by single spaces (one separator between entries).
    """
    words = segment(text, segs)
    text_size = len(words)
    # Join with a space, not '': the scores recorded in these notes (63 for
    # the four-word segmentation, 42 for the final one) include one
    # separator between lexicon entries.
    lexicon_size = len(' '.join(set(words)))
    return text_size + lexicon_size
>>> segment(text,seg3)
['doyou', 'see', 'thekitt', 'y', 'see','thedogg', 'y', 'doyou', 'like', 'thekitt', 'y', 'like', 'thedogg', 'y']
>>> evaluate(text,seg1)
>>> evaluate(text,seg2)
>>> evaluate(text,seg3)
from random import randint


def flip(segs, pos):
    """Return *segs* with the '0'/'1' character at index *pos* inverted."""
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]


def flip_n(segs, n):
    """Return *segs* after *n* flips at independently chosen random positions."""
    for i in range(n):
        segs = flip(segs, randint(0, len(segs) - 1))
    return segs


def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-scoring segmentation of *text* by random bit flips.

    Starting from *segs*, each round tries *iterations* random perturbations
    (flipping ``round(temperature)`` positions), keeps the best-scoring one
    according to ``evaluate``, prints its score and segmentation, and then
    divides the temperature by *cooling_rate*. The loop stops once the
    temperature is no longer above 0.5.

    Returns:
        str: the best boundary string found.
    """
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    # A bare ``print`` is a no-op expression in Python 3; call it to emit the
    # trailing blank line.
    print()
    return segs
>>> anneal(text, seg1, 5000, 1.2)
63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy','doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty','likethedoggy']
60 ['doyousee', 'thekitty', 'seethedoggy','doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']
60 ['doyousee', 'thekitty', 'seethedoggy','doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']
60 ['doyousee', 'thekitty', 'seethedoggy','doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']
60 ['doyousee', 'thekitty', 'seethedoggy','doyou', 'l', 'ike', 'thekitty', 'l', 'ike', 'thedoggy']
58 ['doyo', 'u', 'see', 'thekitty', 'see','thedoggy', 'doyo', 'ul', 'ike', 'thekitty', 'l', 'i', 'k', 'e', 'thedoggy']
58 ['doyo', 'u', 'see', 'thekitty', 'see','thedoggy', 'doyo', 'ul', 'ike', 'thekitty', 'l', 'i', 'k', 'e', 'thedoggy']
54 ['doyo', 'u', 'see', 'thekitty', 'see','thedoggy', 'doyo', 'u', 'l', 'ike', 'thekitty', 'l', 'ik', 'e', 'thedoggy']
51 ['doyo', 'u', 'see', 'thekitty', 'see','t', 'hedoggy', 'doyo', 'u', 'l', 'ike', 'thekitty', 'l', 'ike', 't','hedoggy']
48 ['doyo', 'u', 'see', 'thekitty', 'see','t', 'hedoggy', 'doyo', 'u', 'like', 'thekitty', 'like', 't', 'hedoggy']
45 ['doyou', 'see', 'thekitty', 'see', 't','hedoggy', 'doyou', 'like', 'thekitty', 'like', 't', 'hedoggy']
45 ['doyou', 'see', 'thekitty', 'see', 't','hedoggy', 'doyou', 'like', 'thekitty', 'like', 't', 'hedoggy']
42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see', 'thedoggy','doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
42 ['doyou', 'see', 'thekitty', 'see','thedoggy', 'doyou', 'like', 'thekitty', 'like', 'thedoggy']
9 格式化:从链表到字符串
>>> ' '.join(silly)
'We called him Tortoise because he taught us .'
>>> ";".join(silly)
>>> "".join(silly)
>>> word='cat'
>>> sentence="""hello
... world"""
>>> print(word)
>>> print(sentence)
>>> word
>>> sentence
>>> import nltk
>>> for word in fdist:
... print(word,'->',fdist[word],';',)
dog -> 4 ;
snake -> 1 ;
cat -> 3 ;
>>> for word in fdist:
... print('%s->%d' % (word,fdist[word]),)
>>> import nltk
>>> from nltk.corpus import brown
>>>def tabulate(cfdist, words, categories):
print('%-16s' % 'Category', )
for word in words:
print('%6s' % word, end="",sep=None)
for category in categories:
print('%-16s' % category,end="", sep=None)
for word in words: