Chinese text analyzer
chinese is a Chinese text analyzer.
NOTE: Python 2.* is not supported.
Install chinese using pip:
$ pip install chinese
$ pynlpir update
Start analyzing Chinese text:
>>> from chinese import ChineseAnalyzer
>>> analyzer = ChineseAnalyzer()
>>> result = analyzer.parse('我很高兴认识你')
>>> result.tokens()
['我', '很', '高兴', '认识', '你']
>>> result.pinyin()
'wǒ hěn gāoxìng rènshi nǐ'
>>> result.pprint()
{'original': '我很高兴认识你',
 'parsed': [{'dict_data': [{'definitions': ['I', 'me', 'my'],
                            'kind': 'Simplified',
                            'match': '我',
                            'pinyin': ['wo3']}],
             'token': ('我', 0, 1)},
            {'dict_data': [{'definitions': ['(adverb of degree)',
                                            'quite',
                                            'very',
                                            'awfully'],
                            'kind': 'Simplified',
                            'match': '很',
                            'pinyin': ['hen3']}],
             'token': ('很', 1, 2)},
            {'dict_data': [{'definitions': ['happy',
                                            'glad',
                                            'willing (to do sth)',
                                            'in a cheerful mood'],
                            'kind': 'Simplified',
                            'match': '高兴',
                            'pinyin': ['gao1', 'xing4']}],
             'token': ('高兴', 2, 4)},
            {'dict_data': [{'definitions': ['to know',
                                            'to recognize',
                                            'to be familiar with',
                                            'to get acquainted with sb',
                                            'knowledge',
                                            'understanding',
                                            'awareness',
                                            'cognition'],
                            'kind': 'Simplified',
                            'match': '认识',
                            'pinyin': ['ren4', 'shi5']}],
             'token': ('认识', 4, 6)},
            {'dict_data': [{'definitions': ['you (informal, as opposed to '
                                            'courteous 您[nin2])'],
                            'kind': 'Simplified',
                            'match': '你',
                            'pinyin': ['ni3']}],
             'token': ('你', 6, 7)}]}
>>> result = analyzer.parse('我喜歡這個味道', traditional=True)
>>> print(result)
{'味道': [{'definitions': ['flavor', 'smell', 'hint of'],
          'kind': 'Traditional',
          'match': '味道',
          'pinyin': ['wei4', 'dao5']}],
 '喜歡': [{'definitions': ['to like', 'to be fond of'],
          'kind': 'Traditional',
          'match': '喜歡',
          'pinyin': ['xi3', 'huan5']}],
 '我': [{'definitions': ['I', 'me', 'my'],
        'kind': 'Traditional',
        'match': '我',
        'pinyin': ['wo3']}],
 '這個': [{'definitions': ['this', 'this one'],
          'kind': 'Traditional',
          'match': '這個',
          'pinyin': ['zhe4', 'ge5']}]}
parse()
returns a ChineseAnalyzerResult object.
>>> from chinese import ChineseAnalyzer
>>> analyzer = ChineseAnalyzer()
# Basic usage.
>>> result = analyzer.parse('我很高兴认识你')
# If the traditional option is set to True, the analyzer tries to parse the
# provided text as traditional Chinese (繁體字).
>>> result = analyzer.parse('我喜歡這個味道', traditional=True)
# The default tokenizer is jieba's. You can also use pynlpir's to tokenize.
>>> result = analyzer.parse('我很高兴认识你', using=analyzer.tokenizer.pynlpir)
# In addition, a custom tokenizer can be passed to the method.
>>> from chinese.tokenizer import TokenizerInterface
>>> class MyTokenizer(TokenizerInterface): # Custom tokenizer must inherit from TokenizerInterface.
...     # Custom tokenizer must implement the tokenize() method.
...     def tokenize(self, string):
...         # tokenize() must return a list of tuples containing at least
...         # a string as the first element,
...         # for example: [('token1', ...), ('token2', ...), ...].
...         # As a minimal illustration, treat every character as a token,
...         # attaching its start and end offsets.
...         return [(char, i, i + 1) for i, char in enumerate(string)]
...
>>> my_tokenizer = MyTokenizer()
>>> result = analyzer.parse('我很高兴认识你', using=my_tokenizer)
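Since the sketch above treats every character as its own token, the resulting tokens are single characters:
>>> result.tokens()
['我', '很', '高', '兴', '认', '识', '你']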
# You can also specify the dictionary used for looking up each token:
# pass a path to a dictionary file whose entries follow the structure of
# CC-CEDICT's dictionary file. CC-CEDICT is used for lookups by default.
>>> result = analyzer.parse('我很高兴认识你', dictionary='path/to/dict')
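For reference, each entry in a CC-CEDICT-formatted dictionary file is a single line of the form "traditional simplified [numbered pinyin] /definition/definition/.../", for example:
高興 高兴 [gao1 xing4] /happy/glad/willing (to do sth)/in a cheerful mood/
中國 中国 [Zhong1 guo2] /China/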
original()
returns the supplied text as is.
>>> result = analyzer.parse('我很高兴认识你')
>>> result.original()
'我很高兴认识你'
tokens()
returns tokens in the provided text.
>>> result = analyzer.parse('我在学习机器学习')
>>> result.tokens()
['我', '在', '学习', '机器学习']
>>> result.tokens(details=True) # If the details option is set to True, additional information is also attached.
[('我', 0, 1), ('在', 1, 2), ('学习', 2, 4), ('机器学习', 4, 8)] # In this case, the positions of the tokens are included.
>>> result = analyzer.parse('我学习，你也学习')
>>> result.tokens(unique=True) # You can get a unique collection of tokens using the unique option.
['我', '学习', '，', '你', '也']
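Since each detailed token carries its start and end offsets (the three-element (token, start, end) shape shown above), you can, for example, slice the original text with them:
>>> text = result.original()
>>> [text[start:end] for _, start, end in result.tokens(details=True)]
['我', '学习', '，', '你', '也', '学习']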
freq()
returns a Counter object that counts the number of occurrences of each token.
>>> result = analyzer.parse('他们学习，我们学习，大家学习')
>>> result.freq()
Counter({'学习': 3, '，': 2, '他们': 1, '我们': 1, '大家': 1})
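Because freq() returns a standard collections.Counter, all of its usual methods are available, for example:
>>> result.freq().most_common(1)
[('学习', 3)]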
sentences()
returns a list of sentences in the provided text.
>>> s = '''今天天气真好。我们出去玩吧。
... 明天下雨。我们在家看书吧。
... '''
>>> result = analyzer.parse(s)
>>> result.sentences()
['今天天气真好。', '我们出去玩吧。', '明天下雨。', '我们在家看书吧。']
search()
returns a list of sentences containing the argument string.
>>> s = '我今天很忙。明天我们去看电影。'
>>> result = analyzer.parse(s)
>>> result.search('明天')
['明天我们去看电影。']
paragraphs()
returns a list of paragraphs in the provided text.
>>> s = '''今天天气真好。我们出去玩吧。
... 明天下雨。我们在家看书吧。
... '''
>>> result = analyzer.parse(s)
>>> result.paragraphs()
['今天天气真好。我们出去玩吧。', '明天下雨。我们在家看书吧。']
pinyin()
returns a pinyin representation of the provided text.
>>> result = analyzer.parse('我喜欢Python.')
>>> result.pinyin()
'wǒ xǐhuan Python.'
>>> result = analyzer.parse('下个月我去涩谷')
>>> result.pinyin() # Sometimes the analyzer cannot find a corresponding pinyin.
'xiàgèyuè wǒ qù '
>>> result.pinyin(force=True) # The force option forces it to try to convert an unknown word to pinyin.
'xiàgèyuè wǒ qù sègǔ'
pprint()
prints a formatted description of the parsed text.
>>> result = analyzer.parse('我爱看书')
>>> result.pprint()
{'original': '我爱看书',
 'parsed': [{'dict_data': [{'definitions': ['I', 'me', 'my'],
                            'kind': 'Simplified',
                            'match': '我',
                            'pinyin': ['wo3']}],
             'token': ('我', 0, 1)},
            {'dict_data': [{'definitions': ['to love',
                                            'to be fond of',
                                            'to like',
                                            'affection',
                                            'to be inclined (to do sth)',
                                            'to tend to (happen)'],
                            'kind': 'Simplified',
                            'match': '爱',
                            'pinyin': ['ai4']}],
             'token': ('爱', 1, 2)},
            {'dict_data': [{'definitions': ['to read', 'to study'],
                            'kind': 'Simplified',
                            'match': '看书',
                            'pinyin': ['kan4', 'shu1']}],
             'token': ('看书', 2, 4)}]}
say()
converts the provided text to Chinese audible speech (macOS only).
>>> result = analyzer.parse('你好，我叫Ting-Ting。')
>>> result.say() # Outputs the speech.
>>> result.say(out='say.aac') # Saves the speech to the out file.
>>> result = analyzer.parse('我们明天去北京')
>>> result.tokens()
['我们', '明天', '去', '北京']
>>> len(result)
4
>>> result = analyzer.parse('我爱看书')
>>> '看书' in result
True
>>> '跑步' in result
False
>>> result = analyzer.parse('你在说什么？')
>>> result.tokens()
['你', '在', '说', '什么', '？']
>>> shenme = result['什么'] # It's just a list of lookup results.
>>> len(shenme) # It has only one entry.
1
>>> print(shenme[0]) # Print that entry.
{'definitions': ['what?', 'something', 'anything'],
 'kind': 'Simplified',
 'match': '什么',
 'pinyin': ['shen2', 'me5']}
>>> shenme_info = shenme[0]
>>> shenme_info.definitions # Definitions of the token.
['what?', 'something', 'anything']
>>> shenme_info.match # The matched dictionary headword.
'什么'
>>> shenme_info.pinyin # The pinyin of the token.
['shen2', 'me5']
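Putting these accessors together, you can, for example, print the dictionary pinyin of every token that has an entry (a minimal sketch using only the methods shown above; it assumes punctuation tokens such as '？' have no dictionary entry and are skipped by the membership test):
>>> for token in result.tokens():
...     if token in result: # Skip tokens without dictionary entries.
...         print(token, result[token][0].pinyin)
...
你 ['ni3']
在 ['zai4']
说 ['shuo1']
什么 ['shen2', 'me5']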
MIT License
jieba and PyNLPIR are used to tokenize Chinese text.
CC-CEDICT is used to look up information about tokens.