U C^@sddlmZddlmZddlmZddlmZddlm Z ddl m Z d d l m Z d d lmZd d lmZd dZGddde ZGdddejZGdddeZdgZdS))unicode_literals)LANG)Language)Doc)DummyTokenizer)BASE_EXCEPTIONS) LEX_ATTRS) STOP_WORDS)TAG_MAPcCs8zddl}|WStk r2|r.d}t|YnXdS)NrzgJieba not installed. Either set Chinese.use_jieba = False, or install it https://github.com/fxsjy/jieba)jieba ImportError) use_jiebarmsgr9/tmp/pip-install-6_kvzl1k/spacy/spacy/lang/zh/__init__.pytry_jieba_importsrc@seZdZdddZddZdS)ChineseTokenizerNcCsB|dk r|jn|||_|j|_t|j|_t||_dSN) vocabZ create_vocabrr jieba_segrDefaultscreate_tokenizer tokenizer)selfclsnlprrr__init__s zChineseTokenizer.__init__cCsl|jrtdd|jj|ddD}|dg}dg}tdt|D]}||}|r|drt|||dq|dkr|dsd |d<q|dr|d|7<q|||dqB|||dqBt|j ||d Sg}g}| |D]\}|j r&||j |dq| t|j | dgt|j t |j|d<qt|j ||d S) NcSsg|] }|r|qSrr).0xrrr 'sz-ChineseTokenizer.__call__..F)Zcut_allrr  T)wordsspaces)rlistrZcutrangelenisspaceappendrrrtextextendboolZ whitespace_)rr,Z jieba_wordsr%r&iwordtokenrrr__call__#s>            zChineseTokenizer.__call__)N)__name__ __module__ __qualname__rr2rrrrrs rc@sXeZdZeejjZeeddee <e Z e Z eZddddZdZed dd ZdS) ChineseDefaultscCsdS)Nzhr)r,rrrVzChineseDefaults.ZltrF) directionZhas_caseZ has_lettersTNcCs t||Sr)r)rrrrrr]sz ChineseDefaults.create_tokenizer)N)r3r4r5dictrrZlex_attr_gettersupdater rr tokenizer_exceptionsr stop_wordsr tag_mapZwriting_systemr classmethodrrrrrr6Ss    r6c@seZdZdZeZddZdS)Chineser7cCs ||Sr)r)rr,rrrmake_docfszChinese.make_docN)r3r4r5langr6rrBrrrrrAbsrAN) __future__rattrsrlanguagertokensrutilrr=r Z lex_attrsr r>r r?r rrrr6rA__all__rrrrs         7