ó <¿CVc@sµddlmZmZddlZddlZddlZddlmZddlm Z ddl m Z m Z m Z mZddlmZdZdefd „ƒYZd „ZdS( iÿÿÿÿ(tunicode_literalstprint_functionN(tPIPE(tcompat(tfind_jart config_javatjavat _java_options(t TokenizerIu0http://nlp.stanford.edu/software/tokenizer.shtmltStanfordTokenizercBsPeZdZdZdddedd„Zed„ƒZd„Z ed„Z RS( u Interface to the Stanford Tokenizer >>> from nltk.tokenize import StanfordTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." >>> StanfordTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> s = "The colour of the wall is blue." >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] ustanford-postagger.jaruutf8u-mx1000mc Cs}t|j|dddd dtd|ƒ|_||_||_|dkrQin|}djd„|jƒDƒƒ|_ dS( Ntenv_varsuSTANFORD_POSTAGGERt searchpathturltverboseu,css'|]\}}dj||ƒVqdS(u{0}={1}N(tformat(t.0tkeytval((sh/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/stanford.pys 3s(uSTANFORD_POSTAGGER(( Rt_JARt _stanford_urlt _stanford_jart _encodingt java_optionstNonetjointitemst _options_cmd(tselft path_to_jartencodingtoptionsR R((sh/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/stanford.pyt__init__'s     cCs |jƒS(N(t splitlines(ts((sh/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/stanford.pyt_parse_tokenized_output5scCs"dg}|j|j||ƒƒS(uW Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences. u%edu.stanford.nlp.process.PTBTokenizer(R"t_execute(RR!tcmd((sh/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/stanford.pyttokenize9s c Cs8|j}|jd|gƒ|j}|rD|jd|jgƒndjtƒ}td|jd|ƒtjdddt ƒŽ}t |t j ƒr®|r®|j |ƒ}n|j|ƒ|jƒ|j|jƒt|d |jd td tƒ\}} |j|ƒ}WdQXtj|jƒtd|dt ƒ|S( Nu-charsetu-optionsu RR tmodeuwbtdeletet classpathtstdouttstderr(RtextendRRRRRttempfiletNamedTemporaryFiletFalset isinstanceRt text_typetencodetwritetflushtappendtnameRRRtdecodetostunlink( RR$tinput_R RRtdefault_optionst input_fileR)R*((sh/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/stanford.pyR#Bs&    N( t__name__t __module__t__doc__RRR.Rt staticmethodR"R%R#(((sh/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/stanford.pyR s   cCs?ddlm}y tƒWntk r:|dƒ‚nXdS(Niÿÿÿÿ(tSkipTestuadoctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn't exist(tnoseR@R t LookupError(tmoduleR@((sh/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/stanford.pyt setup_modulees   (t __future__RRR,R7tjsont subprocessRtnltkRtnltk.internalsRRRRtnltk.tokenize.apiRRR RD(((sh/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/stanford.pyt s   "M