ó <¿CVc@s¹ddlmZddlZddlmZddlmZddlmZm Z m Z ddl m Z m Z d„Zde fd „ƒYZd „Zd „Zed krµeƒndS( iÿÿÿÿ(tprint_functionN(t string_types(tDependencyGraph(tFileSystemPathPointertfind_corpus_fileidstread_blankline_block(tSyntaxCorpusReadert CorpusReadercCsdjd„|DƒƒS(Nt/css)|]}|ddkr|dVqdS(itEOSN((t.0tm((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pys s(tjoin(tmorphs((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pytstKNBCorpusReadercBsDeZdZded„Zd„Zd„Zdd„Zd„Z RS(sµ This class implements: - ``__init__``, which specifies the location of the corpus and a method for detecting the sentence blocks in corpus files. - ``_read_block``, which reads a block from the input stream. - ``_word``, which takes a block and returns a list of list of words. - ``_tag``, which takes a block and returns a list of list of tagged words. - ``_parse``, which takes a block and returns a list of parsed sentences. The structure of tagged words: tagged_word = (word(str), tags(tuple)) tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...) Usage example ------------- >>> from nltk.corpus.util import LazyCorpusLoader >>> knbc = LazyCorpusLoader( ... 'knbc/corpus1', ... KNBCorpusReader, ... r'.*/KN.*', ... encoding='euc-jp', ... ) >>> len(knbc.sents()[0]) 9 tutf8cCs#tj||||ƒ||_dS(s– Initialize KNBCorpusReader morphs2str is a function to convert morphlist to str for tree representation for _parse() N(Rt__init__t morphs2str(tselftroottfileidstencodingR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pyR;scCs t|ƒS(N(R(Rtstream((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pyt _read_blockDscCs\g}xO|jƒD]A}tjd|ƒs|jƒjdƒ}|j|dƒqqW|S(Ns EOS|\*|\#|\+t i(t splitlinestretmatchtstriptsplittappend(Rtttrestlinetcells((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pyt_wordHs cCsog}xb|jƒD]T}tjd|ƒs|jƒjdƒ}|j|ddj|dƒfƒqqW|S(Ns EOS|\*|\#|\+Rii(RRRRRRR (RR ttagsetR!R"R#((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pyt_tagSs +c Cs¨tƒ}d}xO|jƒD]A}|ddkrü|jƒjddƒ}tjd|dƒ}|dk srt‚|j|}|j i|d6|j dƒd 6gd 6ƒt |j dƒƒ}|d kr×||_ n|j|d j |ƒ|d7}q|dd kr|jƒjdƒ}|ddj|dƒf} |j|dd j | ƒqqW|jržx1|jjƒD]}|j|d ƒ|d  sicSs djd„|DƒƒjdƒS(NRcssD|]:}|ddkrd|d|djdƒdfVqdS(iR s%s(%s)iRiN(R(R R ((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pys £ssutf-8(R tencode(R ((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pyR¢scss|]}d|VqdS(s%sN((R R3((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pys ¦ss css(|]}djd„|DƒƒVqdS(Rcss4|]*}d|d|djdƒdfVqdS(s%s/%siiRiN(R(R tw((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pys ªsN(R (R tsent((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pys ªsi(tnltktnltk.corpus.utilR>tdatatfindRRRtsearchRtsortedtprintRR twordst parsed_sentsRt tagged_sents(RHR>RtfRRAtknbc((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pytdemos"   ' 'cCs°ddlm}|dtdddƒ}t|jƒdtƒsGt‚t|jƒddtƒsjt‚t|jƒdt ƒs‰t‚t|j ƒddt ƒs¬t‚dS(Niÿÿÿÿ(R>s knbc/corpus1s.*/KN.*Rseuc-jpi( RIR>Rt isinstanceRORR-tsentst tagged_wordsttupleRQ(R>RS((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pyttest°s#t__main__(t __future__RRt nltk.compatRt nltk.parseRtnltk.corpus.reader.utilRRRtnltk.corpus.reader.apiRRR=RRTRYR:(((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/knbc.pyt s  r #