ó <¿CVc@sôddlZddlZddlZddlmZddlmZddlmZm Z ddl Z d„Z defd„ƒYZ de fd „ƒYZ d dd „ƒYZd e fd „ƒYZde fd„ƒYZde fd„ƒYZdS(iÿÿÿÿN(tcompat(tconcat(tXMLCorpusReadert XMLCorpusViewcs%tjˆƒd‡fd†ƒ}|S(sj Wraps function arguments: if fileids not specified then function set NKJPCorpusReader paths. cs"|s|j}nˆ|||S(N(t_paths(tselftfileidstkwargs(tfun(si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt decorators N(t functoolstwrapstNone(RR ((Rsi/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt _parse_argss tNKJPCorpusReadercBs­eZdZdZdZdZdd„Zd„Zd„Zdd„Z d „Z e dd „ƒZ e dd „ƒZe dd „ƒZe dd „ƒZe dd„ƒZRS(iiiis.*cCsit|tjƒr,tj|||dƒn*tj||g|D]}|d^q?ƒ|jƒ|_dS(sN Corpus reader designed to work with National Corpus of Polish. See http://nkjp.pl/ for more details about NKJP. use example: import nltk import nkjp from nkjp import NKJPCorpusReader x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus x.header() x.raw() x.words() x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html x.sents() x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s) x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy']) x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp']) s .*/header.xmls /header.xmlN(t isinstanceRt string_typesRt__init__t get_pathsR(RtrootRtfileid((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR&s*cCs?g|jD]1}tjjt|jƒ|jdƒdƒ^q S(Ns header.xmli(t_fileidstostpathtjointstrt_roottsplit(Rtf((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR>scCs'g|jD]}|jdƒd^q S(sf Returns a list of file identifiers for the fileids that make up this corpus. s header.xmli(RR(RR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRAscKsª|jdtjƒ}|tjkr4t|d|ƒS|tjkrSt|d|ƒS|tjkrrt|d|ƒS|tjkršt |d|dt jƒSt dƒ‚dS(sQ Returns a view specialised for use with particular corpus file. tmodettagss No such mode!N( tpopRt WORDS_MODEtNKJPCorpus_Morph_Viewt SENTS_MODEtNKJPCorpus_Segmentation_Viewt HEADER_MODEtNKJPCorpus_Header_ViewtRAW_MODEtNKJPCorpus_Text_Viewt NameError(RtfilenameRRR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt_viewHscCs|j|kr|S|j|S(s< Add root if necessary to specified fileid. (R(RR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pytadd_rootYscKsAtg|D]0}|j|j|ƒdtj|jƒ^q ƒS(s9 Returns header(s) of specified fileids. R(RR*R+RR$t handle_query(RRRR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pytheaderascKsAtg|D]0}|j|j|ƒdtj|jƒ^q ƒS(s9 Returns sentences in specified fileids. R(RR*R+RR"R,(RRRR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pytsentsjscKsAtg|D]0}|j|j|ƒdtj|jƒ^q ƒS(s5 Returns words in specified fileids. R(RR*R+RR R,(RRRR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pytwordsssc KsY|jdgƒ}tg|D]6}|j|j|ƒdtjd||jƒ^qƒS(s… Call with specified tags as a list, e.g. tags=['subst', 'comp']. Returns tagged words in specified fileids. RR(RRR*R+RR R,(RRRRR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt tagged_words}scKsAtg|D]0}|j|j|ƒdtj|jƒ^q ƒS(s5 Returns words in specified fileids. R(RR*R+RR&R,(RRRR((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pytrawˆsN(t__name__t __module__R R"R$R&RRRR R*R+R R-R.R/R0R1(((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR s&       R%cBs#eZd„Zd„Zd„ZRS(cKs'd|_tj||d|jƒdS(sƒ HEADER_MODE A stream backed corpus view specialized for use with header.xml files in NKJP corpus. s.*/sourceDesc$s header.xmlN(ttagspecRR(RR)R((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR”s cCsc|jƒg}xBtrTtj||jƒ}t|ƒdkrDPn|j|ƒqW|jƒ|S(Ni(t_opentTrueRt read_blockt_streamtlentextendtclose(RR-tsegm((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR,s   cCsx|jdƒ}g}|r7djd„|Dƒƒ}n|jdƒ}g}|rndjd„|Dƒƒ}n|jdƒ}g}|r¥djd„|Dƒƒ}n|jdƒ} g} | rÜdjd „| Dƒƒ} n|jd ƒ} g} | rdjd „| Dƒƒ} n|jd ƒ} g}| rJdjd „| Dƒƒ}ni|d6|d6|d6| d6| d6|d6S(Ns bibl/titles css|]}|jjƒVqdS(N(ttexttstrip(t.0ttitle((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pys ¬ss bibl/authorcss|]}|jjƒVqdS(N(R=R>(R?tauthor((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pys ±ss bibl/datecss|]}|jjƒVqdS(N(R=R>(R?tdate((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pys ¶ssbibl/publishercss|]}|jjƒVqdS(N(R=R>(R?t publisher((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pys »ss bibl/idnocss|]}|jjƒVqdS(N(R=R>(R?tidno((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pys Àss bibl/notecss|]}|jjƒVqdS(N(R=R>(R?tnote((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pys ÅsR@RARBRCRDRE(tfindallR(RtelttcontextttitlesR@tauthorsRAtdatesRBt publishersRCtidnosRDtnotesRE((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt handle_elt¨s4(R2R3RR,RO(((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR%’s tXML_ToolcBs)eZdZd„Zd„Zd„ZRS(s Helper class creating xml file to one without references to nkjp: namespace. That's needed because the XMLCorpusView assumes that one can find short substrings of XML that are valid XML, which is not true if a namespace is declared at top level cCs1tjj||ƒ|_tjdtƒ|_dS(Ntdelete(RRRt read_filettempfiletNamedTemporaryFiletFalset write_file(RRR)((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRÑscCs<yt|jdƒ}|j}d}xÎt|ƒrô|jƒ}tjd|ƒ}dj|ƒ}tjd|ƒ}dj|ƒ}tjd|ƒ}dj|ƒ}tjd|ƒ}dj|ƒ}tjd|ƒ}dj|ƒ}|j|ƒq'W|j ƒ|j ƒ|jj SWn!t k r7|j ƒt ‚nXdS(Ntrt s nkjp:[^ ]* s s ss ( topenRRRVR9treadlinetreRRtwriteR;tnamet Exceptiontremove_preprocessed_file(Rtfrtfwtlinetxtret((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pytbuild_preprocessed_fileÕs.      cCstj|jjƒdS(N(RtremoveRVR](R((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR_îs(R2R3t__doc__RReR_(((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRPËs  R#cBsVeZdZd„Zd„Zd„Zd„Zd„Zd„Zd„Z d„Z RS( sm A stream backed corpus view specialized for use with ann_segmentation.xml files in NKJP corpus. cKscd|_t|dtjƒ|_|jjƒt|dƒ|_tj||jj ƒ|jƒdS(Ns.*p/.*sRsann_segmentation.xml( R4R'R"t text_viewR,RPtxml_toolRRRe(RR)R((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRùs   cCs|jdƒdjdƒdS(Nt(it,i(R(Rt example_word((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt get_segm_idscCst|jdƒdƒS(NRki(tintR(Rtbeg_word((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt get_sent_begscCs8|jdƒdjdƒ}t|dƒt|dƒS(Nt)iRkii(RRn(Rtend_wordtsplitted((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt get_sent_end scCs^|j|dƒ}|jj|}|j|dƒ}|j|t|ƒdƒ}|||!S(Nii(RmRht segm_dictRpRtR9(Rt sent_segmtidR<tbegtend((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt get_sentencess cCs€g}d}d}xg|D]_}|j|ƒ}|j|ƒ|dksS||krr|j|ƒ|j|ƒ}n|}qW|S(Niÿÿÿÿi(RmRptappendRt(RR<Rdt prev_txt_endt prev_txt_nrtwordttxt_nr((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyt remove_choices %  cCsÈy|jƒg}xktr€tj||jƒ}t|ƒdkrGPnx3|D]+}|j|ƒ}|j|j|ƒƒqNWqW|j ƒ|j j ƒ|SWn$t k rÃ|j j ƒt ‚nXdS(Ni( R5R6RR7R8R9R€R{RzR;RiR_R^(Rt sentencesRvR<((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR,%s        cCs1g}x$|D]}|j|jdƒƒq W|S(Ntcorresp(R{tget(RRGRHRdtseg((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRO7s ( R2R3RgRRmRpRtRzR€R,RO(((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR#ós      R'cBsMeZdZdZdZd„Zd„Zddd„Zd„Z d„Z RS( sa A stream backed corpus view specialized for use with text.xml files in NKJP corpus. iicKs_|jddƒ|_d|_tƒ|_t|dƒ|_tj||jj ƒ|jƒdS(NRis .*/div/abstext.xml( RRR4tdictRuRPRiRRRe(RR)R((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRFs   cCsfy;|jƒ|j|jƒ}|jƒ|jjƒ|SWn$tk ra|jjƒt‚nXdS(N(R5R7R8R;RiR_R^(RRc((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR,Os     cCs|g}xPtrXtj||ƒ}t|ƒdkr7Pnx|D]}|j|ƒq>Wq Wdjg|D] }|^qfƒgS(s6 Returns text as a list of sentences. iRX(R6RR7R9R{R(RtstreamR4t elt_handlerttxtR<tpart((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR7Zs  cCs4x-|jD]"}|jdƒr |j|ƒSq WdS(NRw(tattribtendswithRƒ(RRGtattr((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRmhscCs5|jtjkr.|j|j|j|ƒs  R!cBs)eZdZd„Zd„Zd„ZRS(sm A stream backed corpus view specialized for use with ann_morphosyntax.xml files in NKJP corpus. cKsS|jddƒ|_d|_t|dƒ|_tj||jjƒ|jƒdS(NRs .*/seg/fssann_morphosyntax.xml( RR RR4RPRiRRRe(RR)R((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRzs cCs¿y”|jƒg}xbtrwtj||jƒ}t|ƒdkrGPnx*|D]"}|dk rN|j|ƒqNqNWqW|jƒ|j j ƒ|SWn$t k rº|j j ƒt ‚nXdS(Ni( R5R6RR7R8R9R R{R;RiR_R^(RR/R<R‰((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR,€s         c Cs¼d}t}t}|jdkr*t}nx{|D]s}d|jƒkr‹|jddkr‹xE|D]!}|jdkrc|j}qcqcWq1d|jƒkr1|jddkr1xñ|D]æ}d|jƒkr·|jddkr·x¸|D]­}d|jƒkré|jddkréx|D]t} d | jƒkra|jdk ra| jd |jkrat}qd | jƒkr| jd d krt}qqWqéqéWq·q·Wq1q1W|r¸|r¸|SdS( NtR]torthtstringtinterpsttypetlextctagtvaluetinterp(RUR6RR tkeysRŠttagR=( RRGRHR~tflagt is_not_interptchildtsymboltsymbol2tsymbol3((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyRO’s,  % % % % 7 %" (R2R3RgRR,RO(((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyR!ts  ((R RRStnltkRtnltk.corpus.reader.utilRtnltk.corpus.reader.xmldocsRRR[R RR%RPR#R'R!(((si/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/nkjp.pyts     r9(K6