є
<┐CVc           @  s#  d  Z  d d l m Z m Z d d l Z d d l Z d d l m Z d d l m	 Z	 m
 Z
 m Z d d l m Z d d l m Z d d >Z d d	 >Z d d
 >Z d d >Z d d >Z d d >Z e e e Z e e e Z i e d d f 6e d d f 6e d d f 6e d d f 6e d d f 6e d d f 6Z d Z d Z d Z d Z d Z d Z d Z d e  f d Д  Г  YZ! e j" d e j# Г Z$ d Д  Z% d e  f d Д  Г  YZ& e
 d e  f d  Д  Г  YГ Z' d! e  f d" Д  Г  YZ( d# e( f d$ Д  Г  YZ) d% e( e f d& Д  Г  YZ* d' Z+ d( Д  Z, e* e) d) Д Z- d S(*   uЗ  
Punkt Sentence Tokenizer

This tokenizer divides a text into a list of sentences,
by using an unsupervised algorithm to build a model for abbreviation
words, collocations, and words that start sentences.  It must be
trained on a large collection of plaintext in the target language
before it can be used.

The NLTK data package includes a pre-trained Punkt tokenizer for
English.

    >>> import nltk.data
    >>> text = '''
    ... Punkt knows that the periods in Mr. Smith and Johann S. Bach
    ... do not mark sentence boundaries.  And sometimes sentences
    ... can start with non-capitalized words.  i is a good variable
    ... name.
    ... '''
    >>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip())))
    Punkt knows that the periods in Mr. Smith and Johann S. Bach
    do not mark sentence boundaries.
    -----
    And sometimes sentences
    can start with non-capitalized words.
    -----
    i is a good variable
    name.

(Note that whitespace from the original text, including newlines, is
retained in the output.)

Punctuation following sentences is also included by default
(from NLTK 3.0 onwards). It can be excluded with the realign_boundaries
flag.

    >>> text = '''
    ... (How does it deal with this parenthesis?)  "It should be part of the
    ... previous sentence." "(And the same with this one.)" ('And this one!')
    ... "('(And (this)) '?)" [(and this. )]
    ... '''
    >>> print('\n-----\n'.join(
    ...     sent_detector.tokenize(text.strip())))
    (How does it deal with this parenthesis?)
    -----
    "It should be part of the
    previous sentence."
    -----
    "(And the same with this one.)"
    -----
    ('And this one!')
    -----
    "('(And (this)) '?)"
    -----
    [(and this. )]
    >>> print('\n-----\n'.join(
    ...     sent_detector.tokenize(text.strip(), realign_boundaries=False)))
    (How does it deal with this parenthesis?
    -----
    )  "It should be part of the
    previous sentence.
    -----
    " "(And the same with this one.
    -----
    )" ('And this one!
    -----
    ')
    "('(And (this)) '?
    -----
    )" [(and this.
    -----
    )]

However, Punkt is designed to learn parameters (a list of abbreviations, etc.)
unsupervised from a corpus similar to the target domain. The pre-packaged models
may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn
parameters from the given text.

:class:`.PunktTrainer` learns parameters such as a list of abbreviations
(without supervision) from portions of text. Using a ``PunktTrainer`` directly
allows for incremental training and modification of the hyper-parameters used
to decide what is considered an abbreviation, etc.

The algorithm for this tokenizer is described in::

  Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
    Boundary Detection.  Computational Linguistics 32: 485-525.
i    (   t   print_functiont   unicode_literalsN(   t   defaultdict(   t   unicode_reprt   python_2_unicode_compatiblet   string_types(   t   FreqDist(   t
   TokenizerIi   i   i   i   i   i   u   initialu   upperu   internalu   unknownu   loweru   default decisionu   known collocation (both words)u%   abbreviation + orthographic heuristicu(   abbreviation + frequent sentence starteru    initial + orthographic heuristicu(   initial + special orthographic heuristict   PunktLanguageVarsc           B  sП   e  Z d  Z d Z d Д  Z d Д  Z d Z e d Д  Г Z d	 Z	 e
 j d
 e
 j Г Z d Z d Z d Z d Z d Д  Z d Д  Z d Z d Д  Z RS(   uX  
    Stores variables, mostly regular expressions, which may be
    language-dependent for correct application of the algorithm.
    An extension of this class may modify its properties to suit
    a language other than English; an instance can then be passed
    as an argument to PunktSentenceTokenizer and PunktTrainer
    constructors.
    u   _re_period_contextu   _re_word_tokenizerc         C  s   d S(   Ni   (    (   t   self(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   __getstate__└   s    c         C  s   d S(   Ni   (    (   R	   t   state(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   __setstate__╞   s    u   .u   ?u   !c         C  s   d t  j d j |  j Г Г S(   Nu   [%s]u    (   t   ret   escapet   joint   sent_end_chars(   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   _re_sent_end_chars╠   s    u   ,:;u   ["\')\]}]+?(?:\s+|(?=--)|$)u   [^\(\"\`{\[:;&\#\*@\)}\]\-,]u   (?:[?!)\";}\]\*:@\'\({\[])u    (?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)u─  (
        %(MultiChar)s
        |
        (?=%(WordStart)s)\S+?  # Accept word characters until end is found
        (?= # Sequences marking a word's end
            \s|                                 # White-space
            $|                                  # End-of-string
            %(NonWord)s|%(MultiChar)s|          # Punctuation
            ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
        )
        |
        \S
    )c         C  sk   y |  j  SWnY t k
 rf t j |  j i |  j d 6|  j d 6|  j d 6t j t j	 BГ |  _  |  j  SXd S(   u?   Compiles and returns a regular expression for word tokenizationu   NonWordu	   MultiCharu	   WordStartN(
   t   _re_word_tokenizert   AttributeErrorR   t   compilet   _word_tokenize_fmtt   _re_non_word_charst   _re_multi_char_punctt   _re_word_startt   UNICODEt   VERBOSE(   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   _word_tokenizer_reЄ   s    

c         C  s   |  j  Г  j | Г S(   u=   Tokenize a string to split off punctuation other than periods(   R   t   findall(   R	   t   s(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   word_tokenize  s    u:  
        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            \s+(?P<next_tok>\S+)     # or whitespace and some other token
        ))c         C  sW   y |  j  SWnE t j |  j i |  j d 6|  j d 6t j t j BГ |  _  |  j  SXd S(   uj   Compiles and returns a regular expression to find contexts
        including possible sentence boundaries.u   NonWordu   SentEndCharsN(   t   _re_period_contextR   R   t   _period_context_fmtR   R   R   R   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   period_context_re  s    
(   u   _re_period_contextu   _re_word_tokenizer(   u   .u   ?u   !(   t   __name__t
   __module__t   __doc__t	   __slots__R
   R   R   t   propertyR   t   internal_punctuationR   R   t	   MULTILINEt   re_boundary_realignmentR   R   R   R   R   R   R    R!   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR   ┤   s"   					u   [^\W\d]c         c  sI   t  |  Г }  t |  Г } x |  D] } | | f V| } q W| d f Vd S(   u─   
    Yields pairs of tokens from the given iterator such that each input
    token will appear as the first element in a yielded tuple. The last
    pair will have None as its second element.
    N(   t   itert   nextt   None(   t   itt   prevt   el(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt
   _pair_iter/  s    
t   PunktParametersc           B  sM   e  Z d  Z d Д  Z d Д  Z d Д  Z d Д  Z d Д  Z d Д  Z d Д  Z	 RS(   uC   Stores data used to perform sentence boundary detection with Punkt.c         C  s7   t  Г  |  _ t  Г  |  _ t  Г  |  _ t t Г |  _ d  S(   N(   t   sett   abbrev_typest   collocationst   sent_startersR   t   intt   ortho_context(   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   __init__C  s
    c         C  s   t  Г  |  _ d  S(   N(   R2   R3   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   clear_abbrevsW  s    c         C  s   t  Г  |  _ d  S(   N(   R2   R4   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   clear_collocationsZ  s    c         C  s   t  Г  |  _ d  S(   N(   R2   R5   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   clear_sent_starters]  s    c         C  s   t  t Г |  _ d  S(   N(   R   R6   R7   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   clear_ortho_context`  s    c         C  s   |  j  | c | O<d  S(   N(   R7   (   R	   t   typt   flag(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   add_ortho_contextc  s    c         c  s}   |  j  | } | t @r d Vn  | t @r1 d Vn  | t @rC d Vn  | t @rU d Vn  | t @rg d Vn  | t @ry d Vn  d  S(   Nu   BEG-UCu   MID-UCu   UNK-UCu   BEG-LCu   MID-LCu   UNK-LC(   R7   t   _ORTHO_BEG_UCt   _ORTHO_MID_UCt   _ORTHO_UNK_UCt   _ORTHO_BEG_LCt   _ORTHO_MID_LCt   _ORTHO_UNK_LC(   R	   R=   t   c(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   _debug_ortho_contextf  s    





(
   R"   R#   R$   R8   R9   R:   R;   R<   R?   RG   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR1   @  s   						t
   PunktTokenc           B  s8  e  Z d  Z d d d d d g Z d d d g e Z d	 Д  Z e j d
 Г Z e j d Г Z	 e j d e j
 Г Z e j d e j
 Г Z d Д  Z e d Д  Г Z e d Д  Г Z e d Д  Г Z e d Д  Г Z e d Д  Г Z e d Д  Г Z e d Д  Г Z e d Д  Г Z e d Д  Г Z e d Д  Г Z d Д  Z d Д  Z RS(   uX   Stores a token of text with annotations produced during
    sentence boundary detection.u	   parastartu	   linestartu	   sentbreaku   abbru   ellipsisu   toku   typeu   period_finalc         K  sz   | |  _  |  j | Г |  _ | j d Г |  _ x! |  j D] } t |  | d  Г q7 Wx" | D] } t |  | | | Г qX Wd  S(   Nu   .(   t   tokt	   _get_typet   typet   endswitht   period_finalt   _propertiest   setattrR,   (   R	   RI   t   paramst   pt   k(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR8   Д  s    	u   \.\.+$u   ^-?[\.,]?\d[\d,\.-]*\.?$u
   [^\W\d]\.$u	   [^\W\d]+$c         C  s   |  j  j d | j Г  Г S(   u6   Returns a case-normalized representation of the token.u
   ##number##(   t   _RE_NUMERICt   subt   lower(   R	   RI   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRJ   Ы  s    c         C  s:   t  |  j Г d k r3 |  j d d k r3 |  j d  S|  j S(   uG   
        The type with its final period removed if it has one.
        i   i    u   .(   t   lenRK   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   type_no_periodЯ  s    (c         C  s   |  j  r |  j S|  j S(   ue   
        The type with its final period removed if it is marked as a
        sentence break.
        (   t	   sentbreakRW   RK   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   type_no_sentperiodи  s    	c         C  s   |  j  d j Г  S(   u1   True if the token's first character is uppercase.i    (   RI   t   isupper(   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   first_upper▓  s    c         C  s   |  j  d j Г  S(   u1   True if the token's first character is lowercase.i    (   RI   t   islower(   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   first_lower╖  s    c         C  s   |  j  r d S|  j r d Sd S(   Nu   loweru   upperu   none(   R]   R[   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt
   first_case╝  s
    		c         C  s   |  j  j |  j Г S(   u.   True if the token text is that of an ellipsis.(   t   _RE_ELLIPSISt   matchRI   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   is_ellipsis─  s    c         C  s   |  j  j d Г S(   u+   True if the token text is that of a number.u
   ##number##(   RK   t
   startswith(   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt	   is_number╔  s    c         C  s   |  j  j |  j Г S(   u-   True if the token text is that of an initial.(   t   _RE_INITIALR`   RI   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt
   is_initial╬  s    c         C  s   |  j  j |  j Г S(   u)   True if the token text is all alphabetic.(   t	   _RE_ALPHAR`   RI   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   is_alpha╙  s    c         C  s   t  j |  j Г S(   u6   True if the token is either a number or is alphabetic.(   t   _re_non_punctt   searchRK   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   is_non_punct╪  s    c           sp   И  j  И  j k r% d t И  j  Г n d } d j З  f d Ж  И  j DГ Г } d И  j j t И  j Г | | f S(   uЮ   
        A string representation of the token that can reproduce it
        with eval(), which lists all the token's non-default
        annotations.
        u	    type=%s,u    u   , c         3  s=   |  ]3 } t  И  | Г r d  | t t  И  | Г Г f Vq d S(   u   %s=%sN(   t   getattrR   (   t   .0RQ   (   R	   (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pys	   <genexpr>ы  s   u   %s(%s,%s %s)(   RK   RI   R   R   RN   t	   __class__R"   (   R	   t   typestrt   propvals(    (   R	   se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   __repr__с  s    +c         C  sO   |  j  } |  j r | d 7} n  |  j r5 | d 7} n  |  j rK | d 7} n  | S(   uO   
        A string representation akin to that used by Kiss and Strunk.
        u   <A>u   <E>u   <S>(   RI   t   abbrt   ellipsisRX   (   R	   t   res(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   __str__є  s    				(   R"   R#   R$   RN   R%   R8   R   R   R_   RS   R   Rd   Rf   RJ   R&   RW   RY   R[   R]   R^   Ra   Rc   Re   Rg   Rj   Rp   Rt   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRH   y  s,   			
		t   PunktBaseClassc           B  sA   e  Z d  Z e Г  e e Г  d Д Z d Д  Z d Д  Z d Д  Z	 RS(   uP   
    Includes common components of PunktTrainer and PunktSentenceTokenizer.
    c         C  s   | |  _  | |  _ | |  _ d  S(   N(   t   _paramst
   _lang_varst   _Token(   R	   t	   lang_varst	   token_clsRP   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR8   	  s    			c         c  sЦ   t  } xЙ | j d Г D]x } | j Г  rИ t |  j j | Г Г } |  j t | Г d | d t ГVt  } x% | D] } |  j | Г Vqm Wq t } q Wd S(   uB  
        Divide the given text into tokens, using the punkt word
        segmentation regular expression, and generate the resulting list
        of tokens augmented as three-tuples with two boolean values for whether
        the given token occurs at the start of a paragraph or a new line,
        respectively.
        u   
t	   parastartt	   linestartN(	   t   Falset   splitt   stripR*   Rw   R   Rx   R+   t   True(   R	   t	   plaintextR{   t   linet	   line_tokst   t(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   _tokenize_words  s    c         c  s'   x  | D] } |  j  | Г | Vq Wd S(   u▒  
        Perform the first pass of annotation, which makes decisions
        based purely based on the word type of each word:

          - '?', '!', and '.' are marked as sentence breaks.
          - sequences of two or more periods are marked as ellipsis.
          - any word ending in '.' that's a known abbreviation is
            marked as an abbreviation.
          - any other word ending in '.' is marked as a sentence break.

        Return these annotations as a tuple of three sets:

          - sentbreak_toks: The indices of all sentence breaks.
          - abbrev_toks: The indices of all abbreviations.
          - ellipsis_toks: The indices of all ellipsis marks.
        N(   t   _first_pass_annotation(   R	   t   tokenst   aug_tok(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   _annotate_first_pass0  s    c         C  s╢   | j  } | |  j j k r' t | _ nЛ | j r< t | _ nv | j r▓ | j d Г r▓ | d  j	 Г  |  j
 j k sЪ | d  j	 Г  j d Г d |  j
 j k rж t | _ q▓ t | _ n  d S(   uC   
        Performs type-based annotation on a single token.
        u   ..i    u   -N(   RI   Rw   R   RА   RX   Ra   Rr   RM   RL   RU   Rv   R3   R~   Rq   (   R	   RИ   RI   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRЖ   E  s    		)(
   R"   R#   R$   R   RH   R1   R8   RЕ   RЙ   RЖ   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRu     s   			t   PunktTrainerc           B  s(  e  Z d  Z d e e Г  e d Д Z d Д  Z d Z	 e Z
 d Z d Z d Z e Z e Z d Z e e d Д Z e e d	 Д Z d
 Д  Z d Д  Z e d Д Z d d d d d Д Z d Д  Z d Д  Z d Д  Z d Д  Z d Д  Z e d Д  Г Z e d Д  Г Z d Д  Z  d Д  Z! d Д  Z" d Д  Z# d Д  Z$ RS(   u<   Learns parameters used in Punkt sentence boundary detection.c         C  s{   t  j |  d | d | Гt Г  |  _ d |  _ t Г  |  _ t Г  |  _ d |  _ t |  _	 | rw |  j
 | | d t Гn  d  S(   NRy   Rz   i    t   finalize(   Ru   R8   R   t   _type_fdistt   _num_period_tokst   _collocation_fdistt   _sent_starter_fdistt   _sentbreak_countRА   t
   _finalizedt   train(   R	   t
   train_textt   verboseRy   Rz   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR8   b  s    			c         C  s   |  j  s |  j Г  n  |  j S(   ul   
        Calculates and returns parameters for sentence boundary detection as
        derived from training.(   RС   t   finalize_trainingRv   (   R	   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt
   get_paramsЛ  s    	g333333╙?i   gЕыQ╕Е@i   i   c         C  s3   |  j  |  j | Г | Г | r/ |  j | Г n  d S(   u8  
        Collects training data from a given text. If finalize is True, it
        will determine all the parameters for sentence boundary detection. If
        not, this will be delayed until get_params() or finalize_training() is
        called. If verbose is True, abbreviations found will be listed.
        N(   t   _train_tokensRЕ   RХ   (   R	   t   textRФ   RЛ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRТ   └  s    	c           s:   И  j  З  f d Ж  | DГ | Г | r6 И  j | Г n  d S(   uE   
        Collects training data from a given list of tokens.
        c         3  s   |  ] } И  j  | Г Vq d  S(   N(   Rx   (   Rl   RД   (   R	   (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pys	   <genexpr>╤  s    N(   RЧ   RХ   (   R	   RЗ   RФ   RЛ   (    (   R	   se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   train_tokens═  s     c   
      C  s#  t  |  _ t | Г } x? | D]7 } |  j | j c d 7<| j r |  j d 7_ q q W|  j | Г } xд |  j | Г D]У \ } } } | |  j	 k r╨ | r	|  j
 j j | Г | r═ t d | | f Г q═ q	qv | sv |  j
 j j | Г | r	t d | | f Г q	qv qv Wt |  j | Г Г } |  j | Г |  j |  j | Г 7_ x╒ t | Г D]╟ \ } }	 | j sT|	 rwqTn  |  j | |	 Г r╝|  j
 j j | j Г | r╝t d | j Г q╝n  |  j |	 | Г rч|  j |	 j c d 7<n  |  j | |	 Г rT|  j | j |	 j f c d 7<qTqTWd  S(   Ni   u     Abbreviation: [%6.4f] %su"     Removed abbreviation: [%6.4f] %su     Rare Abbrev: %s(   R}   RС   t   listRМ   RK   RM   RН   t   _unique_typest   _reclassify_abbrev_typest   ABBREVRv   R3   t   addt   printt   removeRЙ   t   _get_orthography_dataRР   t   _get_sentbreak_countR0   t   _is_rare_abbrev_typeRW   t   _is_potential_sent_starterRП   t   _is_potential_collocationRО   RY   (
   R	   RЗ   RФ   RИ   t   unique_typesRq   t   scoret   is_addt   aug_tok1t   aug_tok2(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRЧ   ╒  sD    		c         C  s   t  d Д  | DГ Г S(   Nc         s  s   |  ] } | j  Vq d  S(   N(   RK   (   Rl   RИ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pys	   <genexpr>  s    (   R2   (   R	   RЗ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRЫ     s    c         C  s╨   |  j  j Г  xJ |  j Г  D]< \ } } |  j  j j | Г | r t d | | f Г q q W|  j  j Г  xY |  j Г  D]K \ \ } } } |  j  j j | | f Г | rt t d | | | f Г qt qt Wt	 |  _
 d S(   u~   
        Uses data that has been gathered in training to determine likely
        collocations and sentence starters.
        u     Sent Starter: [%6.4f] %ru     Collocation: [%6.4f] %r+%rN(   Rv   R;   t   _find_sent_startersR5   RЮ   RЯ   R:   t   _find_collocationsR4   RА   RС   (   R	   RФ   R=   t   llt   typ1t   typ2(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRХ     s    i   c         C  s╕   | d k rl |  j  j } |  j  j Г  xD |  j D]6 } |  j | } | | k r/ | | |  j  j | <q/ q/ Wn  |  j |  j | Г |  _ |  j |  j | Г |  _ |  j |  j | Г |  _ d S(   u  
        Allows memory use to be reduced after much training by removing data
        about rare tokens that are unlikely to have a statistical effect with
        further training. Entries occurring above the given thresholds will be
        retained.
        i   N(   Rv   R7   R<   RМ   t   _freq_thresholdRО   RП   (   R	   t   ortho_thresht   type_thresht   colloc_threst   sentstart_thresht   old_ocRI   t   count(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   freq_threshold.  s    c         C  sg   t  Г  } d } xA | D]9 } | | } | | k  r? | d 7} q | | c | 7<q W| d c | 7<| S(   uФ   
        Returns a FreqDist containing only data with counts below a given
        threshold, as well as a mapping (None -> count_removed).
        i    i   N(   R   R,   (   R	   t   fdistt	   thresholdRs   t   num_removedRI   R╢   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR░   D  s    	
c         C  sё   d } t  | Г } x╪ | D]╨ } | j r= | d k r= d } n  | j r[ | d k r[ d } n  | j } t j | | j f d Г } | rЫ |  j j | | Г n  | j	 r╚ | j
 p│ | j s┐ d } qщ d } q | j s┌ | j rу d } q d } q Wd S(   u┌   
        Collect information about whether each token type occurs
        with different case patterns (i) overall, (ii) at
        sentence-initial positions, and (iii) at sentence-internal
        positions.
        u   internalu   unknownu   initiali    N(   RЪ   R{   R|   RY   t
   _ORTHO_MAPt   getR^   Rv   R?   RX   Rc   Re   Rr   Rq   (   R	   RЗ   t   contextRИ   R=   R>   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRб   Z  s$    							c         c  sT  xM| D]E} t  j | Г s | d k r/ q n  | j d Г ri | |  j j k rV q n  | d  } t } n | |  j j k rБ q n  t } | j d Г d } t | Г | d } |  j	 | d } |  j	 | } |  j
 | | |  j | |  j	 j Г  Г } t j | Г }	 | }
 t |  j Г p)t j | | Г } | |	 |
 | } | | | f Vq Wd S(   u▐  
        (Re)classifies each given token if
          - it is period-final and not a known abbreviation; or
          - it is not period-final and is otherwise a known abbreviation
        by checking whether its previous classification still holds according
        to the heuristics of section 3.
        Yields triples (abbr, score, is_add) where abbr is the type in question,
        score is its log-likelihood with penalties applied, and is_add specifies
        whether the present type is a candidate for inclusion or exclusion as an
        abbreviation, such that:
          - (is_add and score >= 0.3)    suggests a new abbreviation; and
          - (not is_add and score < 0.3) suggests excluding an abbreviation.
        u
   ##number##u   .i    i   N(   Rh   Ri   RL   Rv   R3   RА   R}   R╢   RV   RМ   t   _dunning_log_likelihoodRН   t   Nt   matht   expR6   t   IGNORE_ABBREV_PENALTYt   pow(   R	   t   typesR=   Rи   t   num_periodst   num_nonperiodst   count_with_periodt   count_without_periodRн   t   f_lengtht	   f_periodst	   f_penaltyRз   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRЬ   К  s2    
		c         C  sl   |  j  j Г  d Д  |  j DГ } xE |  j | Г D]4 \ } } } | |  j k r0 |  j  j j | Г q0 q0 Wd S(   u─   
        Recalculates abbreviations given type frequencies, despite no prior
        determination of abbreviations.
        This fails to include abbreviations otherwise found as "rare".
        c         s  s*   |  ]  } | r | j  d  Г r | Vq d S(   u   .N(   RL   (   Rl   R=   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pys	   <genexpr>╤  s    N(   Rv   R9   RМ   RЬ   RЭ   R3   RЮ   (   R	   RЗ   Rq   Rз   Rи   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   find_abbrev_types╩  s
    c         C  s└   | j  s | j r t S| j } |  j | |  j | d  } | |  j j k s] | |  j k ra t S| j d  |  j	 j
 k r~ t S| j r╝ | j } |  j j | } | t @r╝ | t @r╝ t Sn  d S(   u▓  
        A word type is counted as a rare abbreviation if...
          - it's not already marked as an abbreviation
          - it occurs fewer than ABBREV_BACKOFF times
          - either it is followed by a sentence-internal punctuation
            mark, *or* it is followed by a lower-case word that
            sometimes appears with upper case, but never occurs with
            lower case at the beginning of sentences.
        i    i   N(   Rq   RX   R}   RY   RМ   Rv   R3   t   ABBREV_BACKOFFRI   Rw   R'   RА   R]   R7   R@   RA   (   R	   t   cur_tokt   next_tokR=   R╢   Rп   t   typ2ortho_context(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRг   ┘  s    
	!		
c   	      C  sМ   t  | Г | } d } t  | Г t j | Г |  | t j d | Г } t  | Г t j | Г |  | t j d | Г } | | } d | S(   u─   
        A function that calculates the modified Dunning log-likelihood
        ratio scores for abbreviation candidates.  The details of how
        this works is available in the paper.
        gоGсzоя?g      Ё?g       └(   t   floatR└   t   log(	   t   count_at   count_bt   count_abR┐   t   p1t   p2t	   null_hypot   alt_hypot
   likelihood(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR╛     s    
c         C  sJ  d d l  } d | | } d | |  } d | | | |  } | | j | Г |  | | j d | Г } | | | j | Г | |  | | | j d | Г }	 |  | k r╖ d }
 n, | | j | Г |  | | j d | Г }
 | | k r° d } n8 | | | j | Г | |  | | | j d | Г } | |	 |
 | } d | S(   u=  
        A function that will just compute log-likelihood estimate, in
        the original paper it's described in algorithm 6 and 7.

        This *should* be the original Dunning log-likelihood values,
        unlike the previous log_l function where it used modified
        Dunning log-likelihood values
        i    Ng      Ё?i    g       └(   R└   R╥   (   R╙   R╘   R╒   R┐   R└   RQ   R╓   R╫   t   summand1t   summand2t   summand3t   summand4R┌   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   _col_log_likelihood  s$    
$		$c         C  sF   |  j  s6 |  j r | j s6 | j oE | j s6 | j oE | j oE | j S(   ut   
        Returns True if the pair of tokens may form a collocation given
        log-likelihood statistics.
        (   t   INCLUDE_ALL_COLLOCSt   INCLUDE_ABBREV_COLLOCSRq   RX   Rc   Re   Rj   (   R	   Rй   Rк   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRе   E  s    			c         c  sL  xE|  j  D]:} y | \ } } Wn t k
 r6 q
 n X| |  j j k rO q
 n  |  j  | } |  j | |  j | d } |  j | |  j | d } | d k r
 | d k r
 |  j | k  o╧ t | | Г k n r
 |  j | | | |  j j Г  Г } | |  j	 k rDt
 |  j j Г  Г | t
 | Г | k rD| | f | f VqDq
 q
 Wd S(   uI   
        Generates likely collocations and their log-likelihood.
        u   .i   N(   RО   t	   TypeErrorRv   R5   RМ   t   MIN_COLLOC_FREQt   minR▀   R┐   t   COLLOCATIONR╤   (   R	   R─   Rо   Rп   t	   col_countt
   typ1_countt
   typ2_countRн   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRм   Q  s&    "c         C  s#   | j  o" | j p | j o" | j S(   uМ   
        Returns True given a token and the token that preceds it if it
        seems clear that the token is beginning a sentence.
        (   RX   Rc   Re   Rg   (   R	   R╬   t   prev_tok(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRд   q  s    	c         c  s╠   x┼ |  j  D]║ } | s q
 n  |  j  | } |  j | |  j | d } | | k  rW q
 n  |  j |  j | | |  j j Г  Г } | |  j k r
 t |  j j Г  Г |  j t | Г | k r
 | | f Vq
 q
 Wd S(   u~   
        Uses collocation heuristics for each candidate token to
        determine if it frequently starts sentences.
        u   .N(   RП   RМ   R▀   RР   R┐   t   SENT_STARTERR╤   (   R	   R=   t   typ_at_break_countt	   typ_countRн   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRл   }  s    c         C  s   t  d Д  | DГ Г S(   uj   
        Returns the number of sentence breaks marked in a given set of
        augmented tokens.
        c         s  s   |  ] } | j  r d  Vq d S(   i   N(   RX   (   Rl   RИ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pys	   <genexpr>Ы  s    (   t   sum(   R	   RЗ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRв   Ц  s    N(%   R"   R#   R$   R,   R}   R   RH   R8   RЦ   RЭ   R┬   R═   Rх   Rъ   Rр   Rс   Rу   RА   RТ   RЩ   RЧ   RЫ   RХ   R╖   R░   Rб   RЬ   R╠   Rг   t   staticmethodR╛   R▀   Rе   Rм   Rд   Rл   Rв   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRК   _  s>   (			>				0	@		/*		 		t   PunktSentenceTokenizerc           B  s╬   e  Z d  Z d e e Г  e d Д Z e d Д Z e	 d Д Z
 d Д  Z e	 d Д Z e	 d Д Z d Д  Z d Д  Z d	 Д  Z d
 Д  Z d Д  Z d Д  Z d Д  Z d Д  Z e d Г Z d Д  Z d Д  Z d Д  Z RS(   u'  
    A sentence tokenizer which uses an unsupervised algorithm to build
    a model for abbreviation words, collocations, and words that start
    sentences; and then uses that model to find sentence boundaries.
    This approach has been shown to work well for many European
    languages.
    c         C  s;   t  j |  d | d | Г| r7 |  j | | Г |  _ n  d S(   uТ   
        train_text can either be the sole training text for this sentence
        boundary detector, or can be a PunktParameters object.
        Ry   Rz   N(   Ru   R8   RТ   Rv   (   R	   RУ   RФ   Ry   Rz   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR8   л  s    c         C  s5   t  | t Г s | St | d |  j d |  j Гj Г  S(   uё   
        Derives parameters from a given training text, or uses the parameters
        given. Repeated calls to this method destroy previous parameters. For
        incremental training, instantiate a separate PunktTrainer instance.
        Ry   Rz   (   t
   isinstanceR   RК   Rw   Rx   RЦ   (   R	   RУ   RФ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRТ   ╖  s    c         C  s   t  |  j | | Г Г S(   uM   
        Given a text, returns a list of the sentences in that text.
        (   RЪ   t   sentences_from_text(   R	   RШ   t   realign_boundaries(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   tokenize╞  s    c         c  sБ  xz|  j  j Г  j | Г D]`} | j Г  | j d Г } |  j | Г } t |  j | Г Г } x | d j s| | j d Г q_ Wt	 d | j
 Г  d d | d | d j d | d j d t | d j Г d	 t | d j Г d
 | d j |  j j k d |  j | d Г d t |  j j | d j Г Г d | d j | d j f |  j j k d |  j | d | d Г pgt d | d j Г Vq Wd S(   uх   
        Classifies candidate periods as sentence breaks, yielding a dict for
        each that may be used to understand why the decision was made.

        See format_debug_decision() to help make this output readable.
        u	   after_toki    t   period_indexi   RШ   t   type1t   type2t   type1_in_abbrst   type1_is_initialt   type2_is_sent_startert   type2_ortho_heuristict   type2_ortho_contextst   collocationt   reasont   break_decisionN(   Rw   R!   t   finditert   groupRЕ   RЪ   RЙ   RM   t   popt   dictt   endRK   t   boolRq   Re   RY   Rv   R5   t   _ortho_heuristicR2   RG   R4   t   _second_pass_annotationt   REASON_DEFAULT_DECISIONRX   (   R	   RШ   R`   t   decision_textRЗ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   debug_decisions╠  s$    & c         C  sM   |  j  | Г } | r* |  j | | Г } n  g  | D] } | j | j f ^ q1 S(   uj   
        Given a text, returns a list of the (start, end) spans of sentences
        in the text.
        (   t   _slices_from_textt   _realign_boundariest   startt   stop(   R	   RШ   RЄ   t   slicest   sl(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   span_tokenizeщ  s    c         C  s0   g  |  j  | | Г D] \ } } | | | !^ q S(   uь   
        Given a text, generates the sentences in that text by only
        testing candidate sentence breaks. If realign_boundaries is
        True, includes in the sentence closing punctuation that
        follows the period.
        (   R  (   R	   RШ   RЄ   R   t   e(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRё   є  s    c         c  sн   d } xМ |  j  j Г  j | Г D]r } | j Г  | j d Г } |  j | Г r t | | j Г  Г V| j d Г rВ | j d Г } qС | j Г  } q q Wt | t | Г Г Vd  S(   Ni    u	   after_toku   next_tok(	   Rw   R!   R    R   t   text_contains_sentbreakt   sliceR  R  RV   (   R	   RШ   t
   last_breakR`   R╜   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR
  №  s    c         c  s╤   d } x─ t  | Г D]╢ \ } } t | j | | j Г } | sV | | r | Vq q n  |  j j j | | Г } | r▒ t | j | j t | j d Г j	 Г  Г Г V| j
 Г  } q d } | | r | Vq q Wd S(   u@  
        Attempts to realign punctuation that falls after the period but
        should otherwise be included in the same sentence.

        For example: "(Sent1.) Sent2." will otherwise be split as::

            ["(Sent1.", ") Sent1."].

        This method will produce::

            ["(Sent1.)", "Sent2."].
        i    N(   R0   R  R  R  Rw   R)   R`   RV   R   t   rstripR  (   R	   RШ   R  t   realignt   sl1t   sl2t   m(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR  
  s    
-
c         C  sI   t  } x< |  j |  j | Г Г D]" } | r/ t S| j r t } q q Wt  S(   uK   
        Returns True if the given text includes a sentence break.
        (   R}   t   _annotate_tokensRЕ   RА   RX   (   R	   RШ   t   foundRД   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR  (  s    	c         C  s(   |  j  |  j | Г Г } |  j | | Г S(   u▌   
        Given a text, generates the sentences in that text. Annotates all
        tokens, rather than just those with possible sentence breaks. Should
        produce the same results as ``sentences_from_text``.
        (   R  RЕ   t   _build_sentence_list(   R	   RШ   RЗ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   sentences_from_text_legacy4  s    c         #  su   t  И  j З  f d Ж  | DГ Г Г } g  } x5 | D]- } | j | j Г | j r2 | Vg  } q2 q2 W| rq | Vn  d S(   uw   
        Given a sequence of tokens, generates lists of tokens, each list
        corresponding to a sentence.
        c         3  s   |  ] } И  j  | Г Vq d  S(   N(   Rx   (   Rl   RД   (   R	   (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pys	   <genexpr>B  s    N(   R*   R  t   appendRI   RX   (   R	   RЗ   t   sentenceRИ   (    (   R	   se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   sentences_from_tokens=  s    %	c         C  s"   |  j  | Г } |  j | Г } | S(   u╒   
        Given a set of tokens augmented with markers for line-start and
        paragraph-start, returns an iterator through those tokens with full
        annotation including predicted sentence breaks.
        (   RЙ   t   _annotate_second_pass(   R	   RЗ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR  L  s    c         c  s?  d } t  j d Г } d } x| D]} | j } | j | | Г j Г  } | t | Г 7} | | | t | Г !| k r┬ d j d Д  | DГ Г }	 t  j |	 Г j | | Г }
 |
 r┬ |
 j Г  } q┬ n  | | | t | Г !| k sх t В | t | Г 7} | r| | 7} n  | | 7} | j r" | Vd } q" q" W| r;| Vn  d S(   uУ   
        Given the original text and the list of augmented word tokens,
        construct and return a tokenized list of sentence strings.
        i    u   \s*u    c         s  s   |  ] } t  j | Г Vq d  S(   N(   R   R   (   Rl   RF   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pys	   <genexpr>~  s    N(	   R   R   RI   R`   R   RV   R   t   AssertionErrorRX   (   R	   RШ   RЗ   t   post	   WS_REGEXPR  RИ   RI   t   wst   patR  (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR  a  s,    
	 #
	c         C  sЙ   t  d Г t d d Г Пj } x` | D]X } | j rB | j d Г n& | j r[ | j d Г n | j d Г | j t | Г Г q# WWd  QXd  S(   Nu   writing to /tmp/punkt.new...u   /tmp/punkt.newu   wu   

u   
u    (   RЯ   t   openR{   t   writeR|   t   str(   R	   RЗ   t   outfileRИ   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   dumpЧ  s    
		u   ;:,.!?c         c  s6   x/ t  | Г D]! \ } } |  j | | Г | Vq Wd S(   uы   
        Performs a token-based classification (section 4) over the given
        tokens, making use of the orthographic heuristic (4.1.1), collocation
        heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
        N(   R0   R  (   R	   RЗ   t   t1t   t2(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR!  о  s    c   	      C  s{  | s
 d S| j  } | j s  d S| j } | j  } | j } | j } | | f |  j j k rr t | _ t	 | _
 t S| j
 sД | j r▐ | r▐ |  j | Г } | t	 k r│ t	 | _ t S| j r▐ | |  j j k r▐ t	 | _ t Sn  | sЁ | d k rw|  j | Г } | t k r.t | _ t	 | _
 | r't St Sn  | d k rw| rw| j rw|  j j | t @rwt | _ t	 | _
 t Sn  d S(   ur   
        Performs token-based classification over a pair of contiguous tokens
        updating the first.
        Nu
   ##number##u   unknown(   RI   RM   RW   RY   Re   Rv   R4   R}   RX   RА   Rq   t   REASON_KNOWN_COLLOCATIONRr   R  t'   REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTICR[   R5   t!   REASON_ABBR_WITH_SENTENCE_STARTERt*   REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTICt)   REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTICR7   t	   _ORTHO_LCt2   REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC(	   R	   Rй   Rк   RI   R=   R╧   t   next_typt   tok_is_initialt   is_sent_starter(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR  ╕  sL    																c         C  sq   | j  |  j k r t S|  j j | j } | j rK | t @rK | t @rK t	 S| j
 rm | t @si | t @rm t Sd S(   uR   
        Decide whether the given token is the first token in a sentence.
        u   unknown(   RI   t   PUNCTUATIONR}   Rv   R7   RY   R[   R3  RA   RА   R]   t	   _ORTHO_UCRC   (   R	   RИ   R7   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyR  
  s    	
	
N(   R"   R#   R$   R,   R}   R   RH   R8   RТ   RА   Rє   R	  R  Rё   R
  R  R  R  R   R  R  R+  t   tupleR8  R!  R  R  (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyRя   г  s(   	
									6		
	Ruк  Text: %(text)r (at offset %(period_index)d)
Sentence break? %(break_decision)s (%(reason)s)
Collocation? %(collocation)s
%(type1)r:
    known abbreviation: %(type1_in_abbrs)s
    is initial: %(type1_is_initial)s
%(type2)r:
    known sentence starter: %(type2_is_sent_starter)s
    orthographic heuristic suggests is a sentence starter? %(type2_ortho_heuristic)s
    orthographic contexts in training: %(type2_ortho_contexts)s
c         C  s   t  |  S(   N(   t   DEBUG_DECISION_FMT(   t   d(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   format_debug_decision4  s    c         C  sh   d Д  } | Г  } t  | _ | j |  Г | | j Г  Г } x' | j |  Г D] } t | | Г Г qJ Wd S(   u4   Builds a punkt model and applies it to the same textc         S  s+   t  j d t  j Г j d |  Г j d d Г S(   Nu   (?:\r|^\s+)u    u   
u    (   R   R   R(   RT   t   replace(   R   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   <lambda>9  s    N(   RА   Rр   RТ   RЦ   Rё   RЯ   (   RШ   t   tok_clst	   train_clst   cleanupt   trainert   sbdt   l(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   demo7  s    			(.   R$   t
   __future__R    R   R   R└   t   collectionsR   t   nltk.compatR   R   R   t   nltk.probabilityR   t   nltk.tokenize.apiR   R@   RA   RB   RC   RD   RE   R9  R3  R╗   R  R.  R/  R0  R1  R2  R4  t   objectR   R   R   Rh   R0   R1   RH   Ru   RК   Rя   R;  R=  RF  (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/punkt.pyt   <module>e   sX   	


	n	9К[    F  С