є
<┐CVc           @  sя   d  Z  d d l m Z d d l Z d d l m Z d d l m Z d d l m	 Z	 e	 d e f d Д  Г  YГ Z
 d	 e
 f d
 Д  Г  YZ d e
 f d Д  Г  YZ d e
 f d Д  Г  YZ e e e j e j Be j Bd Д Z e Г  j Z e Г  j Z d S(   u√	  
Regular-Expression Tokenizers

A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
For example, the following tokenizer forms tokens out of alphabetic sequences,
money expressions, and any other non-whitespace sequences:

    >>> from nltk.tokenize import RegexpTokenizer
    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
    >>> tokenizer.tokenize(s)
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

A ``RegexpTokenizer`` can use its regexp to match delimiters instead:

    >>> tokenizer = RegexpTokenizer('\s+', gaps=True)
    >>> tokenizer.tokenize(s)
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']

Note that empty tokens are not returned when the delimiter appears at
the start or end of the string.

The material between the tokens is discarded.  For example,
the following tokenizer selects just the capitalized words:

    >>> capword_tokenizer = RegexpTokenizer('[A-Z]\w+')
    >>> capword_tokenizer.tokenize(s)
    ['Good', 'New', 'York', 'Please', 'Thanks']

This module contains several subclasses of ``RegexpTokenizer``
that use pre-defined regular expressions.

    >>> from nltk.tokenize import BlanklineTokenizer
    >>> # Uses '\s*\n\s*\n\s*':
    >>> BlanklineTokenizer().tokenize(s)
    ['Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.',
    'Thanks.']

All of the regular expression tokenizers are also available as functions:

    >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
    >>> regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+')
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> wordpunct_tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
     '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> blankline_tokenize(s)
    ['Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.', 'Thanks.']

Caution: The function ``regexp_tokenize()`` takes the text as its
first argument, and the regular expression pattern as its second
argument.  This differs from the conventions used by Python's
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
i    (   t   unicode_literalsN(   t
   TokenizerI(   t   regexp_span_tokenize(   t   python_2_unicode_compatiblet   RegexpTokenizerc           B  sU   e  Z d  Z e e e j e j Be j Bd Д Z	 d Д  Z
 d Д  Z d Д  Z d Д  Z RS(   u╫  
    A tokenizer that splits a string using a regular expression, which
    matches either the tokens or the separators between tokens.

        >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')

    :type pattern: str
    :param pattern: The pattern used to build this tokenizer.
        (This pattern may safely contain capturing parentheses.)
    :type gaps: bool
    :param gaps: True if this tokenizer's pattern should be used
        to find separators between tokens; False if this
        tokenizer's pattern should be used to find the tokens
        themselves.
    :type discard_empty: bool
    :param discard_empty: True if any empty tokens `''`
        generated by the tokenizer should be discarded.  Empty
        tokens can only be generated if `_gaps == True`.
    :type flags: int
    :param flags: The regexp flags used to compile this
        tokenizer's pattern.  By default, the following flags are
        used: `re.UNICODE | re.MULTILINE | re.DOTALL`.

    c         C  sC   t  | d | Г } | |  _ | |  _ | |  _ | |  _ d  |  _ d  S(   Nu   pattern(   t   getattrt   _patternt   _gapst   _discard_emptyt   _flagst   Nonet   _regexp(   t   selft   patternt   gapst   discard_emptyt   flags(    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyt   __init__f   s    				c         C  s+   |  j  d  k r' t j |  j Г |  _  n  d  S(   N(   R   R
   t   ret   compileR   (   R   (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyt   _check_regexpq   s    c         C  sl   |  j  Г  |  j rX |  j rE g  |  j j | Г D] } | r/ | ^ q/ S|  j j | Г Sn |  j j | Г Sd  S(   N(   R   R   R   R   t   splitt   findall(   R   t   textt   tok(    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyt   tokenizeu   s    
		)c         c  sЛ   |  j  Г  |  j r\ xq t | |  j Г D]/ \ } } |  j oD | | k s& | | f Vq& q& Wn+ x( t j |  j | Г D] } | j Г  Vqr Wd  S(   N(   R   R   R   R   R   R   t   finditert   span(   R   R   t   leftt   rightt   m(    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyt   span_tokenizeВ   s    
	c         C  s)   d |  j  j |  j |  j |  j |  j f S(   Nu3   %s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)(   t	   __class__t   __name__R   R   R   R	   (   R   (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyt   __repr__Н   s    (   R!   t
   __module__t   __doc__t   Falset   TrueR   t   UNICODEt	   MULTILINEt   DOTALLR   R   R   R   R"   (    (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyR   L   s   
			t   WhitespaceTokenizerc           B  s   e  Z d  Z d Д  Z RS(   u╘  
    Tokenize a string on whitespace (space, tab, newline).
    In general, users should use the string ``split()`` method instead.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> WhitespaceTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
        'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    c         C  s   t  j |  d d t Гd  S(   Nu   \s+R   (   R   R   R&   (   R   (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyR   Ю   s    (   R!   R#   R$   R   (    (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyR*   Т   s   
t   BlanklineTokenizerc           B  s   e  Z d  Z d Д  Z RS(   u╕   
    Tokenize a string, treating any sequence of blank lines as a delimiter.
    Blank lines are defined as lines containing no characters, except for
    space or tab characters.
    c         C  s   t  j |  d d t Гd  S(   Nu   \s*\n\s*\n\s*R   (   R   R   R&   (   R   (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyR   з   s    (   R!   R#   R$   R   (    (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyR+   б   s   t   WordPunctTokenizerc           B  s   e  Z d  Z d Д  Z RS(   u▀  
    Tokenize a text into a sequence of alphabetic and
    non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.

        >>> from nltk.tokenize import WordPunctTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> WordPunctTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
        '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    c         C  s   t  j |  d Г d  S(   Nu   \w+|[^\w\s]+(   R   R   (   R   (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyR   ╡   s    (   R!   R#   R$   R   (    (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyR,   к   s   
c         C  s"   t  | | | | Г } | j |  Г S(   ur   
    Return a tokenized copy of *text*.  See :class:`.RegexpTokenizer`
    for descriptions of the arguments.
    (   R   R   (   R   R   R   R   R   t	   tokenizer(    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyt   regexp_tokenize╝   s    (   R$   t
   __future__R    R   t   nltk.tokenize.apiR   t   nltk.tokenize.utilR   t   nltk.compatR   R   R*   R+   R,   R%   R&   R'   R(   R)   R.   R   t   blankline_tokenizet   wordpunct_tokenize(    (    (    sf   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/tokenize/regexp.pyt   <module>C   s   E	