# coding: utf-8
#
# Twitter-aware tokenizer for NLTK (nltk/tokenize/casual.py).

from __future__ import unicode_literals

import re

from nltk.compat import htmlentitydefs, int2byte, unichr

######################################################################
# Regular-expression components
######################################################################

EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""

URLS = r"""                 # Capture 1: entire matched URL
  (?:
  https?:                   # URL protocol and colon
    (?:
      /{1,3}                # 1-3 slashes
      |                     #   or
      [a-z0-9%]             # Single letter or digit or '%'
                            # (Trying not to match e.g. "URI::Escape")
    )
    |                       #   or
                            # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                       # One or more:
    [^\s()<>{}\[\]]+        # Run of non-space, non-()<>{}[]
    |                       #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)             # balanced parens, non-recursive: (...)
  )+
  (?:                       # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)             # balanced parens, non-recursive: (...)
    |                       #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
  )
  |                         # OR, the following to match naked domains:
  (?:
    (?<!@)                  # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)                   # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )""",
    # ASCII emoticons:
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII arrows:
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
)

# The core tokenizing regex, built from the components above:
WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS),
                     re.VERBOSE | re.I | re.UNICODE)

# The emoticon string gets its own regex so that case can be preserved
# for emoticons even when `preserve_case=False`:
EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)

# For regularizing HTML entities to Unicode:
ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')


######################################################################
# Functions for converting HTML entities
######################################################################

def _str_to_unicode(text, encoding=None, errors='strict'):
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    """
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
        `encoding` (which defaults to 'utf-8').

    :param list keep: list of entity names which should not be replaced.
        This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
        and named entities (such as ``&nbsp;`` or ``&gt;``).

    :param bool remove_illegal: If `True`, entities that can't be converted
        are removed. Otherwise, entities that can't be converted are kept
        "as is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \\xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
        >>>
    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are
                # typically interpreted by browsers as representing the
                # characters mapped to bytes 80-9F in the Windows-1252
                # encoding. For more info see:
                # http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= number <= 0x9f:
                    return int2byte(number).decode('cp1252')
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            else:
                number = htmlentitydefs.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return unichr(number)
            except ValueError:
                pass
        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


######################################################################
# Tokenizer class
######################################################################

class TweetTokenizer:
    """
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0)
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
        >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)"
        >>> tknzr.tokenize(s1)
        ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)']
        >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn"
        >>> tknzr.tokenize(s2)
        ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn']
        >>> s3 = "@Insanomania They do... Their mentality doesn't :("
        >>> tknzr.tokenize(s3)
        ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':(']
        >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!"
        >>> tknzr.tokenize(s4)
        ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!']
        >>> tknzr = TweetTokenizer(reduce_len=True)
        >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :("
        >>> tknzr.tokenize(s5)
        ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':(']

    Examples using the `strip_handles` and `reduce_len` parameters:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s6)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
        >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.'
        >>> tknzr.tokenize(s7)
        [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.']
        >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com'
        >>> tknzr.tokenize(s8)
        ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin', '@email', '.', 'com']
    """

    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; concatenating this list
            returns the original string if `preserve_case=False`.
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles:
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening:
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Tokenize:
        words = WORD_RE.findall(text)
        # Possibly alter the case, but avoid changing emoticons like :D
        # into :d:
        if not self.preserve_case:
            words = list(map((lambda x: x if EMOTICON_RE.search(x)
                              else x.lower()), words))
        return words


######################################################################
# Normalization functions
######################################################################

def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with
    sequences of length 3.
    """
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+")
    return pattern.sub('', text)


######################################################################
# Tokenization function
######################################################################

def casual_tokenize(text, preserve_case=True, reduce_len=False,
                    strip_handles=False):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(preserve_case=preserve_case,
                          reduce_len=reduce_len,
                          strip_handles=strip_handles).tokenize(text)
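

######################################################################
# Usage sketch
######################################################################

# A minimal, illustrative demo of the public API defined above, not
# part of the published module. The sample strings and the expected
# outputs in the comments are assumptions derived from the regexes and
# doctests in this file.

if __name__ == '__main__':
    # Default settings: case is preserved, @-handles are kept, and
    # lengthened words are left as written.
    tknzr = TweetTokenizer()
    print(tknzr.tokenize("@someuser Sooooo coool!!! :-) http://example.com/x"))
    # e.g. ['@someuser', 'Sooooo', 'coool', '!', '!', '!', ':-)',
    #       'http://example.com/x']

    # Normalizing settings: lowercase everything except emoticons,
    # squash runs of 3+ repeated characters down to exactly 3
    # ("Sooooo" -> "sooo"), and strip @-handles before tokenizing.
    tknzr = TweetTokenizer(preserve_case=False, reduce_len=True,
                           strip_handles=True)
    print(tknzr.tokenize("@someuser Sooooo coool!!! :-)"))
    # e.g. ['sooo', 'coool', '!', '!', '!', ':-)']

    # casual_tokenize is the one-shot functional wrapper around the class.
    print(casual_tokenize("Check this out -> http://example.com <3"))
    # e.g. ['Check', 'this', 'out', '->', 'http://example.com', '<3']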