"""
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
"""

import json
import os

from nltk import compat
from nltk.tokenize import TweetTokenizer

from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
from nltk.corpus.reader.api import CorpusReader


class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of
    line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

        root = os.environ['TWITTER']
        reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

        import json
        for tweet in reader.docs():
            print(json.dumps(tweet, indent=1, sort_keys=True))
    """

    CorpusView = StreamBackedCorpusView
    """
    The corpus view class used by this reader.
    """

    def __init__(self, root, fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                pass
            elif os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat([self.CorpusView(path, self._read_tweets, encoding=enc)
                       for (path, enc, fileid)
                       in self.abspaths(fileids, True, True)])

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        fulltweets = self.docs(fileids)
        tweets = []
        for jsono in fulltweets:
            try:
                text = jsono['text']
                if isinstance(text, bytes):
                    text = text.decode(self.encoding)
                tweets.append(text)
            except KeyError:
                pass
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets as
            a list of words, screennames, hashtags, URLs and punctuation
            symbols.
        :rtype: list(list(str))
        """
        tweets = self.strings(fileids)
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in tweets]

    def raw(self, fileids=None):
        """
        Return the corpora in their raw form.
        """
        if fileids is None:
            fileids = self._fileids
        if isinstance(fileids, compat.string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def _read_tweets(self, stream):
        """
        Assumes that each line in ``stream`` is a JSON-serialised object.
        """
        tweets = []
        for i in range(10):
            line = stream.readline()
            if not line:
                return tweets
            tweet = json.loads(line)
            tweets.append(tweet)
        return tweets
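The core of the reader is the line-delimited JSON parsing performed by `_read_tweets`: the corpus view calls it repeatedly, and each call consumes up to ten lines from the stream. A minimal standalone sketch of that logic (the function name `read_line_delimited_json` is illustrative, not part of NLTK's API):

```python
import io
import json

def read_line_delimited_json(stream, limit=10):
    """Parse up to `limit` JSON objects, one per line, from a text stream.

    Mirrors the batching behaviour assumed by TwitterCorpusReader:
    stop early at end-of-stream, otherwise return a block of objects.
    """
    objects = []
    for _ in range(limit):
        line = stream.readline()
        if not line:  # end of stream
            break
        objects.append(json.loads(line))
    return objects

# Example: two serialised "tweets", one JSON object per line.
data = io.StringIO('{"text": "hello"}\n{"text": "world"}\n')
tweets = read_line_delimited_json(data)
print([t["text"] for t in tweets])  # -> ['hello', 'world']
```

Reading in fixed-size blocks rather than the whole file at once is what lets `StreamBackedCorpusView` expose a large corpus lazily, without loading every Tweet into memory.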