є
<┐CVc           @   sC   d  Z  d d l Z d d l Td d l Td e e f d Д  Г  YZ d S(   sЗ  
CorpusReader for the Pros and Cons dataset.

- Pros and Cons dataset information -

Contact: Bing Liu, liub@cs.uic.edu
        http://www.cs.uic.edu/~liub

Distributed with permission.

Related papers:

- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
    Proceedings of the 22nd International Conference on Computational Linguistics
    (Coling-2008), Manchester, 18-22 August, 2008.

- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
    Opinions on the Web". Proceedings of the 14th international World Wide Web
    conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
i    N(   t   *t   ProsConsCorpusReaderc           B   s_   e  Z d  Z e Z e Г  d d Д Z d d d Д Z d d d Д Z	 d Д  Z
 d Д  Z d Д  Z RS(	   s▒  
    Reader for the Pros and Cons sentence dataset.

        >>> from nltk.corpus import pros_cons
        >>> pros_cons.sents(categories='Cons')
        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
        ...]
        >>> pros_cons.words('IntegratedPros.txt')
        ['Easy', 'to', 'use', ',', 'economical', '!', ...]
    t   utf8c         K   s3   t  j |  | | | Г t j |  | Г | |  _ d S(   s╡  
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        N(   t   CorpusReadert   __init__t   CategorizedCorpusReadert   _word_tokenizer(   t   selft   roott   fileidst   word_tokenizert   encodingt   kwargs(    (    sn   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/pros_cons.pyR   0   s    c         C   sТ   |  j  | | Г } | d k r* |  j } n t | t j Г rH | g } n  t g  |  j | t t Г D]* \ } } } |  j	 | |  j
 d | Г^ qa Г S(   sш  
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        R   N(   t   _resolvet   Nonet   _fileidst
   isinstancet   compatt   string_typest   concatt   abspathst   Truet
   CorpusViewt   _read_sent_block(   R   R	   t
   categoriest   patht   enct   fileid(    (    sn   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/pros_cons.pyt   sents?   s    c         C   sТ   |  j  | | Г } | d k r* |  j } n t | t j Г rH | g } n  t g  |  j | t t Г D]* \ } } } |  j	 | |  j
 d | Г^ qa Г S(   s┐  
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        R   N(   R   R   R   R   R   R   R   R   R   R   t   _read_word_block(   R   R	   R   R   R   R   (    (    sn   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/pros_cons.pyt   wordsS   s    c         C   s|   g  } xo t  d Г D]a } | j Г  } | s1 q n  t j d | Г } | r | j |  j j | j d Г j Г  Г Г q q W| S(   Ni   s+   ^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>i   (	   t   ranget   readlinet   ret   matcht   appendR   t   tokenizet   groupt   strip(   R   t   streamR   t   it   linet   sent(    (    sn   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/pros_cons.pyR   g   s    /c         C   s1   g  } x$ |  j  | Г D] } | j | Г q W| S(   N(   R   t   extend(   R   R'   R   R*   (    (    sn   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/pros_cons.pyR   r   s    c         C   sH   | d  k	 r' | d  k	 r' t d Г В n  | d  k	 r@ |  j | Г S| Sd  S(   Ns'   Specify fileids or categories, not both(   R   t
   ValueErrorR	   (   R   R	   R   (    (    sn   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/pros_cons.pyR   x   s
    N(   t   __name__t
   __module__t   __doc__t   StreamBackedCorpusViewR   t   WordPunctTokenizerR   R   R   R   R   R   R   (    (    (    sn   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/pros_cons.pyR   "   s   		(   R/   R!   t   nltk.corpus.reader.apit   nltk.tokenizeR   R   R   (    (    (    sn   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/corpus/reader/pros_cons.pyt   <module>   s   