d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQgZdRZedSZ dTZ!eedUZ"dVZ#dWZ$ddXZ&dddYZ'dZd[ee(e(e(e(e(dd\ Z)dde(d]Z*ddd^Z+ddd_Z,eddd`Z-daZ.edbZ/dcZ0ddddZ1e2dekrHddfl3m4Z4m5Z5ddgl6m7Z7ddhl8m9Z9e4j:Z;e7e9j:Z<e5j:Z=e+e;ndS(is) Utility methods for Sentiment Analysis. i(tdeepcopyN(t CategorizedPlaintextCorpusReader(tload(t EMOTICON_RE(toutf_writer_compattextract_fieldss (?: ^(?:never|no|nothing|nowhere|noone|none|not| havent|hasnt|hadnt|cant|couldnt|shouldnt| wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint )$ ) | n'ts ^[.:;!?]$s:-)s:)s;)s:o)s:]s:3s:c)s:>s=]s8)s=)s:}s:^)s:-Ds:Ds8-Dt8Dsx-DtxDsX-DtXDs=-Ds=Ds=-3s=3s:-))s:'-)s:')s:*s:^*s>:Ps:-Ps:PsX-Psx-ptxptXPs:-ps:ps=ps:-bs:bs>:)s>;)s>:-)s<3s:Ls:-/s>:/s:Ss>:[s:@s:-(s:[s:-||s=Ls::(s:(s>.:\s;(csfd}|S(sH A timer decorator to measure execution performance of methods. c stj}||}tj}||}t|d}t|dd}tt|d}|dkr|dkr|dkrdjj|GHndjj|||GH|S(Nii<ii s[TIMER] {0}(): {:.3f} secondss[TIMER] {0}(): {1}h {2}m {3}s(ttimetinttroundtformatt__name__( targstkwtstarttresulttendttot_timethourstminstsecs(tmethod(se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pyttimedFs   $((RR((Rse/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pyttimerBscCsOi}|rt|}nx-|D]%}|t|k|dj|>> words = ['ice', 'police', 'riot'] >>> document = 'ice is melting due to global warming'.split() >>> sorted(extract_unigram_feats(document, words).items()) [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)] s contains({0})(t mark_negationtsetR(tdocumenttunigramsthandle_negationtfeaturestword((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytextract_unigram_feats_s  #cCsHi}x;|D]3}|tj|k|dj|d|d>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')] >>> document = 'ice is melting due to global warming'.split() >>> sorted(extract_bigram_feats(document, bigrams).items()) [('contains(global - warming)', True), ('contains(love - you)', False), ('contains(police - prevented)', False)] scontains({0} - {1})ii(tnltktbigramsR(RR%R!tbigr((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytextract_bigram_featsws 1cCs|st|}n|o1t|dttf}|rG|d}n|}t}xt|D]\}}tj|r| s|r|r| }q`q||cd7>> sent = "I didn't like this movie . It was bad .".split() >>> mark_negation(sent) ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.'] it_NEG( Rt isinstancettupletlisttFalset enumeratet NEGATION_REtsearchtCLAUSE_PUNCT_RE(Rtdouble_neg_fliptshallowtlabeledtdoct neg_scopetiR"((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pyRs$  c Ks4tj|d}d}|djtjd7}xt|D]}t||tr||}|dj|7}xt|D] }|dj|||7}qWqDt||tr|dj|7}xB||D]}|dj|7}qWqD|dj|||7}qDW|j |Wd QXd S( s4 Write the output of an analysis to a file. tats *** s{0} s%d/%m/%Y, %H:%Ms - **{0}:** s - {0}: {1} s - {0} s - **{0}:** {1} N( tcodecstopenRR tstrftimetsortedR)tdictR+twrite(tfilenametkwargstoutfilettexttkt dictionarytentry((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytoutput_markdowns !c Cs@d|fGHtj|d}tj||ddWdQXdS(sR Store `content` in `filename`. Can be used to store a SentimentAnalyzer. tSavingtwbtprotocoliN(R8R9tpickletdump(tcontentR>t storage_file((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pyt save_files cCswtjdtj|| s3|t|krBt|}n|td| }|td||!}||fS(st Randomly split `n` instances of the dataset into train and test sets. :param all_instances: a list of instances (e.g. documents) that will be split. :param n: the number of instances to consider (in case we want to use only a subset). :return: two lists of instances. Train set is 8/10 of the total and test set is 2/10 of the total. i90g?(trandomtseedtshuffletlenR (t all_instancestnt train_setttest_set((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytsplit_train_tests  cCsyddlj}Wntk r2tdnX|jdddd|j}|jj|j||ddd |jd d d d |j dd|r|j ||ddn|r|j dddg|ddn|j d|j dS(NisPThe plot function requires matplotlib to be installed.See http://matplotlib.org/taxistytnbinsitrotcolortredtyming333333tymaxg333333?tpaditrotationtverticaliit horizontalg?(tmatplotlib.pyplottpyplott ImportErrortlocator_paramstaxestyaxistgridtplottylimt tight_layouttxtickstytickstmarginstshow(tx_valuesty_valuestx_labelsty_labelstpltRg((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pyt _show_plots    " tutf8treplacec  Cstj|d|} t||||\} }| j|| tkrUg}nd}x| D]}tj|}t||}y7||jd}|tkrt j d|rwbqn|tkrt j d|rwbqn|tkr7t j |}|r7t |t@r4t |t@r4wbq4q7n| tkrtt jddt jd|||jdtlabeltword_tokenizertsent_tokenizert skip_headerttweetstcsvfileRR6ttweet_idRAtsenttwRRtxt unicode_row((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytparse_tweets_setJsF  %   "  . c!Csddlm}ddlm}ddlm}m}|dt}|d"k rdt |d}nddg}|j d } d } t | | |d ||j d } d } t | | |d |t | ddd|} t | ddd|}t |\}}t | \}}||}||}|}g|j|D] }|^qG}|j|dd}|jtd||jg|D]}|d^qdddd}|jtd||j|}|j|}|j||}y|jWntk rdGHnX|j|}|rg|jD]}|j^q<} t|dddt|jd|jjd| d |d!|nd"S(#s  Train and test Naive Bayes classifier on 10000 tweets, tokenized using TweetTokenizer. Features are composed of: - 1000 most frequent unigrams - 100 top bigrams (using BigramAssocMeasures.pmi) :param trainer: `train` method of a classifier. :param n_instances: the number of total tweets that have to be used for training and testing. Tweets will be equally split between positive and negative. :param output: the output file where results have to be reported. i(tTweetTokenizer(tSentimentAnalyzer(ttwitter_samplest stopwordst preserve_caseitidRAspositive_tweets.jsonspositive_tweets.csvRsnegative_tweets.jsonsnegative_tweets.csvRtnegRtposttop_niRiidtmin_freqi R%sKYour classifier does not provide a show_most_informative_features() method.tDatasettlabeled_tweetst Classifiert TokenizertFeatstResultst InstancesN(t nltk.tokenizeRtsentiment_analyzerRt nltk.corpusRRR,RR tabspathRRRVt all_wordstunigram_word_featstadd_feat_extractorR#tbigram_collocation_featsR'tapply_featuresttraintshow_most_informative_featurestAttributeErrortevaluatetfeat_extractorsRREttypet __class__(!ttrainert n_instancestoutputRRRRt tokenizerRt positive_jsont positive_csvt negative_jsont negative_csvtneg_docstpos_docsttrain_pos_docst test_pos_docsttrain_neg_docst test_neg_docsttraining_tweetsttesting_tweetstsentim_analyzerR"Rt unigram_featsRtbigram_collocs_featst training_setRUt classifiertresultstftextr((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pyt demo_tweetssN     "#  cCsddlm}ddlm}|dk r?t|d}ng|jd| D]!}t|j|df^qS}g|jd| D]!}t|j|df^q}t |\} } t |\} } | | } | | }|}|j | }|j |dd}|j t d ||j| }|j|}|j||}y|jWntk rd GHnX|j|}|rg|jD]}|j^q}t|d d d t|jddd|d|d|ndS(s Train classifier on all instances of the Movie Reviews dataset. The corpus has been preprocessed using the default sentence tokenizer and WordPunctTokenizer. Features are composed of: - most frequent unigrams :param trainer: `train` method of a classifier. :param n_instances: the number of total reviews that have to be used for training and testing. Reviews will be equally split between positive and negative. :param output: the output file where results have to be reported. i(t movie_reviews(RiRRRiRsKYour classifier does not provide a show_most_informative_features() method.Rt Movie_reviewsRRtWordPunctTokenizerRRRN(RRRRRR tfileidsR+twordsRVRRRR#RRRRRRRRER(RRRRRtpos_idRtneg_idRRRRRt training_docst testing_docsRRRRRURRRR((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytdemo_movie_reviewss6 ;;     cCsddlm}ddlm}|dk r?t|d}ng|jdd| D]}|df^qV}g|jdd| D]}|df^q}t|\} } t|\} } | | } | | }|}|jg| D]}t |^q}|j |dd }|j t d ||j | }|j |}|j||}y|jWntk rd GHnX|j|}|tkrt|d n|rg|jD]}|j^q}t|d ddt|jddd|d|d|n|S(s Train and test a classifier on instances of the Subjective Dataset by Pang and Lee. The dataset is made of 5000 subjective and 5000 objective sentences. All tokens (words and punctuation marks) are separated by a whitespace, so we use the basic WhitespaceTokenizer to parse the data. :param trainer: `train` method of a classifier. :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file. :param n_instances: the number of total sentences that have to be used for training and testing. Sentences will be equally split between positive and negative. :param output: the output file where results have to be reported. i(R(t subjectivityit categoriestsubjtobjRiRsKYour classifier does not provide a show_most_informative_features() method.ssa_subjectivity.pickleRRRRtWhitespaceTokenizerRRRN(RRRRRR tsentsRVRRRRR#RRRRRR}RMRRRER(Rt save_analyzerRRRRRt subj_docstobj_docsttrain_subj_docsttest_subj_docsttrain_obj_docst test_obj_docsRRRR4t all_words_negRRRURRRR((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytdemo_subjectivity s< //   (    cCsddlm}ddlm}|j}ytd}Wn-tk rkdGHdGHt|jt }nXg|j |D]}|j ^q|}|j |GHdS(s Classify a single sentence as subjective or objective using a stored SentimentAnalyzer. :param text: a sentence whose subjectivity has to be classified. i(tNaiveBayesClassifier(tregexpssa_subjectivity.pickles4Cannot find the sentiment analyzer you want to load.s.Training a new one using NaiveBayesClassifier.N( t nltk.classifyRRRRRt LookupErrorRRR}Rtlowertclassify(RARRRRR"ttokenized_text((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytdemo_sent_subjectivityIs  (c Cseddlm}ddlm}|j}d}d}g|j|D]}|j^qH}ttt |} g} xs|D]k}||j kr|d7}| j dq||j kr|d7}| j dq| j dqW||krdGHn(||krdGHn||kr0dGHn|t krat| | d |d dddgnd S( s Basic example of sentiment classification using Liu and Hu opinion lexicon. This function simply counts the number of positive, negative and neutral words in the sentence and classifies it depending on which polarity is more represented. Words that do not appear in the lexicon are considered as neutral. :param sentence: a sentence whose polarity has to be classified. :param plot: if True, plot a visual representation of the sentence polarity. i(topinion_lexicon(ttreebankiitPositivetNegativetNeutralRsRtN(RRRRtTreebankWordTokenizerRR R+trangeRQtpositiveRtnegativeR}Rv( tsentenceRjRRRt pos_wordst neg_wordsR"ttokenized_sentRRX((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytdemo_liu_hu_lexicon^s0  (       cCs+ddlm}|}|j|GHdS(s~ Output polarity scores for a text using Vader approach. :param text: a text whose polarity has to be evaluated. i(tSentimentIntensityAnalyzerN(tvaderRtpolarity_scores(RARtvader_analyzer((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pytdemo_vader_instances c( Cs$ddlm}ddlm}ddlm}ddlm}m}m }m }|d k rqt |d}nddg} |j d } d } t| | | d td ||j d } d} t| | | d td |t| dd}t| dd}t|\}}t|\}}||}||}|}|t}|t}g}g}t}d}xt|D]\}\}}|j|||j||j||j|d} | dkrd}!nd}!|d7}|j|!||!j|qWi}"x|D]}|||}#|#|"d<|||||}$|$|"dj|<|||||}%|%|"dj|<|||||}&|&|"dj|t__doc__tcopyRR8RR~RIRNRRR R$RRt nltk.dataRtnltk.tokenize.casualRtnltk.twitter.commonRRtNEGATIONtcompiletVERBOSER.t CLAUSE_PUNCTR0RRRRR,R#R'RRERMRRVRvR}RRRRRR RR R>RRRR@tnltk.classify.scikitlearnRAt sklearn.svmRBRt naive_bayestsvmtmaxent(((se/private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/sentiment/util.pyt sr         $!!! !!   (     D @M5<  ) P