# coding: utf-8
#
# Twitter-aware tokenizer for NLTK (nltk/tokenize/casual.py).

from __future__ import unicode_literals

import re

from nltk.compat import htmlentitydefs, int2byte, unichr

######################################################################
# Regular-expression components
######################################################################

EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""

URLS = r"""                 # Capture 1: entire matched URL
  (?:
  https?:                   # URL protocol and colon
    (?:
      /{1,3}                # 1-3 slashes
      |                     #   or
      [a-z0-9%]             # Single letter or digit or '%'
                            # (Trying not to match e.g. "URI::Escape")
    )
    |                       #   or
                            # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                       # One or more:
    [^\s()<>{}\[\]]+        # Run of non-space, non-()<>{}[]
    |                       #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)             # balanced parens, non-recursive: (...)
  )+
  (?:                       # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)             # balanced parens, non-recursive: (...)
    |                       #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
  )
  |                         # OR, the following to match naked domains:
  (?:
    (?<!@)                  # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)                   # not succeeded by a @, avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [\-\s.]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [\-\s.\)]*
      )?
      \d{3}          # exchange
      [\-\s.]*
      \d{4}          # base
    )""",
    # ASCII emoticons:
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII arrows:
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # Remaining word types:
    r"""
    (?:[a-z][a-z'\-_]+[a-z])       # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
)

# The core tokenizing regex, built from the components above:
WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS),
                     re.VERBOSE | re.I | re.UNICODE)

# The emoticon string gets its own regex so that case can be preserved
# for emoticons even when `preserve_case=False`:
EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)

# For regularizing HTML entities to Unicode:
ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')


######################################################################
# Functions for converting HTML entities
######################################################################

def _str_to_unicode(text, encoding=None, errors='strict'):
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    """
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
        `encoding` (which defaults to 'utf-8').

    :param list keep: list of entity names which should not be replaced.
        This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
        and named entities (such as ``&nbsp;`` or ``&gt;``).

    :param bool remove_illegal: If `True`, entities that can't be converted
        are removed. Otherwise, entities that can't be converted are kept
        "as is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \\xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
        >>>
    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are
                # typically interpreted by browsers as representing the
                # characters mapped to bytes 80-9F in the Windows-1252
                # encoding. For more info see:
                # http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= number <= 0x9f:
                    return int2byte(number).decode('cp1252')
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            else:
                number = htmlentitydefs.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return unichr(number)
            except ValueError:
                pass
        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


######################################################################
# Tokenizer class
######################################################################

class TweetTokenizer:
    """
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0)
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
        >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)"
        >>> tknzr.tokenize(s1)
        ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)']
        >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn"
        >>> tknzr.tokenize(s2)
        ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn']
        >>> s3 = "@Insanomania They do... Their mentality doesn't :("
        >>> tknzr.tokenize(s3)
        ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':(']
        >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!"
        >>> tknzr.tokenize(s4)
        ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!']
        >>> tknzr = TweetTokenizer(reduce_len=True)
        >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :("
        >>> tknzr.tokenize(s5)
        ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':(']

    Examples using the `strip_handles` and `reduce_len` parameters:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s6)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
        >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.'
        >>> tknzr.tokenize(s7)
        [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.']
        >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com'
        >>> tknzr.tokenize(s8)
        ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin', '@email', '.', 'com']
    """

    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; concatenating this list
            returns the original string if `preserve_case=False`.
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles:
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening:
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Tokenize:
        words = WORD_RE.findall(text)
        # Possibly alter the case, but avoid changing emoticons like :D
        # into :d:
        if not self.preserve_case:
            words = list(map((lambda x: x if EMOTICON_RE.search(x)
                              else x.lower()), words))
        return words


######################################################################
# Normalization functions
######################################################################

def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with
    sequences of length 3.
    """
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+")
    return pattern.sub('', text)


######################################################################
# Tokenization function
######################################################################

def casual_tokenize(text, preserve_case=True, reduce_len=False,
                    strip_handles=False):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(preserve_case=preserve_case,
                          reduce_len=reduce_len,
                          strip_handles=strip_handles).tokenize(text)
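

######################################################################
# Usage sketch
######################################################################

# A minimal, illustrative demo of the public API defined above, not
# part of the published module. The sample strings and the expected
# outputs in the comments are assumptions derived from the regexes and
# doctests in this file.

if __name__ == '__main__':
    # Default settings: case is preserved, @-handles are kept, and
    # lengthened words are left as written.
    tknzr = TweetTokenizer()
    print(tknzr.tokenize("@someuser Sooooo coool!!! :-) http://example.com/x"))
    # e.g. ['@someuser', 'Sooooo', 'coool', '!', '!', '!', ':-)',
    #       'http://example.com/x']

    # Normalizing settings: lowercase everything except emoticons,
    # squash runs of 3+ repeated characters down to exactly 3
    # ("Sooooo" -> "sooo"), and strip @-handles before tokenizing.
    tknzr = TweetTokenizer(preserve_case=False, reduce_len=True,
                           strip_handles=True)
    print(tknzr.tokenize("@someuser Sooooo coool!!! :-)"))
    # e.g. ['sooo', 'coool', '!', '!', '!', ':-)']

    # casual_tokenize is the one-shot functional wrapper around the class.
    print(casual_tokenize("Check this out -> http://example.com <3"))
    # e.g. ['Check', 'this', 'out', '->', 'http://example.com', '<3']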