# coding: utf-8 # # Natural Language Toolkit: Twitter Tokenizer # # Copyright (C) 2001-2015 NLTK Project # Author: Christopher Potts # Ewan Klein (modifications) # Pierpaolo Pantone <> (modifications) # URL: # For license information, see LICENSE.TXT # """ Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this: 1. The tuple regex_strings defines a list of regular expression strings. 2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re. 3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of the class Tokenizer. 4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it is set to False, then the tokenizer will downcase everything except for emoticons. """ ###################################################################### from __future__ import unicode_literals import re from nltk.compat import htmlentitydefs, int2byte, unichr ###################################################################### # The following strings are components in the regular expression # that is used for tokenizing. It's important that phone_number # appears first in the final regex (since it can contain whitespace). # It also could matter that tags comes after emoticons, due to the # possibility of having text like # # <:| and some text >:) # # Most importantly, the final element should always be last, since it # does a last ditch whitespace-based tokenization of whatever is left. # ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ? # This particular element is used in a couple ways, so we define it # with a name: EMOTICONS = r""" (?: [<>]? [:;=8] # eyes [\-o\*\']? # optional nose [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth | [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth [\-o\*\']? # optional nose [:;=8] # eyes [<>]? | <3 # heart )""" # URL pattern due to John Gruber, modified by Tom Winzig. See # https://gist.github.com/winzig/8894715 URLS = r""" # Capture 1: entire matched URL (?: https?: # URL protocol and colon (?: /{1,3} # 1-3 slashes | # or [a-z0-9%] # Single letter or digit or '%' # (Trying not to match e.g. "URI::Escape") ) | # or # looks like domain name followed by a slash: [a-z0-9.\-]+[.] (?:[a-z]{2,13}) / ) (?: # One or more: [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[] | # or \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) | \([^\s]+?\) # balanced parens, non-recursive: (...) )+ (?: # End with: \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) | \([^\s]+?\) # balanced parens, non-recursive: (...) | # or [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars ) | # OR, the following to match naked domains: (?: (?\s]+>""" , # ASCII Arrows r"""[\-]+>|<[\-]+""" , # Twitter username: r"""(?:@[\w_]+)""" , # Twitter hashtags: r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" , # Remaining word types: r""" (?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes. | (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. | (?:[\w_]+) # Words without apostrophes or dashes. | (?:\.(?:\s*\.){1,}) # Ellipsis dots. | (?:\S) # Everything else that isn't whitespace. """ ) ###################################################################### # This is the core tokenizing regex: WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I | re.UNICODE) # The emoticon string gets its own regex so that we can preserve case for # them as needed: EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE) # These are for regularizing HTML entities to Unicode: ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);') ###################################################################### # Functions for converting html entities ###################################################################### def _str_to_unicode(text, encoding=None, errors='strict'): if encoding is None: encoding = 'utf-8' if isinstance(text, bytes): return text.decode(encoding, errors) return text def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): """ Remove entities from text by converting them to their corresponding unicode character. :param text: a unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8'). :param list keep: list of entity names which should not be replaced.\ This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``) and named entities (such as `` `` or ``>``). :param bool remove_illegal: If `True`, entities that can't be converted are\ removed. Otherwise, entities that can't be converted are kept "as is". :returns: A unicode string with the entities removed. See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py >>> from nltk.tokenize.casual import _replace_html_entities >>> _replace_html_entities(b'Price: £100') 'Price: \\xa3100' >>> print(_replace_html_entities(b'Price: £100')) Price: £100 >>> """ def _convert_entity(match): entity_body = match.group(3) if match.group(1): try: if match.group(2): number = int(entity_body, 16) else: number = int(entity_body, 10) # Numeric character references in the 80-9F range are typically # interpreted by browsers as representing the characters mapped # to bytes 80-9F in the Windows-1252 encoding. For more info # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML if 0x80 <= number <= 0x9f: return int2byte(number).decode('cp1252') except ValueError: number = None else: if entity_body in keep: return match.group(0) else: number = htmlentitydefs.name2codepoint.get(entity_body) if number is not None: try: return unichr(number) except ValueError: pass return "" if remove_illegal else match.group(0) return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding)) ###################################################################### class TweetTokenizer: r""" Tokenizer for tweets. >>> from nltk.tokenize import TweetTokenizer >>> tknzr = TweetTokenizer() >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" >>> tknzr.tokenize(s0) ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)" >>> tknzr.tokenize(s1) ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)'] >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn" >>> tknzr.tokenize(s2) ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn'] >>> s3 = "@Insanomania They do... Their mentality doesn't :(" >>> tknzr.tokenize(s3) ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':('] >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!" >>> tknzr.tokenize(s4) ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!'] >>> tknzr = TweetTokenizer(reduce_len=True) >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :(" >>> tknzr.tokenize(s5) ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':('] Examples using `strip_handles` and `reduce_len parameters`: >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!' >>> tknzr.tokenize(s6) [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.' >>> tknzr.tokenize(s7) [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.'] >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com' >>> tknzr.tokenize(s8) ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin', '@email', '.', 'com'] """ def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False): self.preserve_case = preserve_case self.reduce_len = reduce_len self.strip_handles = strip_handles def tokenize(self, text): """ :param text: str :rtype: list(str) :return: a tokenized list of strings; concatenating this list returns\ the original string if `preserve_case=False` """ # Fix HTML character entities: text = _replace_html_entities(text) # Remove username handles if self.strip_handles: text = remove_handles(text) # Normalize word lengthening if self.reduce_len: text = reduce_lengthening(text) # Tokenize: words = WORD_RE.findall(text) # Possibly alter the case, but avoid changing emoticons like :D into :d: if not self.preserve_case: words = list(map((lambda x : x if EMOTICON_RE.search(x) else x.lower()), words)) return words ###################################################################### # Normalization Functions ###################################################################### def reduce_lengthening(text): """ Replace repeated character sequences of length 3 or greater with sequences of length 3. """ pattern = re.compile(r"(.)\1{2,}") return pattern.sub(r"\1\1\1", text) def remove_handles(text): """ Remove Twitter username handles from text. """ pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+") return pattern.sub('', text) ###################################################################### # Tokenization Function ###################################################################### def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False): """ Convenience function for wrapping the tokenizer. """ return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles).tokenize(text) ###############################################################################