from re import finditer


def string_span_tokenize(s, sep):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each occurrence of *sep*.

        >>> from nltk.tokenize.util import string_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(string_span_tokenize(s, " "))
        [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
        (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :rtype: iter(tuple(int, int))
    """
    if len(sep) == 0:
        raise ValueError("Token delimiter must not be empty")
    left = 0
    while True:
        try:
            right = s.index(sep, left)
            if right != 0:
                yield left, right
        except ValueError:
            if left != len(s):
                yield left, len(s)
            break

        left = right + len(sep)


def regexp_span_tokenize(s, regexp):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each successive match of *regexp*.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(WhitespaceTokenizer().span_tokenize(s))
        [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
        (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    left = 0
    for m in finditer(regexp, s):
        right, next_ = m.span()
        if right != 0:
            yield left, right
        left = next_
    yield left, len(s)


def spans_to_relative(spans):
    r"""
    Return a sequence of relative spans, given a sequence of spans.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> from nltk.tokenize.util import spans_to_relative
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s)))
        [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
        (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    prev = 0
    for left, right in spans:
        yield left - prev, right - left
        prev = right
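

# --- Usage sketch (not part of the original module) ---
# A minimal demonstration, assuming this file is run directly: it chains
# regexp_span_tokenize() with spans_to_relative() on a sample string adapted
# from the doctests above, showing how absolute (start, end) offsets become
# (gap-since-previous-token, token-length) pairs.
if __name__ == "__main__":
    s = "Good muffins cost $3.88\nin New York."

    # Absolute (start, end) offsets of whitespace-delimited tokens.
    absolute = list(regexp_span_tokenize(s, r"\s+"))
    print(absolute)
    # [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36)]

    # Each span rewritten relative to the end of the previous token.
    print(list(spans_to_relative(absolute)))
    # [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5)]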