import re

import jieba.posseg

from .base import RegexDetector
from ..implement import CNNameImplement
from ..utils import CanonicalStringSet


class CNNameDetector(RegexDetector):
    """Use part-of-speech tagging to clean proper nouns out of the dirty
    ``text``. Disallow particular nouns by adding them to the
    ``CNNameDetector.disallowed_nouns`` set.
    """

    filth_cls = CNNameImplement
    disallowed_nouns = CanonicalStringSet(["skype"])

    def iter_filth(self, text):
        if not isinstance(self.disallowed_nouns, CanonicalStringSet):
            raise TypeError(
                'CNNameDetector.disallowed_nouns must be a CanonicalStringSet'
            )

        # Example input: "李伟知道伟大的中华人民共和国的创始人是毛泽东,朱德等人"
        # ("Li Wei knows the founders of the great People's Republic of China
        # were Mao Zedong, Zhu De, and others")
        #
        # Segment the text with jieba, tracking each token's character offsets
        # so detected names can be located in the original string. Tokens
        # tagged "nr" (jieba's part-of-speech tag for person names) are
        # collected as (word, begin, end) tuples.
        proper_nouns = []
        index = 0
        for token in jieba.posseg.cut(text):
            if token.flag == "nr":
                proper_nouns.append((token.word, index, index + len(token.word)))
            index += len(token.word)

        # Previous regex-based approach, kept for reference: escape any
        # lingering punctuation before building the pattern
        # (http://stackoverflow.com/a/4202559/564709).
        #
        #     if proper_nouns:
        #         re_list = []
        #         for proper_noun in proper_nouns:
        #             re_list.append(r'\b' + re.escape(str(proper_noun)) + r'\b')
        #         self.filth_cls.regex = re.compile('|'.join(re_list))
        #     else:
        #         self.filth_cls.regex = None
        #     return super(CNNameDetector, self).iter_filth(text)

        for word, beg, end in proper_nouns:
            yield CNNameImplement(beg=beg, end=end, text=word)
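

# A minimal, standalone sketch of the underlying technique (jieba
# part-of-speech tagging, keeping tokens tagged "nr" for person names),
# assuming only that jieba is installed. The module path used to run it and
# the sample sentence are illustrative assumptions, not part of this package;
# because this module uses relative imports, run it with
# ``python -m <package>.<module>`` rather than directly.
if __name__ == "__main__":
    sample = "李伟知道伟大的中华人民共和国的创始人是毛泽东,朱德等人"
    offset = 0
    for token in jieba.posseg.cut(sample):
        if token.flag == "nr":
            # Print each detected name with its character offsets in ``sample``.
            print(token.word, offset, offset + len(token.word))
        offset += len(token.word)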