B /WbD@sJddlmZddlmZmZddlmZmZddlm Z m Z m Z m Z m Z mZmZmZmZmZmZmZmZmZmZmZGdddZGdd d eZGd d d eZGd d d eZGdddeZGdddeZGdddeZGdddeZ GdddeZ!eddee"ee"e#dddZ$eddd$e"e%e#e%d d!d"Z&d#S)%) lru_cache)ListOptional)COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjk is_emoticon is_hangul is_hiragana is_katakanais_latinis_punctuation is_separator is_symbolis_thaiis_unprintable remove_accent unicode_rangec@sPeZdZdZeedddZeddddZddd d Ze e dd d Z dS) MessDetectorPluginzy Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods. ) characterreturncCstdS)z@ Determine if given character should be fed in. N)NotImplementedError)selfrrs r>c@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)SuperWeirdWordPluginN)rcCs:d|_d|_d|_d|_d|_d|_d|_d|_d|_dS)NrF) _word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr._bad_character_count_buffer_buffer_accent_count)rrrrr0szSuperWeirdWordPlugin.__init__)rrcCsdS)NTr)rrrrrrszSuperWeirdWordPlugin.eligiblecCs|r|j|7_t|r,|jd7_|jdkrt|dksJt|rt|dkrt|dkrt|dkrt |dkrt |dkrd|_dS|jsdS| st |st |r|jr|jd7_t|j}|j|7_|dkr6|j|dkrd|_t|jdr6|jdr6|jd7_d|_|dkr\|jr\|jd7_d|_|jr|jd7_|jt|j7_d|_d|_d|_d |_n6|d kr|dkrt|rd|_|j|7_dS) NrFTg(\?rFr>~>|_=-<)r6rMrrNrKrr r rrrrArrrGlenr.rJr=rIrHrLr3r)rrZ buffer_lengthrrrr sR           zSuperWeirdWordPlugin.feedcCs4d|_d|_d|_d|_d|_d|_d|_d|_dS)NrFFr)rMrJrKrHrGr.rLrI)rrrrr!=szSuperWeirdWordPlugin.resetcCs$|jdkr|jdkrdS|j|jS)N rg)rGrIrLr.)rrrrr"GszSuperWeirdWordPlugin.ratio) r#r$r%r0r'r(rr r!r)r*r"rrrrrEs 6 rEc@s^eZdZdZddddZeedddZeddd d Zddd d Z e e dd dZ dS)CjkInvalidStopPluginu GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected. Searching for the overuse of '丅' and '丄'. N)rcCsd|_d|_dS)Nr)_wrong_stop_count_cjk_character_count)rrrrr0UszCjkInvalidStopPlugin.__init__)rrcCsdS)NTr)rrrrrrYszCjkInvalidStopPlugin.eligiblecCs4|dkr|jd7_dSt|r0|jd7_dS)N>丄丅r)r\r r])rrrrrr \s zCjkInvalidStopPlugin.feedcCsd|_d|_dS)Nr)r\r])rrrrr!cszCjkInvalidStopPlugin.resetcCs|jdkrdS|j|jS)Ng)r]r\)rrrrr"gs zCjkInvalidStopPlugin.ratio) r#r$r%r&r0r'r(rr r!r)r*r"rrrrr[Osr[c@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)ArchaicUpperLowerPluginN)rcCs.d|_d|_d|_d|_d|_d|_d|_dS)NFrT)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr._last_alpha_seen_current_ascii_only)rrrrr0osz ArchaicUpperLowerPlugin.__init__)rrcCsdS)NTr)rrrrrr|sz ArchaicUpperLowerPlugin.eligiblecCs$|ot|}|dk}|r|jdkr|jdkrV|dkrV|jdkrV|j|j7_d|_d|_d|_d|_|j d7_ d|_dS|jdkrt |dkrd|_|jdk r| r|j s| r|j r|jdkr|jd7_d|_qd|_nd|_|j d7_ |jd7_||_dS)NFr@rTr2) r6r rcr3rgrerdrfrbr.r r=islower)rrZ is_concernedZ chunk_seprrrr s8      zArchaicUpperLowerPlugin.feedcCs.d|_d|_d|_d|_d|_d|_d|_dS)NrFT)r.rcrdrerfrbrg)rrrrr!szArchaicUpperLowerPlugin.resetcCs|jdkrdS|j|jS)Nrg)r.re)rrrrr"s zArchaicUpperLowerPlugin.ratio) r#r$r%r0r'r(rr r!r)r*r"rrrrrans  * rai)maxsize)rCrDrcCs~|dks|dkrdS||kr dSd|kr4d|kr4dSd|ksDd|krHdSd|ksXd|krld|kshd|krldS|d|d}}x"|D]}|tkrq||krdSqW|dk|dk}}|s|rd |ksd |krdS|r|rdSd |ksd |kr"d |ksd |kr dS|d ks|d kr"dSd |ksJd |ksJ|d krz|d krzd |ks^d |krbdSd|ksvd|krzdSdS)za Determine if two Unicode range seen next to each other can be considered as suspicious. NTFZLatinZ EmoticonsZ Combining )HiraganaKatakanaCJKZHangulz Basic Latin)rmrlZ PunctuationZForms)splitr)rCrDZkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsrrrrBsP    rBi皙?F)decoded_sequencemaximum_thresholddebugrc CsddtD}t|d}d}|dkr0d}n|dkr>d}nd }x|t|d t|D]f\}}x |D]} | |rd| |qdW|d kr||d ks||dkrVtd d |D}||krVPqVW|rx|D]} t| j | j qWt |dS)zw Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. cSsg|] }|qSrr).0Zmd_classrrr szmess_ratio..rgi irh rcss|] }|jVqdS)N)r")rudtrrr szmess_ratio..) r__subclasses__rYziprangerr sumprint __class__r"round) rrrsrtZ detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcrindexdetectorrzrrr mess_ratios*      rN)rqF)' functoolsrtypingrrZconstantrrutilsrr r r r r rrrrrrrrrrrr+r4r7r:r>rEr[rar'r(rBr*rrrrrs$ H"/%4ZL D