B _oa C@s<ddlmZddlmZmZddlmZmZddlm Z m Z m Z m Z m Z mZmZmZmZmZmZmZmZmZmZGdddZGdd d eZGd d d eZGd d d eZGdddeZGdddeZGdddeZGdddeZGdddeZ ee!ee!e"dddZ#eddd#e!e$e"e$dd d!Z%d"S)$) lru_cache)ListOptional)COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjk is_emoticon is_hangul is_hiragana is_katakanais_latinis_punctuation is_separator is_symbolis_thai remove_accent unicode_rangec@sPeZdZdZeedddZeddddZddd d Ze e dd d Z dS) MessDetectorPluginzy Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods. ) characterreturncCstdS)z@ Determine if given character should be fed in. N)NotImplementedError)selfrr>-=<)r5joinrLrrMrJrr r rrrr9rrrGlenr-rIrHrKr2r)rrZ buffer_lengthrrrr sH          zSuperWeirdWordPlugin.feedcCs.d|_d|_d|_d|_d|_d|_d|_dS)NrFFr)rLrIrJrHrGr-rK)rrrrr :szSuperWeirdWordPlugin.resetcCs|jdkrdS|j|jS)N g)rGrKr-)rrrrr!Cs zSuperWeirdWordPlugin.ratio) r"r#r$r/r&r'rrr r(r)r!rrrrrEs  / rEc@s^eZdZdZddddZeedddZeddd d Zddd d Z e e dd dZ dS)CjkInvalidStopPluginu GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected. Searching for the overuse of '丅' and '丄'. N)rcCsd|_d|_dS)Nr)_wrong_stop_count_cjk_character_count)rrrrr/QszCjkInvalidStopPlugin.__init__)rrcCsdS)NTr)rrrrrrUszCjkInvalidStopPlugin.eligiblecCs4|dkr|jd7_dSt|r0|jd7_dS)N)u丅u丄r)rXr rY)rrrrrrXs zCjkInvalidStopPlugin.feedcCsd|_d|_dS)Nr)rXrY)rrrrr _szCjkInvalidStopPlugin.resetcCs|jdkrdS|j|jS)Ng)rYrX)rrrrr!cs zCjkInvalidStopPlugin.ratio) r"r#r$r%r/r&r'rrr r(r)r!rrrrrWKsrWc@sZeZdZddddZeedddZedddd Zddd d Ze e dd d Z dS)ArchaicUpperLowerPluginN)rcCs.d|_d|_d|_d|_d|_d|_d|_dS)NFrT)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr-_last_alpha_seen_current_ascii_only)rrrrr/ksz ArchaicUpperLowerPlugin.__init__)rrcCsdS)NTr)rrrrrrxsz ArchaicUpperLowerPlugin.eligiblecCs$|ot|}|dk}|r|jdkr|jdkrV|dkrV|jdkrV|j|j7_d|_d|_d|_d|_|j d7_ d|_dS|jdkrt |dkrd|_|jdk r| r|j s| r|j r|jdkr|jd7_d|_qd|_nd|_|j d7_ |jd7_||_dS)NFr@rTr1) r5r r]r2rar_r^r`r\r-r r>islower)rrZ is_concernedZ chunk_seprrrr{s8      zArchaicUpperLowerPlugin.feedcCs.d|_d|_d|_d|_d|_d|_d|_dS)NrFT)r-r]r^r_r`r\ra)rrrrr szArchaicUpperLowerPlugin.resetcCs|jdkrdS|j|jS)Nrg)r-r_)rrrrr!s zArchaicUpperLowerPlugin.ratio) r"r#r$r/r&r'rrr r(r)r!rrrrr[js  * r[)rCrDrcCsP|dks|dkrdS||kr dSd|kr4d|kr4dSd|ksDd|krHdS|d|d}}x"|D]}|tkrrqd||krddSqdW|dk|dk}}|s|rd|ksd|krdS|r|rdSd |ksd |krd|ksd|krdS|d ks|d krdSd|ksd|ks|d krL|d krLd |ks0d |kr4dSd |ksHd |krLdSdS)za Determine if two Unicode range seen next to each other can be considered as suspicious. NTFZLatinZ Emoticons )HiraganaKatakanaCJKZHangulz Basic Latin)rfreZ PunctuationZForms)splitr)rCrDZkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsrrrrBsJ    rBi)maxsize皙?F)decoded_sequencemaximum_thresholddebugrc CsddtD}t|}d}|dkr,d}n|dkr:d}nd}xzt|td |D]f\}}x |D]} | |r^| |q^W|d kr||d ks||d krPtd d|D}||krPPqPW|rx|D]} t| j | j qWt |d S) zw Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. cSsg|] }|qSrr).0Zmd_classrrr szmess_ratio..gi irbrrcSsg|] }|jqSr)r!)rodtrrrrps) r__subclasses__rUziprangerrsumprint __class__r!round) rlrmrnZ detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcrindexdetectorrsrrr mess_ratios*     rN)rkF)& functoolsrtypingrrZconstantrrutilsrr r r r r rrrrrrrrrrr*r3r6r;r?rErWr[r&r'rBr)rrrrrs" D"/$6PM <