a *Na0%@sTddlZddlmZddlmZddlmZddlmZm Z m Z m Z m Z ddl mZddlmZdd lmZdd lmZmZmZee ed d d Zee edddZeee ed ddZee ed ddZe ee edddZee eedddZee edddZe eeddd Z ed!d"d'eee eed$d%d&Z!dS)(N)IncrementalDecoder)Counter) lru_cache)DictListOptionalSetTuple) FREQUENCIES) is_suspiciously_successive_range)CoherenceMatches)is_multi_byte_encodingis_unicode_range_secondary unicode_range) iana_namereturncCst|rtdtd|j}|dd}t}tddD]@}|t |g}|rg}tD],\}}|D]}t||kr||q qq |S)z> Return inferred languages used with a unicode range. )r itemsrappend)r* languageslanguage characters characterr'r'r(unicode_range_languages(s  r1cCs<t|}d}|D]}d|vr|}q&q|dur4dgSt|S)z Single-byte encoding language association. Some code page are heavily linked to particular language(s). This function does the correspondence. NZLatin Latin Based)r)r1)rZunicode_rangesr*Zspecified_ranger'r'r(encoding_languages7sr3cCsb|ds&|ds&|ds&|dvr,dgS|ds>|dvrFdd gS|d sX|d vr^d gSgS) z Multi-byte encoding language association. Some code page are heavily linked to particular language(s). This function does the correspondence. Zshift_ iso2022_jpZeuc_j>cp932Japanesegb> big5hkscscp950big5ChinesezClassical Chinese iso2022_kr>cp949johabeuc_krKorean) startswith)rr'r'r(mb_encoding_languagesKsrB)r/rcCsVg}tD]D\}}d}t|}|D]}||vr$|d7}q$||dkr ||q |S)zE Return associated languages associated to given characters. rr g?)r r+lenr,)r/r-r.Zlanguage_charactersZcharacter_match_countcharacter_countr0r'r'r(alphabet_languages_s   rE)r.ordered_charactersrcs0|tvrtd|d}|D]}|t|vr2qt|dt||}t|t||d}|d|||||dfdd|Dd}fdd|Dd}t|dkr|dkr|d 7}qt|dkr|dkr|d 7}q|t|d ks|t|d kr|d 7}qq|t|S) aN Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language. The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit). Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.) z{} not availablerNcsg|] }|vqSr'r'.0e)characters_beforer'r( sz1characters_popularity_compare..Tcsg|] }|vqSr'r'rG)characters_afterr'r(rKsr g?)r ValueErrorrindexcountrC)r.rFZcharacter_approved_countr0Zcharacters_before_sourceZcharacters_after_sourceZbefore_match_countZafter_match_countr')rLrJr(characters_popularity_comparessR      rQ)decoded_sequencercCsi}|D]~}|durqt|}|dur,qd}|D]}t||dur4|}qPq4|dur\|}||vrr|||<q|||7<qt|S)a Given a decoded text sequence, return a list of str. Unicode range / alphabet separation. Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list; One containing the latin letters and the other hebrew. FN)isalpharr lowerr!values)rRZlayersr0r&Zlayer_target_rangeZdiscovered_ranger'r'r(alpha_unicode_splits,  rV)resultsrc Csi}g}|D]8}|D].}|\}}||vr4|g||<q|||qq |D],}||tt||t||dfqJt|ddddS)z This function merge results previously given by the function coherence_ratio. The return type is the same as coherence_ratio. rMcSs|dSNr r'xr'r'r(z(merge_coherence_ratios..Tkeyreverse)r,roundsumrCr )rWZper_language_ratiosmergeresultZ sub_resultr.ratior'r'r(merge_coherence_ratioss*    rei)maxsize皙?)rR threshold lg_inclusionrc Csg}g}d}|dur|d}d|vr0|dt|D]}t|}|}tdd|D} | dkrhq8dd|D} |pt| D]J} t| | } | |krqn| d kr|d 7}|| t | d f|d krq8qq8t |d dddS)z Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers. A layer = Character extraction by alphabets/ranges. rN,r2cSsg|] \}}|qSr'r'rHcor'r'r(rK r\z#coherence_ratio.. cSsg|] \}}|qSr'r'rkr'r'r(rKr\g?r rMcSs|dSrXr'rYr'r'r(r[%r\z!coherence_ratio..Tr]) splitremoverVr most_commonrarErQr,r`r ) rRrhrirWZlg_inclusion_listZsufficient_match_countZlayerZsequence_frequenciesrrrDZpopular_character_orderedr.rdr'r'r(coherence_ratios8    rs)rgN)"rcodecsr collectionsr functoolsrtypingrrrrr Zassetsr Zmdr modelsr utilsrrrstrr)r1r3rBrEfloatrQrVrersr'r'r'r(s2       <'