B `g0@sddlZddlmZddlmZddlmZddlmZm Z m Z m Z ddl m Z ddlmZmZddlmZdd lmZdd lmZmZmZGd d d ZGd ddZe eefZe eZGdddZ eZ!dS)N)aliases)sha256)dumps)OptionalListTupleSet)Counter)subcompile)TOO_BIG_SEQUENCE) mess_ratio) iana_nameis_multi_byte_encoding unicode_rangec@s eZdZd@eeeedeedddZedddZ edd d Z e edd d Z e edd dZ e edddZedddZedddZdddddZe edddZe eedddZe edddZe edddZe eedd d!Ze edd"d#Ze edd$d%Ze edd&d'Ze edd(d)Ze edd*d+Ze edd,d-Ze eddd.d/Ze edd0d1Z e eedd2d3Z!e eedd4d5Z"ddd6d7Z#ddd8d9Z$dAeed;dd?Z&dS)B CharsetMatchNCoherenceMatches)payloadguessed_encodingmean_mess_ratiohas_sig_or_bom languagesdecoded_payloadcCsF||_||_||_||_||_d|_g|_d|_d|_d|_ ||_ dS)Ng) _payload _encoding_mean_mess_ratio _languages_has_sig_or_bom_unicode_ranges_leavesZ_mean_coherence_ratio_output_payload_output_encoding_string)selfrrrrrrr$t/private/var/folders/7j/8686xlfs15q3tgljmghtvg0r0000gn/T/pip-target-isidps9b/lib/python/charset_normalizer/models.py__init__s zCharsetMatch.__init__)returncCs>t|ts&tdt|jt|j|j|jko<|j|jkS)Nz&__eq__ cannot be invoked on {} and {}.) isinstancer TypeErrorformatstr __class__encoding fingerprint)r#otherr$r$r%__eq__(s zCharsetMatch.__eq__cCs>t|tstt|j|j}|dkr2|j|jkS|j|jkS)zQ Implemented to make sorted available upon CharsetMatches items. g{Gz?)r(r ValueErrorabschaos coherence)r#r/Zchaos_differencer$r$r%__lt__-s   zCharsetMatch.__lt__cCstdttt|dS)z Check once again chaos in decoded text, except this time, with full content. Use with caution, this can be very slow. Notice: Will be removed in 3.0 z=chaos_secondary_pass is deprecated and will be removed in 3.0g?)warningswarnDeprecationWarningr r+)r#r$r$r%chaos_secondary_pass<s z!CharsetMatch.chaos_secondary_passcCstdtdS)zy Coherence ratio on the first non-latin language detected if ANY. Notice: Will be removed in 3.0 z)r*r-r.)r#r$r$r%__repr__dszCharsetMatch.__repr__)r/r'cCs8t|tr||kr"td|jd|_|j|dS)Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r(rr1r*r,r"rappend)r#r/r$r$r% add_submatchgszCharsetMatch.add_submatchcCs|jS)N)r)r#r$r$r%r-nszCharsetMatch.encodingcCsHg}x>tD]2\}}|j|kr,||q|j|kr||qW|S)z Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855. )ritemsr-rC)r#Z also_known_asupr$r$r%encoding_aliasesrs   zCharsetMatch.encoding_aliasescCs|jS)N)r)r#r$r$r%bomszCharsetMatch.bomcCs|jS)N)r)r#r$r$r%byte_order_markszCharsetMatch.byte_order_markcCsdd|jDS)z Return the complete list of possible languages found in decoded sequence. Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'. cSsg|] }|dqS)rr$).0er$r$r% sz*CharsetMatch.languages..)r)r#r$r$r%rszCharsetMatch.languagescCsp|jsbd|jkrdSddlm}m}t|jr8||jn||j}t|dksVd|krZdS|dS|jddS)z Most probable language found in decoded sequence. If none were detected or inferred, the property will return "Unknown". asciiZEnglishr)mb_encoding_languagesencoding_languagesz Latin BasedUnknown)rcould_be_from_charsetZcharset_normalizer.cdrOrPrr-len)r#rOrPrr$r$r%languages zCharsetMatch.languagecCs|jS)N)r)r#r$r$r%r3szCharsetMatch.chaoscCs|js dS|jddS)Ngr)r)r#r$r$r%r4szCharsetMatch.coherencecCst|jdddS)Nd)ndigits)roundr3)r#r$r$r% percent_chaosszCharsetMatch.percent_chaoscCst|jdddS)NrVrW)rX)rYr4)r#r$r$r%percent_coherenceszCharsetMatch.percent_coherencecCs|jS)z+ Original untouched bytes. )r)r#r$r$r%rawszCharsetMatch.rawcCs|jS)N)r)r#r$r$r%submatchszCharsetMatch.submatchcCst|jdkS)Nr)rSr)r#r$r$r% has_submatchszCharsetMatch.has_submatchcCsJ|jdk r|jSt}xt|D]}|t|q Wtt||_|jS)N)rsetr+addrsortedlist)r#Zdetected_ranges characterr$r$r% alphabetss zCharsetMatch.alphabetscCs|jgdd|jDS)z The complete list of encoding that output the exact SAME str result and therefore could be the originating encoding. This list does include the encoding available in property 'encoding'. cSsg|] }|jqSr$)r-)rKmr$r$r%rMsz6CharsetMatch.could_be_from_charset..)rr)r#r$r$r%rRsz"CharsetMatch.could_be_from_charsetcCs|S)z> Kept for BC reasons. Will be removed in 3.0. r$)r#r$r$r%firstszCharsetMatch.firstcCs|S)z> Kept for BC reasons. Will be removed in 3.0. r$)r#r$r$r%bestszCharsetMatch.bestutf_8)r-r'cCs2|jdks|j|kr,||_t||d|_|jS)z Method to get re-encoded bytes payload using given target encoding. Default to UTF-8. Any errors will be simply ignored by the encoder NOT replaced. Nreplace)r!r+encoder )r#r-r$r$r%outputszCharsetMatch.outputcCst|S)zw Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one. )rrk hexdigest)r#r$r$r%r.szCharsetMatch.fingerprint)N)rh)'__name__ __module__ __qualname__bytesr+floatboolrr&r0r5propertyr9r:r r?rArBrDr-rrHrIrJrrTr3r4rZr[r\r]r^rdrRrfrgrkr.r$r$r$r%rs^     rc@s|eZdZdZdeedddZddZedd d Ze dd d Z edd ddZ e ddddZ e ddddZdS)CharsetMatchesz Container with every CharsetMatch items ordered by default from most probable to the less one. Act like a list(iterable) but does not implements all related methods. N)resultscCs|r t|ng|_dS)N)ra_results)r#rur$r$r%r&szCharsetMatches.__init__ccsx|jD] }|VqWdS)N)rv)r#resultr$r$r%__iter__s zCharsetMatches.__iter__)r'cCsNt|tr|j|St|trFt|d}x|jD]}||jkr0|Sq0WtdS)z Retrieve a single item either by its position or encoding name (alias may be used here). Raise KeyError upon invalid index or encoding not present in results. FN)r(intrvr+rrRKeyError)r#itemrwr$r$r% __getitem__s      zCharsetMatches.__getitem__cCs t|jS)N)rSrv)r#r$r$r%__len__szCharsetMatches.__len__)r{r'cCsrt|tstdt|jt|jtkrVx(|j D]}|j |j kr4| |dSq4W|j |t |j |_ dS)z~ Insert a single match. Will be inserted accordingly to preserve sort. Can be inserted as a submatch. z-Cannot append instance '{}' to CharsetMatchesN)r(rr1r*r+r,rSr\r rvr.rDrCra)r#r{matchr$r$r%rCs     zCharsetMatches.appendrcCs|js dS|jdS)zQ Simply return the first match. Strict equivalent to matches[0]. Nr)rv)r#r$r$r%rg)szCharsetMatches.bestcCs|S)zP Redundant method, call the method best(). Kept for BC reasons. )rg)r#r$r$r%rf1szCharsetMatches.first)N)rmrnro__doc__rrr&rxr|ryr}rCrrgrfr$r$r$r%rtsrtc @sXeZdZeeeeeeeeeeeeeeed ddZe ddZ edddZ d S) CliDetectionResult) pathr-rHalternative_encodingsrTrdrr3r4 unicode_path is_preferredc CsF||_| |_||_||_||_||_||_||_||_| |_ | |_ dS)N) rrr-rHrrTrdrr3r4r) r#rr-rHrrTrdrr3r4rrr$r$r%r&>szCliDetectionResult.__init__c Cs2|j|j|j|j|j|j|j|j|j|j |j d S)N) rr-rHrrTrdrr3r4rr) rr-rHrrTrdrr3r4rr)r#r$r$r%__dict__KszCliDetectionResult.__dict__)r'cCst|jdddS)NT) ensure_asciiindent)rr)r#r$r$r%to_json[szCliDetectionResult.to_jsonN) rmrnror+rrrrqrr&rsrrr$r$r$r%r<s2 r)"r6Zencodings.aliasesrhashlibrjsonrtypingrrrr collectionsr rer r r<Zcharset_normalizer.constantr Zcharset_normalizer.mdr Zcharset_normalizer.utilsrrrrrtr+rqZCoherenceMatchrrZCharsetNormalizerMatchr$r$r$r%s       o< '