U C^=d@sddlmZmZddlZddlZddlZddlmZddlZddl m Z ddl m Z ddl mZddlZddlZddlZddlZddlZddlZz ddlZWnek rdZYnXz ddlZWnek rdZYnXddlmZdd lmZmZmZmZm Z dd lm!Z!dd l"m#Z#m$Z$m%Z%ee&j'd a(d a)Gddde*Z+ddZ,ddZ-ddZ.ddZ/d}ddZ0ddZ1ddZ2dd Z3d!d"Z4d#d$Z5d%d&Z6d'd(Z7d)d*Z8d~d+d,Z9d-d.Z:d/d0Z;d1d2Zd7d8Z?dd9d:Z@d;d<ZAdd=d>ZBd?d@ZCdAdBZDdCdDZEdEdFZFdGdHZGdIdJZHdKdLZIdMdNZJddOdPZKddRdSZLdTdUZMdVdWZNdXdYZOdePfdZd[ZQdd]d^ZRd_d`ZSdadbZTdcddZUdedfZVdgdhZWdidjZXdkdlZYdmdnZZddodpZ[dqdrZ\dsdtZ]dudvZ^dwdxZ_Gdydzdze`ZaGd{d|d|e*ZbdS))unicode_literalsprint_functionN)Path) OrderedDict)Model)NumpyOpsORTH)cupy CudaStreampath2str basestring_unicode_) import_file)ErrorsWarningsdeprecation_warningdataFc@s\eZdZejddddZejddddZejddddZejddddZejddddZ d S) registryspacy languagesT) entry_points architectureslookups factoriesdisplacy_colorsN) __name__ __module__ __qualname__ cataloguecreaterrrrrr"r"-/tmp/pip-install-6_kvzl1k/spacy/spacy/util.pyr's rcCs|adSN) _PRINT_ENVvaluer"r"r# set_env_log/sr(cCs |tjkS)aCheck whether a Language class is already loaded. Language classes are loaded lazily, to avoid expensive setup code associated with the language data. lang (unicode): Two-letter language code, e.g. 'en'. RETURNS (bool): Whether a Language class has been loaded. )rr)langr"r"r#lang_class_is_loaded4sr*c Cs|tjkrtj|Sztd|d}Wn6tk r`}zttjj||dW5d}~XYnXt |t ||j dtj|S)zImport and load a Language class. lang (unicode): Two-letter language code, e.g. 'en'. RETURNS (Language): Language class. z.lang.%sr)r)errNr) rrget importlib import_module ImportErrorrZE048formatset_lang_classgetattr__all__)r)moduler+r"r"r#get_lang_class?s  &r5cCstjj||ddS)zSet a custom Language class name that can be loaded via get_lang_class. name (unicode): Name of Language class. cls (Language): Language class. )funcN)rrregister)nameclsr"r"r#r1Qsr1TcCs|stStrtSdSdS)zGet path to spaCy data directory. require_exists (bool): Only return path if it exists, otherwise None. RETURNS (Path or None): Data path or None. N) _data_pathexists)Zrequire_existsr"r"r# get_data_pathZsr<cCs t|adS)z_Set path to spaCy data directory. path (unicode or Path): Path to new data directory. N) ensure_pathr:pathr"r"r# set_data_pathfsr@cCstj|d}||dS)Narchconfig)rrr,)Z arch_configZ arch_funcr"r"r# make_layerosrCcCst|trt|S|SdS)zEnsure string is converted to a Path. path: Anything. If string, it's converted to Path. RETURNS: Path or original argument. N) isinstancerrr>r"r"r#r=ts r=cCsVt|}|rt|S||jd}|rN) r=r;srsly read_jsonZ with_suffixsuffixZread_gzip_json ValueErrorrZE160r0r r>r"r"r#load_language_datas  rIcCs4t|ds ttjjt|dttj|j j j S)Nrr4) hasattrrHrZE169r0reprrsysmodulesr__file__parentrJr"r"r#get_module_paths rQcKst}|r|s(ttjjt|dt|tr|t dd| DkrXt |f|St |rlt |f|St|rtt|f|Snt|drt|f|Sttjj|ddS)aLoad a model from a shortcut link, package or data path. name (unicode): Package name, shortcut link or model path. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with the loaded model. r>cSsg|] }|jqSr"r8).0dr"r"r# szload_model..r;rRN)r<r;IOErrorrZE049r0r rDrsetiterdirload_model_from_link is_packageload_model_from_packagerload_model_from_pathrKZE050)r8 overrides data_pathr"r"r# load_models       r_cKsPt|d}zt||}Wn&tk rBttjj|dYnX|jf|S)zCLoad a model from a shortcut link, or directory in spaCy data path.z __init__.pyrR)r<rAttributeErrorrVrZE051r0load)r8r]r?r9r"r"r#rYs rYcKst|}|jf|S)z'Load a model from an installed package.)r-r.ra)r8r]r9r"r"r#r[s r[c Ks|s t|}|d|d}t|}|fd|i|}|dg}|di}|dg}|dkrl|jj}n |dkrxg}|D]H} | |kr||d i| i} || | } |j| | d } |j| | d q|||S) zLoad a model from a data directory path. Creates Language class with pipeline from meta.json and then calls from_disk() with path.Z lang_factoryr)metapipelinerdisableT)FNZ pipeline_args)rBrR)get_model_metar,r5ZDefaultsZ pipe_namesZ create_pipeZadd_pipe from_disk) model_pathrbr]r)r9Znlprcrrdr8rBfactory componentr"r"r#r\s&     r\cKs`t|j}t|}d|d|d|df}||}|sRttjjt|dt ||f|S)a&Helper function to use in the `load()` method of a model package's __init__.py. init_file (unicode): Path to model's __init__.py, i.e. `__file__`. **overrides: Specific overrides, like pipeline components to disable. RETURNS (Language): `Language` class with loaded model. z%s_%s-%sr)r8versionr>) rrPrer;rVrE052r0r r\)Z init_filer]rgrbdata_dirr^r"r"r#load_model_from_init_pys rmcCst|}|s&ttjjt|d|d}|sHttjj|dt |}dD]&}||ksj||sVt tj j|dqV|S)zGet model meta.json from a directory path and validate its contents. path (unicode or Path): Path to model directory. RETURNS (dict): The model's meta data. r>z meta.json)r)r8rj)setting) r=r;rVrrkr0r is_fileZE053rErFrHZE054)r?rg meta_pathrbrnr"r"r#res recCsDddl}|}|jj}|D]}|dd|kr dSq dS)zCheck if string maps to a package installed via pip. name (unicode): Name of package. RETURNS (bool): True if installed package, False if not. rN-_TF) pkg_resourceslower working_setby_keykeysreplace)r8rspackagespackager"r"r#rZs rZcCs|}t|}t|jjS)z|Get the path to an installed package. name (unicode): Package name. RETURNS (Path): Path to installed package. )rtr-r.rrOrP)r8pkgr"r"r#get_package_path s r|cCs8ztjj}|dkrWdSWntk r2YdSXdS)zCheck if user is running spaCy from a Jupyter notebook by detecting the IPython kernel. Mainly used for the displaCy visualizer. RETURNS (bool): True if in Jupyter, False if not. ZZMQInteractiveShellTF)Z get_ipython __class__r NameError)shellr"r"r# is_in_jupyters  rcCsFt|dr|jSt|dr |jSt|dr>t|jdr>|jjSt|S)Nr8rr})rKr8rr}rL)rir"r"r#get_component_name&s  rcCs*tdkr dSttjtrdSt|dSdS)N) non_blocking)r rDropsr)requirerr"r"r#get_cuda_stream0s  rcCs6tdkr |Stj|jd|jd}|j||d|SdS)NC)orderdtype)stream)r ZndarrayshaperrW)rZ numpy_arrayarrayr"r"r# get_async9s rcCst|tkrt}nt}d|tjkrb|tjd|}tr^t|dt|dd||S|tjkr|tj|}trt|dt|dd||Strt|dt|d|SdS)NZSPACY_=Zviaz$SPACY_$z by default) typefloatintupperosenvironr%printrL)r8defaultZ type_convertr'r"r"r#env_optBs   rc CsLt|}|jdd}|d}W5QRXddd|D}t|S)Nutf8)encoding |cSs"g|]}|rdt|qS^stripreescaperSZpiecer"r"r#rU\szread_regex..)r=openreadsplitjoinrcompile)r?file_entries expressionr"r"r# read_regexWs rcCsHd|kr&ddd|D}t|Sddd|D}t|SdS)zCompile a sequence of prefix rules into a regex object. entries (tuple): The prefix rules, e.g. spacy.lang.punctuation.TOKENIZER_PREFIXES. RETURNS (regex object): The regex object. to be used for Tokenizer.prefix_search. (rcSs"g|]}|rdt|qSrrrr"r"r#rUjsz(compile_prefix_regex..cSsg|]}|rd|qSrrrr"r"r#rUnsNrrrrrr"r"r#compile_prefix_regexas  rcCsddd|D}t|S)zCompile a sequence of suffix rules into a regex object. entries (tuple): The suffix rules, e.g. spacy.lang.punctuation.TOKENIZER_SUFFIXES. RETURNS (regex object): The regex object. to be used for Tokenizer.suffix_search. rcSsg|]}|r|dqS)rrrr"r"r#rUxsz(compile_suffix_regex..rrr"r"r#compile_suffix_regexrsrcCsddd|D}t|S)zCompile a sequence of infix rules into a regex object. entries (tuple): The infix rules, e.g. spacy.lang.punctuation.TOKENIZER_INFIXES. RETURNS (regex object): The regex object. to be used for Tokenizer.infix_finditer. rcSsg|]}|r|qSr"rrr"r"r#rUsz'compile_infix_regex..rrr"r"r#compile_infix_regex|srcGstt||S)aQExtend an attribute function with special cases. If a word is in the lookups, the value is returned. Otherwise the previous function is used. default_func (callable): The default function to execute. *lookups (dict): Lookup dictionary mapping string to attribute value. RETURNS (callable): Lexical attribute getter. ) functoolspartial_get_attr_unless_lookup) default_funcrr"r"r# add_lookupss rcCs&|D]}||kr||Sq||Sr$r")rrstringlookupr"r"r#rsrcGst|}|D]v}|D]^\}}tdd|DsFttjj||dddd|D}||krttjj||dq| |q t |dd}|S)zUpdate and validate tokenizer exceptions. Will overwrite exceptions. base_exceptions (dict): Base exceptions. *addition_dicts (dict): Exceptions to add to the base dict, in order. RETURNS (dict): Combined tokenizer exceptions. css|]}t|ttVqdSr$)rDr rrSattrr"r"r# szupdate_exc..)keyZorthscss|]}|tVqdSr$r rr"r"r#rs'u’) dictitemsallrHrZE055r0rZE056update expand_exc)Zbase_exceptionsZaddition_dictsexcZ additionsZorthZ token_attrsZdescribed_orthr"r"r# update_excs  rcsXddt|}|D]:\}}|kr|}fdd|D}|||<q|S)aHFind string in tokenizer exceptions, duplicate entry and replace string. For example, to add additional versions with typographic apostrophes. excs (dict): Tokenizer exceptions. search (unicode): String to find and replace. replace (unicode): Replacement. RETURNS (dict): Combined tokenizer exceptions. cSs t|}|t|||t<|Sr$)rr rx)tokensearchrxZfixedr"r"r# _fix_tokenszexpand_exc.._fix_tokencsg|]}|qSr"r")rStrrxrr"r#rUszexpand_exc..)rrrx)ZexcsrrxZnew_excsZ token_stringtokensZnew_keyZ new_valuer"rr#rs   rcCs~|dks|dksttj|dkr(d}n|dkr8||7}t|td|}|dkrV|}n|dkrf||7}t|t||}||fS)Nrr)rHrZE057minmax)lengthstartstopstepr"r"r#normalize_slices rccs\t|trt|}n|}t|}t|}tt|t|}t|dkrLqXt|Vq"dS)zlIterate over batches of items. `size` may be an iterator, so that batch-size can vary on each step. rN) rDr itertoolsrepeatiternextlistislicelen)rsizesize_ batch_sizebatchr"r"r# minibatchs   rc#s.fdd}t}||V||9}qdS)aZYield an infinite series of compounding values. Each time the generator is called, a value is produced by multiplying the previous value by the compound rate. EXAMPLE: >>> sizes = compounding(1., 10., 1.5) >>> assert next(sizes) == 1. >>> assert next(sizes) == 1 * 1.5 >>> assert next(sizes) == 1.5 * 1.5 cskrt|St|Sr$rrr&rrr"r#clipszcompounding..clipNr)rrZcompoundrcurrr"rr# compoundings  rc#s6fdd}t}||V||7}qdS)aYield an infinite series of values that step from a start value to a final value over some number of steps. Each step is (stop-start)/steps. After the final value is reached, the generator continues yielding that value. EXAMPLE: >>> sizes = stepping(1., 200., 100) >>> assert next(sizes) == 1. >>> assert next(sizes) == 1 * (200.-1.) / 100 >>> assert next(sizes) == 1 + (200.-1.) / 100 + (200.-1.) / 100 cskrt|St|Sr$rr&rr"r#r szstepping..clipNr)rrZstepsrrr"rr#steppings rccs"t|}t||V||8}qdS)z5Yield an infinite series of linearly decaying values.N)rr)rrZdecayrr"r"r#decayings rc cst|trt|}n|}t|}t|}g}|dkrz|rJt|\}}nt|}Wn tk rt|rn|VYdSX|||8}|r|||fq.||q.|r"|Vq"dS)z.Create minibatches of a given number of words.rN)rDrrrrr StopIterationappend) rrZtuplesZ count_wordsrrrdocZgoldr"r"r#minibatch_by_wordss*     rccst|}g}zfttd|t|D]}|t|q&t|ttd|D]}|rh|VqTqqTqWn2t k rt||r|Vqt YnXdS)uShuffle an iterator. This works by holding `bufsize` items back and yielding them sometime later. Obviously, this is not unbiased – but should be good enough for batching. Larger bufsize means less bias. From https://gist.github.com/andres-erbsen/1307752 iterable (iterable): Iterator to shuffle. bufsize (int): Items to hold back. YIELDS (iterable): The shuffled iterator. rN) rrangerandomrandintrrrshufflepopr)iterablebufsizebufir"r"r# itershuffle9s      rcCstdd}t||dd}g}t}|D]:}|j|krJ|jd|krJ|||t|j|jq$t|ddd}|S)atFilter a sequence of spans and remove duplicates or overlaps. Useful for creating named entities (where one token can only be part of one entity) or when merging spans with `Retokenizer.merge`. When spans overlap, the (first) longest span is preferred over shorter spans. spans (iterable): The spans to filter. RETURNS (list): The filtered spans. cSs|j|j|j fSr$)endrspanr"r"r#_zfilter_spans..T)rreversercSs|jSr$)rrr"r"r#rhrr)sortedrWrrrrr)ZspansZ get_sort_keyZ sorted_spansresultZ seen_tokensrr"r"r# filter_spansVs  rcCs>t}|D]$\}}|dd|kr|||<qt|SN.r)rrrrEZ msgpack_dumps)ZgettersexcludeZ serializedrgetterr"r"r#to_bytesls  rcCsFt|}|D].\}}|dd|kr||kr|||q|Sr)rEZ msgpack_loadsrr) bytes_dataZsettersrmsgrsetterr"r"r# from_bytesus  r cCsLt|}|s||D]&\}}|dd|kr |||q |Sr)r=r;mkdirrr)r?Zwritersrrwriterr"r"r#to_disk~sr cCs<t|}|D]&\}}|dd|kr|||q|Sr)r=rr)r?Zreadersrrreaderr"r"r#rfs rfcCs|ddddS)zPerform a template-specific, rudimentary HTML minification for displaCy. Disclaimer: NOT a general-purpose solution, only removes indentation and newlines. html (unicode): Markup to minify. RETURNS (unicode): "Minified" HTML. z rr)rrx)htmlr"r"r# minify_htmlsrcCs4|dd}|dd}|dd}|dd}|S) zReplace <, >, &, " with their HTML encoded representation. Intended to prevent HTML errors in rendered displaCy markup. text (unicode): The original text. RETURNS (unicode): Equivalent text to be safely used within HTML. &z&z>"z")rx)textr"r"r# escape_htmls     rcCsXz ddl}Wntk r"YdSXddlm}|jj|}||t_ |t_ |S)Nr)CupyOps) Zcupy.cuda.devicer/thinc.neural.opsrZcudadeviceZDeviceZuserrZOps)Zgpu_idr rrr"r"r#use_gpus  rcCs.t|tj|tdk r*tj|dSr$)rseednumpyr )rr"r"r#fix_random_seeds  rcCstdkrttjt|Sr$) jsonschemarHrZE136ZDraft4Validator)schemar"r"r#get_json_validators r cCst|}||dS)zHValidate a given schema. This just checks if the schema itself is valid.N)r Z check_schema)r validatorr"r"r#validate_schemasr"cCsg}t||dddD]n}|jrBdddd|jD}nd}|jd |}|jr~d d|jD}|d d|7}||q|S) zValidate data against a given JSON schema (see https://json-schema.org). data: JSON-serializable data to validate. validator (jsonschema.DraftXValidator): The validator. RETURNS (list): A list of error messages, if available. cSs|jSr$r>)er"r"r#rrzvalidate_json..rz[{}]z -> cSsg|] }t|qSr")str)rSpr"r"r#rUsz!validate_json..r cSsg|]}d|jqS)z - {})r0message)rSZsuberrr"r"r#rUsz: {})rZ iter_errorsr?r0rr'contextr)rr!errorsr+Zerr_pathrZsuberrsr"r"r# validate_jsons r*cCs~t|}dd|D}|D]Z\}}|dkrT|dkrTttjj|d||q|dd|krtt j j|dq|S)zHelper function to validate serialization args and manage transition from keyword arguments (pre v2.1) to exclude argument. cSsg|]}|ddqS)rr)r)rSr8r"r"r#rUsz-get_serialization_exclude..)ZvocabF)argrr) rrrrZW015r0rrrHrZE128)Z serializersrkwargsoptionsrr'r"r"r#get_serialization_excludes r.c@s*eZdZdZddZd ddZddZdS) SimpleFrozenDictzSimplified implementation of a frozen dict, mainly used as default function or method argument (for arguments that should default to empty dictionary). Will raise an error if user or spaCy attempts to add to dict. cCsttjdSr$NotImplementedErrorrZE095)selfrr'r"r"r# __setitem__szSimpleFrozenDict.__setitem__NcCsttjdSr$r0)r2rrr"r"r#rszSimpleFrozenDict.popcCsttjdSr$r0)r2otherr"r"r#rszSimpleFrozenDict.update)N)rrr__doc__r3rrr"r"r"r#r/s r/c@s,eZdZddZddZddZddZd S) DummyTokenizercKsdS)Nrr")r2r,r"r"r#r szDummyTokenizer.to_bytescKs|Sr$r")r2Z _bytes_datar,r"r"r#r  szDummyTokenizer.from_bytescKsdSr$r"r2_pathr,r"r"r#r szDummyTokenizer.to_diskcKs|Sr$r"r7r"r"r#rfszDummyTokenizer.from_diskN)rrrrr r rfr"r"r"r#r6sr6)T)F)FT)N)N)r)r)r)c __future__rrrr-rpathlibrr collectionsrZthinc.neural._classes.modelrrrrrZ numpy.randomrrEr rMrr/Z cupy.randomr symbolsr compatr r rrrr)rrrrOrPr:r%objectrr(r*r5r1r<r@rCr=rIrQr_rYr[r\rmrerZr|rrrrrrrrrrrrrrrrrrrrrrrr r rfrrrrr r"r*r.rr/r6r"r"r"r#s