U C^7 @sddlmZddlZddlmZmZmZmZmZddl m Z m Z ddl m Z mZmZddlmZddlmZmZddlmZdd lmZmZmZmZmZmZdd lmZmZdd lm Z m!Z!m"Z"dd l#m$Z$dd l%m&Z&m'Z'ddl(m)Z)m*Z*ddl+m,Z,ddl-m.Z.ddl/m0Z0m1Z1m2Z2m3Z3ddl4m5Z5ddl6Z-ddl7m8Z8m9Z9m:Z:m;Z;mZ>ddl?m@Z@mAZAmBZBddlCmDZDddlCmEZFddlEmGZGdZHdZIddZJddZKedd d!ZLd"d#ZMd$d%ZNd&d'ZOd(d)ZPGd*d+d+eZQe.Re5d,d-e.jSe0d.e0d/e0d0e0d1e1d2d3d-e2d4d5d-e1d6d7d-d8d-e3d9e3d:e3d;d< Gd=d>d>eZTd?d@ZUddBdCZVdDdEZWdFdGZXdHdIZYdJdKZZdLdMZ[ddNdOZ\dPdQZ]eddRdSZ^eddTdUZ_dVdWZ`dXdYZae.jSe1d2dZd-d[d-d\Gd]d^d^eZbd_d`ZcdadbZdeddcddZeddfdgZfddhdiZgeddjdkZhddldmZidndoZjeddpdqZkdrdsZlddudvZmGdwdxdxenZoddydzZpdd|d}Zqd~dZre.jSe0de0de1ddd-erdde3ddGdddeZsdddZtdS))unicode_literalsN)ModelMaxoutSoftmaxAffineReLu) ExtractWindowParametricAttention)Poolingsum_pool mean_pool) HashEmbed)ResidualFeatureExtracter) LayerNorm)addlayerizechainclone concatenate with_flatten) with_getitemflatten_add_lengths)uniquedwrapnoop) LinearModel)NumpyOpsCupyOps)get_array_module copy_array)Adam)describe) DimensionSynapsesBiasesGradient)_set_dimensions_if_needed)IDORTHLOWERNORMPREFIXSUFFIXSHAPE)Errors user_warningWarnings)util)ml)_legacy_tok2vecZspacy_pretrained_vectorsFcCsJt|}|j|}|j|}|dks0|dkr4dS||||SdS)Nr)rlinalgnormdot)Zvec1Zvec2xpZnorm1Znorm2r;,/tmp/pip-install-6_kvzl1k/spacy/spacy/_ml.pycosine$s   r=c Ksntdd}tdd}tdd}tdd}td d }td d }t||||||d }||_|j|_|S)N learn_rategMbP?Z optimizer_B1?Z optimizer_B2g+?Z optimizer_eps:0yE>Z L2_penaltygư>Zgrad_norm_clip?)L2beta1beta2eps)r3env_optr! max_grad_normdevice) opscfgr>rCrDrErBrGZ optimizerr;r;r<create_default_optimizer.s      rKr6csJtjjdd|Ddddfdd }j|d}|f|fS) NcSsg|] }t|qSr;len.0seqr;r;r< >sz(_flatten_add_lengths..idtypecsj|dS)Npad unflattenZd_XsgdlengthsrIrVr;r< finish_update@sz+_flatten_add_lengths..finish_updaterU)NrrIasarrayflatten)seqsrVdropr]Xr;r[r<_flatten_add_lengths;s rdcCs.dd}|j||jdk r*|jd|S)Nc_s|jddSNrWfill)selfargskwargsr;r;r<_zero_init_implHsz#_zero_init.._zero_init_implr6)Z on_init_hooksappendrgrhmodelrlr;r;r< _zero_initGs    rpcs"dfdd }t|S)zVWrap a model that should run on CPU, transferring inputs and outputs as necessary.r6cs6jt||d\}t|}dfdd }||fS)Nrbcst|}||dSNrZ_to_cpu)Z d_outputsrZZ cpu_d_outputsbackpropr;r<with_cpu_backpropZsz=with_cpu..with_cpu_forward..with_cpu_backprop)N) begin_updateru _to_device)inputsrbZ cpu_outputsZ gpu_outputsrxrorIrvr<with_cpu_forwardVs z"with_cpu..with_cpu_forward)r6)Zto_cpur)rIror}r;r|r<with_cpuQs r~cCs^t|tjr|St|tr,tdd|DSt|trDdd|DSt|drV|S|SdS)NcSsg|] }t|qSr;rtrOxr;r;r<rQgsz_to_cpu..cSsg|] }t|qSr;rtrr;r;r<rQisget) isinstancenumpyndarraytuplelisthasattrr)rcr;r;r<rucs    rucsJt|tr tfdd|DSt|tr<fdd|DS|SdS)Ncsg|]}t|qSr;rzrrIr;r<rQrsz_to_device..csg|]}t|qSr;rrrr;r<rQts)rrrr_)rIrcr;rr<rzps   rzc@s"eZdZefddZdddZdS)extract_ngramscCst|||_||_dSN)r__init__ ngram_sizeattr)rirrr;r;r<rzs zextract_ngrams.__init__r6c Csg}g}|D]z}||jg}|g}td|jdD]}||j||q4|jj|} |jjj | dd\} } || || q |jj dd|Dt j d} |jj|}|jj |jj|dd}||| fdfS) Nr(T)Z return_countscSsg|]}|jdqS)rshape)rOZarrr;r;r<rQsz/extract_ngrams.begin_update..rSf) to_arrayrrangerrmrIngramsr:runiquer_rint_) ridocsrbZ batch_keysZ batch_valsdocZunigramsrnkeysvalsr\r;r;r<rys$   zextract_ngrams.begin_updateN)r6)__name__ __module__ __qualname__r+rryr;r;r;r<rys rcCs ||Sr) init_weights)rorcyr;r;r<rz Input sizezNumber of featuresz Output sizez Maxout pieceszWeights matrixcCs|j|j|j|jfSr)nFnOnPnIobjr;r;r<rrz Bias vectorcCs |j|jfSr)rrrr;r;r<rrZPadcCsd|j|j|jfSNr()rrrrr;r;r<rrcCs ||dS)NrA) normal_init)MrIr;r;r<rrrgrVb) rrrrrgrrVd_Wd_padd_bc@s<eZdZd ddZdddZddZd d Zed d ZdS)PrecomputableAffineNcKs*tj|f|||_||_||_||_dSr)rrrrrr)rirrrrrkr;r;r<rs zPrecomputableAffine.__init__r6csnjjjjjjjfdd}||jdjjjf} |}dfdd }||fS)NT)Ztrans2rcsX|\}}||\}}|}||jdjjf}j|jdd7_||jdjjf}j d}j j |}|jjjjf}j ||jdjjf|}|}|dj j|||dd|jjjjf}j| d7_|dk r>|jjjjjd||jdjjfS) Nraxis)r(rrr6T)outtrans1)rrr(rkey)_backprop_paddingreshaperrrrsumrrrgZ transposerIr:ascontiguousarraygemmrhr_memweightsgradientid)ZdY_idsrZdYidsZXfZWopfiZdXfZdWopfircrir;r<backwards$ &  z2PrecomputableAffine.begin_update..backward)N) rIrrgrrrrrr _add_padding)rircrbYfrr;rr<rys z PrecomputableAffine.begin_updatecCs|jj|j|f}|Sr)rIr:ZvstackrV)rirZ Yf_paddedr;r;r<rsz PrecomputableAffine._add_paddingcCsL|dk}|jdd}|||jdddf}|j|jdd7_||fS)Nr6r(rr)rrrr)rirrmaskrr;r;r<rs  z%PrecomputableAffine._backprop_paddingc sLjddkrdSj}|j}|jjjjdd|jdjfdd}||j d d |j 7}|j |d d}|jdjfdd}||jj dd |j d |j 7}fdd}d}d}d}d } t|D]z} |||} jj| } jj| } t| d |kr"jjj| _qt| |kr@j| 8_qqHqdS)aThis is like the 'layer sequential unit variance', but instead of taking the actual inputs, we randomly generate whitened data. Why's this all so complicated? We have a huge number of inputs, and the maxout unit makes guessing the dynamics tricky. Instead we set the maxout weights to values that empirically result in whitened outputs given whitened inputs. rr6NTZinplacerrSrirRrA)locZscalesizecs|dd}jj|jdjjfdd}||jdjjjf}j|||||jdjjf}|j 7}j |}jdkrj |dS||dkSdS)NrrrSr) rIallocaterrrrrZ scatter_addr`rr_Zmaxout)rtokvecsZhiddensvectorsror;r<predicts"   z1PrecomputableAffine.init_weights..predictg{Gz? )rgrrIr:rrrrrandomuniformrr_normalrrrvarmeanabssqrtr) rorIr:rrrZtol_varZtol_meanZt_maxZt_iZacts1rrr;rr<rs4    z PrecomputableAffine.init_weights)NNNN)r6) rrrrryrr staticmethodrr;r;r;r<rs   #rcCs|j}|jdkr8t|_|jjdkr8ttjj|jj dt j }|D]&}|j |j krb|j |j |_qBd|_qB||j}|j|jf}|tjjjkrtjjj|j |j kr|j}|jd|j d}ttjj||d||_|j|jf}|tjjj|<dS)Nrrz_%d)oldnew)rname VECTORS_KEYdatarr1r2ZW020formatrrrIorthkey2rowZrankr_rHthincextraZload_nlpZVECTORSZW019)vocabrrIwordrrZold_namenew_namer;r;r<link_vectors_to_modelss(      r皙?cCsXddl}ddlm}ddlm}|dkr2ttS|jj||d|d|d}|||S)Nr)with_square_sequences)PyTorchWrapperRNNrT) bidirectionaldropout) Ztorch.nn thinc.apirZthinc.extra.wrappersrrrnnZLSTM)rrdepthrZtorchrrror;r;r< PyTorchBiLSTM5s   rc Kststj||f|S|dd}|dd}|dd}|dd}|dd }|d d }|d d } ddddddg} dd| id} |rdddd|dddddd} nHd||| |dd|ddddd} |rd||| dd d| d!d"<|d#krd$|| ||d%d} nd&|| |d'd} d(||d)d}|d krP|d krPi}n<|d krx|d krxd*d+| |gid}n|d kr| }n|}| | |d,}t|S)-Npretrained_vectorscnn_maxout_piecesrsubword_featuresT char_embedF conv_depth bilstm_depthr conv_windowr(r)r,r-r.r/r*zspacy.Doc2Feats.v1columns)archconfigzspacy.CharacterEmbed.v1@zspacy.LayerNormalizedMaxout.v1)widthpieces)rchars@mixz@embed_featureszspacy.MultiHashEmbed.v1)rZrowsrZ use_subwords@pretrained_vectorsrzspacy.PretrainedVectors.v1)Z vectors_namercolumnrrrzspacy.MaxoutWindowEncoder.v1)r window_sizerrzspacy.MishWindowEncoder.v1)rrrzspacy.TorchBiLSTMEncoder.v1)rrzthinc.FeedForward.v1children)z @doc2featsz@embedz@encode)USE_MODEL_REGISTRY_TOK2VECr5Tok2Vecrindexnew_ml)r embed_sizerkrrrrrrrcolsZ doc2feats_cfgZ embed_cfgZcnn_cfgZ bilstm_cfgZ encode_cfgrr;r;r<r @s             r csdfdd }t|S)Nr6csHgtD]$}j||d\}}|}|q dfdd }||fS)Nrqcs8d}tD]&}|||d}|dkr*|}q ||7}q |Srr)reversed)rrZdXrwZ backpropsr;r< reapply_bwds   z1reapply..reapply_fwd..reapply_bwd)N)rryrm)rcrbrRYrwrlayern_timesrr< reapply_fwds   zreapply..reapply_fwd)r6)r)rrrr;rr<reapplysrcsdfdd }t|S)Nr6csj|ddfS)NrS)r_rcrbrTrIr;r<forwardszasarray..forward)r6r)rIrTrr;rr<r_sr_cCs8g}d}|t|kr4|||||||7}q|Sre)rMrm)rcrpartsr r;r;r< _divide_arrays   rcs0dkrttjjddfdd }t|S)Nr)valuer6csRttjrtntjjddfjd}dfdd }||fS)NrScs(j}|ddf|7<|Sr)rr)rrZr)rcidxrIr;r<rs z*get_col..forward..backward)N)rrrrrr:rrT)rcrboutputrr)rcrIr<rs  zget_col..forward)r6) IndexErrorr0ZE066rr)rrr;r!r<get_colsr#cs8dkrttttttgdfdd }t|}|_|S)Nr6cs&g}|D]}||q|dfSr)rmr)rrbZfeatsrr r;r<rszdoc2feats..forward)r6)r)r,r-r.r/r*rr )r rror;r$r< doc2featss r%cCsddd}t|S)Nr6cSs |ddfS)Nc[s|Srr;)rrkr;r;r<rrz.print_shape..forward..r;rr;r;r<rszprint_shape..forward)r6r)prefixrr;r;r< print_shapes r'cs |\}}dfdd }||fS)Ncs|fSrr;Zd_outputrZtokensr;r<rsz#get_token_vectors..backward)Nr;)Ztokens_attrs_vectorsrbattrsrrr;r)r<get_token_vectorss r,csdt|}t||js||}||d|}||d|}dd|| dfdd }|fS)Ng$@g$rAcs|d}|Srr;)rrZrrr;r< logistic_bwdszlogistic..logistic_bwd)N)rrrr_Zminimummaximumexp)rcrbr:r.r;r-r<logistics  r1cCsdd}|j||S)NcSs|jddSrerf)rircrr;r;r<rlsz"zero_init.._zero_init_impl)Z on_data_hooksrmrnr;r;r< zero_inits r2csdfdd }t|S)Nr6cs |dfSrr;rrRr;r< getitem_fwd szgetitem..getitem_fwd)r6r)rRr4r;r3r<getitem sr5cCs |j|jfSr)rrrr;r;r<rrcCsdSrr;rgrIr;r;r<rr)rgc@s0eZdZdZdZd ddZddZd d d ZdS) MultiSoftmaxaANeural network layer that predicts several multi-class attributes at once. For instance, we might predict one class with 6 variables, and another with 5. We predict the 11 neurons required for this, and then softmax them such that columns 0-6 make a probability distribution and coumns 6-11 make another. Z multisoftmaxNcKs(tj|f|||_t||_||_dSr)rr out_sizesrrr)rir8rrkr;r;r<rs zMultiSoftmax.__init__cCsT|j|j|j|}d}|jD]0}|jj|dd|||fdd||7}q|S)NrTr)rIZaffinergrr8softmax)ri input__BI output__BOrRZout_sizer;r;r<r$s  $ zMultiSoftmax.predictr6cs"}dfdd }||fS)Ncsfjjj|dd7_j|jdd7_j|j}|dk rb|jjjjj d|S)NT)rrrr) rrIrrrrgrrrr)Zgrad__BOrZZgrad__BIr:rir;r<r]/s z0MultiSoftmax.begin_update..finish_update)N)r)rir:rbr;r]r;r<r<ry,s zMultiSoftmax.begin_update)N)r6)rrr__doc__rrrryr;r;r;r<r7s  r7c Kstdd}d|kr|d}n tdd}|d}|dd}tttd>d |krd|d }nt||||d }tt ||}||?}W5QRXd|_ ||_ ||_ |S) Nr itoken_vector_width`rrT)>>+tok2vec)rr) r3rFrrdefine_operatorsrrr rrrrBr9) nr_classrJr r>rrrBr9ror;r;r<build_tagger_model:s*      rEc Kstdd}d|kr|d}n tdd}|d}|dd}ttttdDd |krf|d }nt||||d }t t ||}||_ ||?}W5QRXd|_ ||_ ||_|S) Nr iXr>rrT)r@rA**rB)rr)r3rFrrrCrrrr rr7r8rrBr9) Z class_numsrJr r>rrrBr9ror;r;r<build_morphologizer_modelTs,      rHcCsg}|D]p}tjt|fdd}t|D]6\}}|j|jjjkrV|jjj|j||<q(d||<q(|jjj|}| |q|dfS)NrRrSr) rzerosrM enumeraterrrrrrm)rrbbatchrindicesrRrrr;r;r< SpacyVectorsos  rMrc Ksh|dd}|dd}|dd}tttttd|dr|rtt?t dt ||?t |?t t ?tt||d?tt ||d d ?t?}|W5QRSt||d d }t|d|dd }t|d|d d } t|d|dd } tttttttgtt||B| B| Btt|||dd ?dd ?} |rdttt ||?} t| | } |d}n | } |}d} | ttt||ttd dtt||d ?|?|d?}|t?t |?t t ?ttt||?tt ||d d ?}t ||dd dd}|drt!||d}ntt ||dd d t?}||B|?}t|t"|_#W5QRX||_$d|_%|S)Nrr nr_vectorrpretrained_dimsr)r@rA|rGZlow_datar6 drop_factorr()rrr)ZnWrUrF)rexclusive_classesrS)&rrrCrrrrrMrrrr r r rrr2r1r rr*r+r-r.r/r)rrLNrconcatenate_listsrbuild_bow_text_classifierrr`rBrZlsuv)rDrrJrrNrOrolowerr&suffixrZtrained_vectorsZstatic_vectorsrZ vectors_widthrBZ cnn_modelZ linear_model output_layerr;r;r<build_text_classifier~s        "    rZc KsTtdti6ttjt|tdt|?}|s@||r:tnt ?}W5QRX||_ |S)Nr@)r) rrCrr~rIrr*r cpu_softmaxr1r)rDrrSZno_output_layerrJror;r;r<rVsrVcCst}ddd}|||fS)NcSs|Srr;)rrZr;r;r<cpu_softmax_backwardsz)cpu_softmax..cpu_softmax_backward)N)rr9)rcrbrIr\r;r;r<r[s r[c KsntdtiD|r"t||j}ntt||jddt?}|t?t t ?|?}W5QRXt|t |_ ||_|S)a1 Build a simple CNN text classifier, given a token-to-vector model as inputs. If exclusive_classes=True, a softmax non-linearity is applied, so that the outputs sum to 1. If exclusive_classes=False, a logistic non-linearity is applied instead, so that outputs are in the range [0, 1]. r@r6rQ) rrCrrrr2rr1rr r r`rB)rBrDrSrJrYror;r;r< build_simple_cnn_text_classifiers r]c Ksd|krttjjdd|dd}|dd}|dd}|d}tttd\t ||||d |d d }|t ?t t ?t tt||?tt||d d ?} || _|| _W5QRX| S)NZ entity_width)paramrrrrr)r@rGTr)rr rrrrrr6rQ) ValueErrorr0ZE144rrrrCrrr rr r rr2rrrBr) Z embed_widthZ hidden_widthZ ner_typesrJrrrZ context_widthrBror;r;r<build_nel_encoders:     r`csDtjjdd|Dddd fdd }j|dd}||fS) NcSsg|] }t|qSr;rLrNr;r;r<rQszflatten..rRrScsj|ddS)NrrUrWrYr\rIr;r<r]szflatten..finish_updaterrU)Nr^)rarbr]rcr;rar<r`s r`csV|s tS|dd|djdd|D}t|d fdd }t|}|S) zCompose two or more models `f`, `g`, etc, such that their outputs are concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))` rRrArcSsg|]}t|tqSr;)rr`)rOrr;r;r<rQ)sz%concatenate_lists..r6cs^|dk r|9}jdd|Ddd}j||d\}||}dfdd }||fS) NcSsg|] }t|qSr;rL)rOrcr;r;r<rQ/szDconcatenate_lists..concatenate_lists_fwd..rRrSrqcs||dSrr)r`)Zd_ysrZ) bp_flat_yrIr;r<concatenate_lists_bwd3szOconcatenate_lists..concatenate_lists_fwd..concatenate_lists_bwd)N)r_ryrX)ZXsrbr\Zflat_yZysrcconcatrRrI)rbr<concatenate_lists_fwd,s z0concatenate_lists..concatenate_lists_fwd)r6)rrrIrr)Zlayersrkrfror;rdr<rU!s   rU333333?cs$t|dfdd }t|S)z7Convert a model into a BERT-style masked language modelr6csXt|d\}jjddfj||d\}dfdd }||fS)N) mask_probrr(rqcs|d9}||dS)Nr(rsr;r(rwrr;r< mlm_backwardFs z@masked_language_model..mlm_forward..mlm_backward)N) _apply_maskrIr_rrry)rrbr rjrhro random_wordsrir< mlm_forwardAs z*masked_language_model..mlm_forward)r6) _RandomWordsr)rrorhrnr;rlr<masked_language_model<s rpc@seZdZddZddZdS)rocCsvdd|D|_dd|D|_|jdd|_|jdd|_ttj|jdd|_|j|j_g|_dS)NcSsg|]}|jdkr|jqSr6)probtextrOlexr;r;r<rQQs z)_RandomWords.__init__..cSsg|]}|jdkr|jqSrq)rrrtr;r;r<rQRs 'rrS)wordsprobsrr0arrayr_cache)rirr;r;r<rPsz_RandomWords.__init__cCs<|js(|jtjjt|jd|jd|j}|j|S)Nrv)p) rzextendrrchoicerMrwrxpop)rir r;r;r<nextYs  z_RandomWords.nextN)rrrrrr;r;r;r<roOs roc Csddlm}tdd|D}tjdd|f}||k}d}g}|D]d}g} |D]2} ||sjt| j|} n| j} | | |d7}qPdd |D} |||j | | d qD||fS) Nr()Doccss|]}t|VqdSrrL)rOrr;r;r< fsz_apply_mask..r6rArcSsg|]}t|jqSr;)boolZ whitespace_)rOwr;r;r<rQtsz_apply_mask..)rwspaces) Z tokens.docrrrrr _replace_wordrsrmr) rrmrhrNrrRZ masked_docsrrwtokenrrr;r;r<rkbs"   rk[MASK]cCs.tj}|dkr|S|dkr&|S|SdS)Ng?r?)rrr)rrmrZrollr;r;r<r~s  rcsfdd}|S)Ncst||jj|jdSr)r r:rrrr6hilor;r<wrappedsz_uniform_init..wrappedr;)rrrr;rr< _uniform_initsrzVector dimensionszNumber of characters per wordz Embed matrixcCs|j|j|jfSr)nCnVnMrr;r;r<rrgg?r)rrr d_vectorsc@s8eZdZd ddZeddZeddZd d d ZdS) CharacterEmbedNcKstj|f|||_||_dSr)rrrr)rirrrkr;r;r<rszCharacterEmbed.__init__cCs |j|jSr)rrrir;r;r<rszCharacterEmbed.nOcCsdS)Nr;rr;r;r<rszCharacterEmbed.nVr6c s|sgSgg}j}jjj|D]t}|jjd}jt|jjf}||ddff|ddf<| | t|j f |q*dfdd }||fS)N)Znr_charcsj}t|D]L\}}|t|jjf}||ddff|ddf7<q|dk r~|jjjjj ddS)Nr) rziprrMrrrrrr)rrZrdoc_idsZ d_doc_vectorsrZnCvrir;r<backprop_character_embeds.z=CharacterEmbed.begin_update..backprop_character_embed)N) rrIr:ZarangerZ to_utf8_arrayrrMrrmrr) rirrbr rrrZ doc_vectorsrr;rr<rys$  zCharacterEmbed.begin_update)NN)r6)rrrrpropertyrrryr;r;r;r<rs    rc Cst|}|r"||jdddk}|d}|d}|jj|ddd}|jj|ddd}||}||jddd|}|||||d} ||d} |rd| |<d| |<| } | | fS)Nr(rrr@T)rZkeepdimsr)rrrr7r8) Zyhr ignore_zerosr:Z zero_indicesZnorm_yhZnorm_yZ mul_normsr=Zd_yhZlossesZlossr;r;r<get_cossim_losss r)rr6)r)N)r6)r6)r6)r)r(FF)r6)F)r6)rg)rg)r)F)u __future__rrZ thinc.v2vrrrrrZ thinc.t2trr Z thinc.t2vr r r Z thinc.i2vr Z thinc.miscrrrrTrrrrrrrrrrrrZthinc.linear.linearrZthinc.neural.opsrrZthinc.neural.utilrr Zthinc.neural.optimizersr!rr"Zthinc.describer#r$r%r&Zthinc.neural._classes.affiner'Zthinc.extra.load_nlpr+r)r*r+r,r-r.r/errorsr0r1r2r3r4r r5rrr=rKrdrpr~rurzrZon_data attributesrrrr rr_rr#r%r'r,r1r2r5r7rErHrMrZrVr[r]r`r`rUrpobjectrorkrrrrr;r;r;r<s        $           p V    %  K  "    .