from __future__ import unicode_literals, print_function

from .model import Model
from .affine import Affine
from ...api import with_reshape, layerize, wrap
from ..util import get_array_module
import numpy as np
import copy
import math


def prepare_self_attention(affine, window=None, nM=300, nH=6):
    nD = nM // nH
    get_mask = window_mask(window) if window is not None else None

    def qkv_sa_forward(Xs, drop=0.0):
        # Project the flattened batch to the concatenated (Q, K, V) space,
        # then split back into per-sequence, per-head arrays.
        X = affine.ops.flatten(Xs)
        lengths = [len(x) for x in Xs]
        QKV, get_dX = affine.begin_update(X, drop=drop)
        Qs, Ks, Vs = _split_seqs(QKV, lengths, nH, nD)

        def qkv_sa_backward(dQs_dKs_dVs, sgd=None):
            dQs, dKs, dVs = dQs_dKs_dVs
            dQKV = _join_seqs(dQs, dKs, dVs, nH, nD)
            dX = get_dX(dQKV, sgd=sgd)
            return affine.ops.unflatten(dX, lengths)

        if get_mask is not None:
            xp = get_array_module(Qs[0])
            masks = [get_mask(xp, length, length) for length in lengths]
        else:
            masks = [None for _ in lengths]
        return (Qs, Ks, Vs, masks), qkv_sa_backward

    return wrap(qkv_sa_forward, affine)


def window_mask(n):
    def get_mask(xp, nX, nY):
        # Each position may attend to the n positions on either side of it.
        mask = xp.zeros((nX, nY), dtype="f")
        for i in range(nX):
            mask[i, max(0, i - n) : i + n + 1] = 1
        return mask

    return get_mask


def _split_seqs(QKV, lengths, nH, nD):
    assert sum(lengths) == QKV.shape[0], (sum(lengths), QKV.shape[0])
    Qs = []
    Ks = []
    Vs = []
    i = 0
    xp = get_array_module(QKV)
    for length in lengths:
        qkv = QKV[i : i + length]
        qkv = qkv.reshape((length, 3, nH * nD))
        queries = xp.ascontiguousarray(qkv[:, 0])
        keys = xp.ascontiguousarray(qkv[:, 1])
        values = xp.ascontiguousarray(qkv[:, 2])
        Qs.append(queries.reshape((-1, nH, nD)))
        Ks.append(keys.reshape((-1, nH, nD)))
        Vs.append(values.reshape((-1, nH, nD)))
        i += length
    return Qs, Ks, Vs


def _join_seqs(Qs, Ks, Vs, nH, nD):
    xp = get_array_module(Qs[0])
    Q = xp.vstack(Qs).reshape((-1, nH * nD))
    K = xp.vstack(Ks).reshape((-1, nH * nD))
    V = xp.vstack(Vs).reshape((-1, nH * nD))
    assert Q.shape[0] == K.shape[0] == V.shape[0]
    return xp.hstack((Q, K, V))


class MultiHeadedAttention(Model):
    """Multi-headed attention. Requires a preprocessor to prepare
    (Qs, Ks, Vs, masks) triples, such as the prepare_self_attention()
    preprocessor. A layer should run after this to do the projection as
    well."""

    def __init__(self):
        Model.__init__(self)

    def begin_update(self, Qs_Ks_Vs_masks, drop=0.0):
        Qs, Ks, Vs, masks = Qs_Ks_Vs_masks
        if masks is None:
            masks = [None for _ in Qs]
        assert len(Qs) == len(Ks) == len(Vs)
        return self._attend_seqs(Qs, Ks, Vs, masks)

    def _attend_seqs(self, Qs, Ks, Vs, masks):
        outputs = []
        backprops = []
        assert len(Qs) == len(Ks) == len(Vs) == len(masks), (
            len(Qs),
            len(Ks),
            len(Vs),
            len(masks),
        )
        for Q, K, V, mask in zip(Qs, Ks, Vs, masks):
            output, backprop = self._attend(Q, K, V, mask)
            outputs.append(output)
            backprops.append(backprop)
            assert output.shape[0] == Q.shape[0]

        def backprop_attend_seqs(d_outputs, sgd=None):
            dQs = []
            dKs = []
            dVs = []
            for d_output, backprop in zip(d_outputs, backprops):
                dQ, dK, dV = backprop(d_output, sgd=sgd)
                dQs.append(dQ)
                dKs.append(dK)
                dVs.append(dV)
            return dQs, dKs, dVs

        assert len(outputs) == len(Qs)
        return outputs, backprop_attend_seqs

    def _attend(self, Q, K, V, mask):
        """Compute attention on a (query, key, value) triplet.

        The similarity of the (Q, K) pairs is used to compute an attention
        matrix, which is used to rescale V.
        """
        attn, get_dQ_dK = self._get_attn_weights(Q, K, mask)
        output, get_d_attn_dV = self._apply_attn(attn, V)

        def backprop_attend(d_output, sgd=None):
            d_attn, dV = get_d_attn_dV(d_output)
            dQ, dK = get_dQ_dK(d_attn)
            return (dQ, dK, dV)

        return output, backprop_attend

    def _get_attn_weights(self, Q0, K0, mask):
        nQ, nK, nH, nD = (Q0.shape[0], K0.shape[0], Q0.shape[1], Q0.shape[2])
        assert Q0.shape == (nQ, nH, nD)
        assert K0.shape == (nK, nH, nD)
        # Scale the dot products by the square root of the head width.
        sqrtM = self.ops.xp.sqrt(nD).astype("f")
        Q1 = _trans(Q0, 1, 0, 2)
        assert Q1.shape == (nH, nQ, nD)
        K1 = _trans(K0, 1, 2, 0)
        assert K1.shape == (nH, nD, nK)
        attn0 = self.ops.matmul(Q1, K1) / sqrtM
        assert attn0.shape == (nH, nQ, nK)
        attn1, backprop_mask = self._apply_mask(attn0, mask)
        attn2 = self.ops.softmax(attn1, axis=-1)
        assert attn2.shape == (nH, nQ, nK)

        def backprop_attn1(d_attn2, sgd=None):
            assert d_attn2.shape == (nH, nQ, nK)
            d_attn1 = self.ops.backprop_softmax(attn2, d_attn2, axis=-1)
            # Undo the masking and the sqrt scaling applied on the way in.
            d_attn0 = backprop_mask(d_attn1) / sqrtM
            assert d_attn0.shape == (nH, nQ, nK)
            dQ1 = self.ops.matmul(d_attn0, _trans(K1, 0, 2, 1))
            assert dQ1.shape == (nH, nQ, nD)
            dK1 = self.ops.matmul(_trans(Q1, 0, 2, 1), d_attn0)
            assert dK1.shape == (nH, nD, nK)
            dK0 = _trans(dK1, 2, 0, 1)
            assert dK0.shape == (nK, nH, nD)
            dQ0 = _trans(dQ1, 1, 0, 2)
            assert dQ0.shape == (nQ, nH, nD)
            return dQ0, dK0

        return attn2, backprop_attn1

    def _apply_mask(self, attn, mask):
        def backprop_apply_mask(d_attn, sgd=None):
            if mask is None:
                return d_attn
            else:
                return d_attn * mask

        if mask is None:
            return attn, backprop_apply_mask
        else:
            # Push masked-out positions towards -inf, so the softmax gives
            # them ~0 weight.
            return attn - (1 - mask) * 1e9, backprop_apply_mask

    def _apply_attn(self, attn, V0):
        """Multiplication with values"""
        nH, nQ, nV = attn.shape
        nD = V0.shape[-1]
        assert V0.shape == (nV, nH, nD)
        V1 = _trans(V0, 1, 0, 2)
        assert V1.shape == (nH, nV, nD)
        S0 = self.ops.matmul(attn, V1)
        assert S0.shape == (nH, nQ, nD)
        S1 = _trans(S0, 1, 0, 2)
        assert S1.shape == (nQ, nH, nD)
        S2 = S1.reshape((nQ, nH * nD))

        def backprop_apply_attn(dS2, sgd=None):
            assert dS2.shape == (nQ, nH * nD)
            dS1 = dS2.reshape((nQ, nH, nD))
            dS0 = self.ops.xp.ascontiguousarray(dS1.transpose((1, 0, 2)))
            assert dS0.shape == (nH, nQ, nD)
            # d_attn: (nH, nQ, nV) = (nH, nQ, nD) @ (nH, nD, nV)
            d_attn = self.ops.matmul(dS0, _trans(V1, 0, 2, 1))
            # dV1: (nH, nV, nD) = (nH, nV, nQ) @ (nH, nQ, nD)
            dV1 = self.ops.matmul(_trans(attn, 0, 2, 1), dS0)
            dV0 = _trans(dV1, 1, 0, 2)
            assert dV0.shape == (nV, nH, nD)
            return d_attn, dV0

        return S2, backprop_apply_attn


def _trans(X, *order):
    """Transpose and make contiguous"""
    xp = get_array_module(X)
    return xp.ascontiguousarray(X.transpose(order))
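
# Usage sketch (not part of the module's API): how the pieces above compose.
# The Affine is assumed to project nM-wide vectors to 3*nM (the concatenated
# Q, K, V). Whether Affine(nO, nI) is usable without an explicit
# begin_training() call depends on the thinc version; the snippet assumes it
# is. Run via `python -m thinc.neural._classes.multiheaded_attention` so the
# relative imports resolve. Treat this as a sketch, not a reference.
if __name__ == "__main__":
    nM, nH = 300, 6  # model width and number of heads; nD = nM // nH = 50
    prepare = prepare_self_attention(Affine(nM * 3, nM), window=None, nM=nM, nH=nH)
    attend = MultiHeadedAttention()

    # Two "sequences" of word vectors, of lengths 5 and 3.
    Xs = [
        np.random.uniform(-1.0, 1.0, (5, nM)).astype("f"),
        np.random.uniform(-1.0, 1.0, (3, nM)).astype("f"),
    ]
    qkv_masks, backprop_prepare = prepare.begin_update(Xs)
    Ys, backprop_attend = attend.begin_update(qkv_masks)
    # One output per sequence, same length, width nH * nD = nM.
    print([Y.shape for Y in Ys])  # [(5, 300), (3, 300)]

    # Backward pass: gradients flow back to the input word vectors.
    dQs_dKs_dVs = backprop_attend([np.ones_like(Y) for Y in Ys])
    dXs = backprop_prepare(dQs_dKs_dVs)
    print([dX.shape for dX in dXs])  # [(5, 300), (3, 300)]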