є
<┐CVc           @   s▓   d  Z  d d l m Z d d l m Z d d l m Z d d l m Z d d l m Z d d l m	 Z	 d d l
 m Z d d	 l Z d
 e f d Д  Г  YZ d e f d Д  Г  YZ d	 S(   sС  
Lexical translation model that considers word order.

IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.

The EM algorithm used in Model 2 is:
E step - In the training data, collect counts, weighted by prior
         probabilities.
         (a) count how many times a source language word is translated
             into a target language word
         (b) count how many times a particular position in the source
             sentence is aligned to a particular position in the target
             sentence

M step - Estimate new probabilities based on the counts from the E step


Notations:
i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
l: Number of words in the source sentence, excluding NULL
m: Number of words in the target sentence
s: A word in the source language
t: A word in the target language


References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
i    (   t   division(   t   defaultdict(   t   AlignedSent(   t	   Alignment(   t   IBMModel(   t	   IBMModel1(   t   CountsNt	   IBMModel2c           B   sb   e  Z d  Z d
 d Д Z d Д  Z d Д  Z d Д  Z d Д  Z d Д  Z	 d Д  Z
 d Д  Z d	 Д  Z RS(   sY  
    Lexical translation model that considers word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'gro├Я'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm2 = IBMModel2(bitext, 5)

    >>> print(round(ibm2.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm2.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm2.translation_table['buch'][None], 3))
    0.0
    >>> print(round(ibm2.translation_table['ja'][None], 3))
    0.0

    >>> print(ibm2.alignment_table[1][1][2][2])
    0.938...
    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
    1.0

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    c         C   sг   t  t |  Г j | Г | d k rQ t | d | Г } | j |  _ |  j | Г n | d |  _ | d |  _ x$ t d | Г D] } |  j	 | Г q{ W|  j
 | Г d S(   sЩ  
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model and an alignment model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        i   t   translation_tablet   alignment_tablei    N(   t   superR   t   __init__t   NoneR   R   t   set_uniform_probabilitiesR	   t   ranget   traint   _IBMModel2__align_all(   t   selft   sentence_aligned_corpust
   iterationst   probability_tablest   ibm1t   n(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR   c   s    c   	      C   s·   t  Г  } xъ | D]т } t | j Г } t | j Г } | | f | k r | j | | f Г d t | d Г } | t j k  rЪ t j	 d t
 | Г d Г n  xU t d | d Г D]= } x4 t d | d Г D] } | |  j | | | | <q╚ Wqо Wq q Wd  S(   Ni   s   A source sentence is too long (s&    words). Results may be less accurate.i    (   t   sett   lent   motst   wordst   addt   floatR   t   MIN_PROBt   warningst   warnt   strR   R	   (	   R   R   t   l_m_combinationst   aligned_sentencet   lt   mt   initial_probt   it   j(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR   Н   s    	c      	   C   s*  t  Г  } x | D]° } d  g | j } d g | j } t | j Г } t | j Г } |  j | | Г } xЯ t d | d Г D]К }	 | |	 }
 xw t d | d Г D]b } | | } |  j | |	 | | Г } | | |
 } | j | | |
 Г | j	 | | |	 | | Г qЮ Wqz Wq W|  j
 | Г |  j | Г d  S(   Nt   UNUSEDi   i    (   t   Model2CountsR   R   R   R   t   prob_all_alignmentsR   t   prob_alignment_pointt   update_lexical_translationt   update_alignmentt*   maximize_lexical_translation_probabilitiest    maximize_alignment_probabilities(   R   t   parallel_corpust   countsR"   t   src_sentencet   trg_sentenceR#   R$   t   total_countR'   t   tR&   t   st   countt   normalized_count(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR   Ю   s$    	

%c         C   s╞   t  j } x╢ | j j Г  D]е \ } } xЦ | j Г  D]И \ } } xy | j Г  D]k \ } } x\ | D]T }	 | j | | | |	 | j | | |	 }
 t |
 | Г |  j | | | |	 <q^ WqK Wq2 Wq Wd  S(   N(   R   R   t	   alignmentt   itemst   alignment_for_any_it   maxR	   (   R   R1   R   R&   t   j_sR'   t   src_sentence_lengthsR#   t   trg_sentence_lengthsR$   t   estimate(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR/   ╣   s    	c      
   C   s   t  d Д  Г } xi t d t | Г Г D]R } | | } x? t d t | Г Г D]( } | | c |  j | | | | Г 7<qK Wq% W| S(   sя  
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        c           S   s   d S(   Ng        (    (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyt   <lambda>╥   s    i   i    (   R   R   R   R+   (   R   R2   R3   t   alignment_prob_for_tR'   R5   R&   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR*   ─   s    
c   	      C   sZ   t  | Г d } t  | Г d } | | } | | } |  j | | |  j | | | | S(   sz   
        Probability that position j in ``trg_sentence`` is aligned to
        position i in the ``src_sentence``
        i   (   R   R   R	   (	   R   R&   R'   R2   R3   R#   R$   R6   R5   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR+   ┌   s
    

c   	      C   s┤   d } t  | j Г d } t  | j Г d } xu t | j Г D]d \ } } | d k rZ q< n  | j | } | j | } | |  j | | |  j | | | | 9} q< Wt | t j	 Г S(   sc   
        Probability of target sentence and an alignment given the
        source sentence
        g      Ё?i   i    (
   R   R2   R3   t	   enumerateR9   R   R	   R<   R   R   (	   R   t   alignment_infot   probR#   R$   R'   R&   t   trg_wordt   src_word(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyt   prob_t_a_given_sх   s    c         C   s"   x | D] } |  j  | Г q Wd  S(   N(   t   _IBMModel2__align(   R   R0   t   sentence_pair(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyt   __align_all°   s    c         C   s  g  } t  | j Г } t  | j Г } x▀ t | j Г D]╬ \ } } |  j | d |  j d | d | | } t | t j	 Г } d } xh t | j Г D]W \ }	 }
 |  j | |
 |  j |	 d | d | | } | | k rФ | } |	 } qФ qФ W| j
 | | f Г q4 Wt | Г | _ d S(   s  
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        i    i   N(   R   R   R   RC   R   R   R	   R<   R   R   t   appendR   R9   (   R   RJ   t   best_alignmentR#   R$   R'   RF   t	   best_probt   best_alignment_pointR&   RG   t
   align_prob(    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyt   __align№   s     "N(   t   __name__t
   __module__t   __doc__R   R   R   R   R/   R*   R+   RH   R   RI   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR   ;   s   &)							R)   c           B   s)   e  Z d  Z d Д  Z d Д  Z d Д  Z RS(   so   
    Data object to store counts of various parameters during training.
    Includes counts for alignment.
    c         C   s;   t  t |  Г j Г  t d Д  Г |  _ t d Д  Г |  _ d  S(   Nc           S   s   t  d Д  Г S(   Nc           S   s   t  d Д  Г S(   Nc           S   s   t  d Д  Г S(   Nc           S   s   d S(   Ng        (    (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyRA   *  s    (   R   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyRA   )  s   (   R   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyRA   )  s    (   R   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyRA   )  s    c           S   s   t  d Д  Г S(   Nc           S   s   t  d Д  Г S(   Nc           S   s   d S(   Ng        (    (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyRA   ,  s    (   R   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyRA   ,  s    (   R   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyRA   ,  s    (   R
   R)   R   R   R9   R;   (   R   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR   &  s
    c         C   s.   |  j  | | c | 7<|  j | c | 7<d  S(   N(   t	   t_given_st   any_t_given_s(   R   R7   R6   R5   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR,   .  s    c         C   s>   |  j  | | | | c | 7<|  j | | | c | 7<d  S(   N(   R9   R;   (   R   R7   R&   R'   R#   R$   (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR-   2  s    (   RR   RS   RT   R   R,   R-   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyR)   !  s   		(   RT   t
   __future__R    t   collectionsR   t   nltk.translateR   R   R   R   t   nltk.translate.ibm_modelR   R   R   R)   (    (    (    se   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/translate/ibm2.pyt   <module>/   s   ц