# -*- coding: utf-8 -*-
"""
Lexical translation model that ignores word order.

In IBM Model 1, word order is ignored for simplicity. Thus, the
following two alignments are equally likely.

Source: je mange du jambon
Target: i eat some ham
Alignment: (1,1) (2,2) (3,3) (4,4)

Source: je mange du jambon
Target: some ham eat i
Alignment: (1,4) (2,3) (3,2) (4,1)

The EM algorithm used in Model 1 is:
E step - In the training data, count how many times a source language
         word is translated into a target language word, weighted by
         the prior probability of the translation.

M step - Estimate the new probability of translation based on the
         counts from the Expectation step.

Notations:
i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
s: A word in the source language
t: A word in the target language

References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

from __future__ import division
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate.ibm_model import Counts
import warnings


class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(ibm1.translation_table['buch']['book'])
    0.889...
    >>> print(ibm1.translation_table['das']['book'])
    0.061...
    >>> print(ibm1.translation_table['buch'][None])
    0.113...
    >>> print(ibm1.translation_table['ja'][None])
    0.072...

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
    """

    def __init__(self, sentence_aligned_corpus, iterations,
                 probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object
        """
        super(IBMModel1, self).__init__(sentence_aligned_corpus)

        if probability_tables is None:
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables['translation_table']

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        self.__align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        initial_prob = 1 / len(self.trg_vocab)
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn("Target language vocabulary is too large (" +
                          str(len(self.trg_vocab)) + " words). "
                          "Results may be less accurate.")

        for t in self.trg_vocab:
            self.translation_table[t] = defaultdict(lambda: initial_prob)

    def train(self, parallel_corpus):
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots  # 0 is NULL

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts, weighted by prior probabilities
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)
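
    # NOTE: This copy of the module is truncated after ``train``. The
    # methods below are a sketch of the remaining pieces that
    # ``__init__`` and ``train`` call (``prob_all_alignments``,
    # ``prob_alignment_point``, ``prob_t_a_given_s``, and the private
    # ``__align_all``/``__align`` pair); their bodies are assumptions
    # reconstructed to be consistent with those call sites and with the
    # doctest in the class docstring, not verbatim source.

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t.

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(lambda: 0.0)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence.
        """
        return self.translation_table[t][s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence.
        """
        prob = 1.0
        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroeth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]
        return max(prob, IBMModel.MIN_PROB)

    def __align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.__align(sentence_pair)

    def __align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []
        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None],
                            IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i
            best_alignment.append((j, best_alignment_point))
        sentence_pair.alignment = Alignment(best_alignment)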