ó
<¿CVc           @   sŒ   d  Z  y d d l Z Wn e k
 r) n Xd d l m Z d e d „ Z d „  Z d „  Z	 d d d	 d d
 „ Z
 d d d „ Z d „  Z d S(   s  
Text Segmentation Metrics

1. Windowdiff

Pevzner, L., and Hearst, M., A Critique and Improvement of
  an Evaluation Metric for Text Segmentation,
Computational Linguistics 28, 19-36


2. Generalized Hamming Distance

Bookstein A., Kulyukin V.A., Raita T.
Generalized Hamming Distance
Information Retrieval 5, 2002, pp 353-375

Baseline implementation in C++
http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html

Study describing benefits of Generalized Hamming Distance Versus
WindowDiff for evaluating text segmentation tasks
Begsten, Y.  Quel indice pour mesurer l'efficacite en segmentation de textes ?
TALN 2009


3. Pk text segmentation metric

Beeferman D., Berger A., Lafferty J. (1999)
Statistical Models for Text Segmentation
Machine Learning, 34, 177-210
iÿÿÿÿN(   t   xranget   1c         C   sç   t  |  ƒ t  | ƒ k r' t d ƒ ‚ n  | t  |  ƒ k rH t d ƒ ‚ n  d } x€ t t  |  ƒ | d ƒ D]d } t |  | | | !j | ƒ | | | | !j | ƒ ƒ } | rº | | 7} qi | t d | ƒ 7} qi W| t  |  ƒ | d S(   sW  
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

        >>> s1 = "000100000010"
        >>> s2 = "000010000100"
        >>> s3 = "100000010000"
        >>> '%.2f' % windowdiff(s1, s1, 3)
        '0.00'
        >>> '%.2f' % windowdiff(s1, s2, 3)
        '0.30'
        >>> '%.2f' % windowdiff(s2, s3, 3)
        '0.80'

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :param weighted: use the weighted variant of windowdiff
    :type weighted: boolean
    :rtype: float
    s!   Segmentations have unequal lengthsC   Window width k should be smaller or equal than segmentation lengthsi    i   g      ð?(   t   lent
   ValueErrort   ranget   abst   countt   min(   t   seg1t   seg2t   kt   boundaryt   weightedt   wdt   it   ndiff(    (    sk   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/metrics/segmentation.pyt
   windowdiff3   s    !8c         C   s_   t  j |  | f ƒ } | t  j | ƒ | d d  d  … f <| t  j |  ƒ | d  d  … d f <| S(   Ni    (   t   npt   emptyt   arange(   t   nrowst   ncolst   ins_costt   del_costt   mat(    (    sk   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/metrics/segmentation.pyt	   _init_matb   s    ##c         C   sß   xØ t  | ƒ D]Ê \ } } x» t  | ƒ D]­ \ } }	 | t | |	 ƒ |  | | f }
 | |	 k rs |  | | f } n? | |	 k rš | |  | | d f } n | |  | d | f } t | |
 ƒ |  | d | d f <q& Wq Wd  S(   Ni   (   t	   enumerateR   R   (   R   t   rowvt   colvR   R   t   shift_cost_coeffR   t   rowit   jt   coljt
   shift_costt   tcost(    (    sk   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/metrics/segmentation.pyt   _ghd_auxi   s    "g       @g      ð?c         C   s  g  t  |  ƒ D] \ } } | | k r | ^ q } g  t  | ƒ D] \ } } | | k r> | ^ q> }	 t | ƒ }
 t |	 ƒ } |
 d k r– | d k r– d S|
 d k r¶ | d k r¶ |
 | S|
 d k rÖ | d k rÖ | | St | d |
 d | | ƒ } t | |	 | | | | ƒ | d S(   sb  
    Compute the Generalized Hamming Distance for a reference and a hypothetical
    segmentation, corresponding to the cost related to the transformation
    of the hypothetical segmentation into the reference segmentation
    through boundary insertion, deletion and shift operations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    Recommended parameter values are a shift_cost_coeff of 2.
    Associated with a ins_cost, and del_cost equal to the mean segment
    length in the reference segmentation.

        >>> # Same examples as Kulyukin C++ implementation
        >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
        0.5
        >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
        2.0
        >>> ghd('011', '110', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('1', '0', 1.0, 1.0, 0.5)
        1.0
        >>> ghd('111', '000', 1.0, 1.0, 0.5)
        3.0
        >>> ghd('000', '111', 1.0, 2.0, 0.5)
        6.0

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the hypothetical segmentation
    :type hyp: str or list
    :param ins_cost: insertion cost
    :type ins_cost: float
    :param del_cost: deletion cost
    :type del_cost: float
    :param shift_cost_coeff: constant used to compute the cost of a shift.
    shift cost = shift_cost_coeff * |i - j| where i and j are
    the positions indicating the shift
    :type shift_cost_coeff: float
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    i    g        i   iÿÿÿÿ(   iÿÿÿÿiÿÿÿÿ(   R   R   R   R#   (   t   reft   hypR   R   R   R   R   t   valt   ref_idxt   hyp_idxt
   nref_boundt
   nhyp_boundR   (    (    sk   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/metrics/segmentation.pyt   ghdy   s    .11c         C   sÒ   | d k r8 t t t |  ƒ |  j | ƒ d ƒ ƒ } n  d } x{ t t |  ƒ | d ƒ D]_ } |  | | | !j | ƒ d k } | | | | !j | ƒ d k } | | k rY | d 7} qY qY W| t |  ƒ | d S(   sù  
    Compute the Pk metric for a pair of segmentations A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type boundary: str or int or bool
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    g       @i    i   g      ð?N(   t   Nonet   intt   roundR   R   R    (   R$   R%   R
   R   t   errR   t   rt   h(    (    sk   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/metrics/segmentation.pyt   pk»   s    ,!  c         C   sD   d d l  m } y d d  l } Wn t k
 r? | d ƒ ‚ n Xd  S(   Niÿÿÿÿ(   t   SkipTests/   numpy is required for nltk.metrics.segmentation(   t   noseR3   t   numpyt   ImportError(   t   moduleR3   R5   (    (    sk   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/metrics/segmentation.pyt   setup_moduleá   s
    (   t   __doc__R5   R   R6   t   nltk.compatR    t   FalseR   R   R#   R+   R,   R2   R8   (    (    (    sk   /private/var/folders/cc/xm4nqn811x9b50x1q_zpkmvdjlphkp/T/pip-build-FUwmDn/nltk/nltk/metrics/segmentation.pyt   <module>*   s   /		B&