"""Information Retrieval metrics Useful Resources: http://www.cs.utexas.edu/~mooney/ir-course/slides/Evaluation.ppt http://www.nii.ac.jp/TechReports/05-014E.pdf http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf http://hal.archives-ouvertes.fr/docs/00/72/67/60/PDF/07-busa-fekete.pdf Learning to Rank for Information Retrieval (Tie-Yan Liu) """ import numpy as np def mean_reciprocal_rank(rs): """Score is reciprocal of the rank of the first relevant item First element is 'rank 1'. Relevance is binary (nonzero is relevant). Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]] >>> mean_reciprocal_rank(rs) 0.61111111111111105 >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]]) >>> mean_reciprocal_rank(rs) 0.5 >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]] >>> mean_reciprocal_rank(rs) 0.75 Args: rs: Iterator of relevance scores (list or numpy) in rank order (first element is the first item) Returns: Mean reciprocal rank """ rs = (np.asarray(r).nonzero()[0] for r in rs) return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs]) def r_precision(r): """Score is precision after all relevant documents have been retrieved Relevance is binary (nonzero is relevant). >>> r = [0, 0, 1] >>> r_precision(r) 0.33333333333333331 >>> r = [0, 1, 0] >>> r_precision(r) 0.5 >>> r = [1, 0, 0] >>> r_precision(r) 1.0 Args: r: Relevance scores (list or numpy) in rank order (first element is the first item) Returns: R Precision """ r = np.asarray(r) != 0 z = r.nonzero()[0] if not z.size: return 0. return np.mean(r[:z[-1] + 1]) def precision_at_k(r, k): """Score is precision @ k Relevance is binary (nonzero is relevant). >>> r = [0, 0, 1] >>> precision_at_k(r, 1) 0.0 >>> precision_at_k(r, 2) 0.0 >>> precision_at_k(r, 3) 0.33333333333333331 >>> precision_at_k(r, 4) Traceback (most recent call last): File "", line 1, in ? ValueError: Relevance score length < k Args: r: Relevance scores (list or numpy) in rank order (first element is the first item) Returns: Precision @ k Raises: ValueError: len(r) must be >= k """ assert k >= 1 r = np.asarray(r)[:k] != 0 if r.size != k: raise ValueError('Relevance score length < k') return np.mean(r) def average_precision(r): """Score is average precision (area under PR curve) Relevance is binary (nonzero is relevant). >>> r = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1] >>> delta_r = 1. / sum(r) >>> sum([sum(r[:x + 1]) / (x + 1.) * delta_r for x, y in enumerate(r) if y]) 0.7833333333333333 >>> average_precision(r) 0.78333333333333333 Args: r: Relevance scores (list or numpy) in rank order (first element is the first item) Returns: Average precision """ r = np.asarray(r) != 0 out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]] if not out: return 0. return np.mean(out) def mean_average_precision(rs): """Score is mean average precision Relevance is binary (nonzero is relevant). >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1]] >>> mean_average_precision(rs) 0.78333333333333333 >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1], [0]] >>> mean_average_precision(rs) 0.39166666666666666 Args: rs: Iterator of relevance scores (list or numpy) in rank order (first element is the first item) Returns: Mean average precision """ return np.mean([average_precision(r) for r in rs]) def dcg_at_k(r, k, method=0): """Score is discounted cumulative gain (dcg) Relevance is positive real values. Can use binary as the previous methods. Example from http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0] >>> dcg_at_k(r, 1) 3.0 >>> dcg_at_k(r, 1, method=1) 3.0 >>> dcg_at_k(r, 2) 5.0 >>> dcg_at_k(r, 2, method=1) 4.2618595071429155 >>> dcg_at_k(r, 10) 9.6051177391888114 >>> dcg_at_k(r, 11) 9.6051177391888114 Args: r: Relevance scores (list or numpy) in rank order (first element is the first item) k: Number of results to consider method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...] If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...] Returns: Discounted cumulative gain """ r = np.asfarray(r)[:k] if r.size: if method == 0: return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) elif method == 1: return np.sum(r / np.log2(np.arange(2, r.size + 2))) else: raise ValueError('method must be 0 or 1.') return 0. def ndcg_at_k(r, k, method=0): """Score is normalized discounted cumulative gain (ndcg) Relevance is positive real values. Can use binary as the previous methods. Example from http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0] >>> ndcg_at_k(r, 1) 1.0 >>> r = [2, 1, 2, 0] >>> ndcg_at_k(r, 4) 0.9203032077642922 >>> ndcg_at_k(r, 4, method=1) 0.96519546960144276 >>> ndcg_at_k([0], 1) 0.0 >>> ndcg_at_k([1], 2) 1.0 Args: r: Relevance scores (list or numpy) in rank order (first element is the first item) k: Number of results to consider method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...] If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...] Returns: Normalized discounted cumulative gain """ dcg_max = dcg_at_k(sorted(r, reverse=True), k, method) if not dcg_max: return 0. return dcg_at_k(r, k, method) / dcg_max if __name__ == "__main__": import doctest doctest.testmod()