"""Utility functions for Document search""" try: import numpy as np except ImportError: # No need to log it here as numpy is only used if SentenceTransformers is used # The latter has numpy as dependency. pass import math import editdistance from textractor.data.constants import SimilarityMetric from textractor.exceptions import MissingDependencyException from textractor.data.constants import ( IS_COLUMN_HEAD, IS_FOOTER_CELL, IS_TITLE_CELL, IS_SUMMARY_CELL, IS_SECTION_TITLE_CELL, CellTypes, ) class SearchUtils: model = None util = None model_string = "all-MiniLM-L6-v2" @classmethod def get_word_similarity( cls, word_1: str, word_2: str, similarity_metric: SimilarityMetric ) -> float: """ Returns the extent of similarity between the input words using the similarity_metric input by the user. :param word_1: First word to check for similarity :type word_1: str :param word_2: Second word to check for similarity :type word_2: str :param similarity_metric: The function supports one of 3 metrics \ * Levenshtein distance/ edit distance \ * Euclidean distance \ * Cosine distance :type similarity_metric: str :return: Returns the similarity measure calculated based on the metric for the 2 input words. :rtype: float """ if cls.model is None and similarity_metric != SimilarityMetric.LEVENSHTEIN: try: from sentence_transformers import SentenceTransformer, util except ImportError: raise MissingDependencyException( "sentence_transformers is not installed. Use SimilarityMetric.LEVENSHTEIN." ) cls.model = SentenceTransformer(cls.model_string) cls.util = util if similarity_metric == SimilarityMetric.LEVENSHTEIN: return normalized_edit_distance( word_1.lower(), word_2.lower() ) elif similarity_metric == SimilarityMetric.EUCLIDEAN: ref_word_emb = cls.model.encode([word_1]) word_emb = cls.model.encode([word_2]) dist = np.linalg.norm(ref_word_emb - word_emb) return dist else: ref_word_emb = cls.model.encode([word_1]) word_emb = cls.model.encode([word_2]) similarity = cls.util.cos_sim(ref_word_emb, word_emb) return similarity.item() def jaccard_similarity(list_1: list, list_2: list) -> float: """ Calculates Jaccard similarity between the 2 input lists. :param list_1: First list to check for similarity :type list_1: list :param list_2: Second list to check for similarity :type list_2: list :return: Returns the similarity measure calculated for the 2 input lists. :rtype: float """ set_1 = set(list_1) set_2 = set(list_2) return float(len(set_1.intersection(set_2)) / len(set_1.union(set_2))) def get_metadata_attr_name(cell_atr): """ Returns metadata attribute mapping to the input CellType. :param cell_atr: Input cell type :type: enum :return: Returns metadata attribute mapping to the input CellType. :rtype: str """ cell_map = { CellTypes.COLUMN_HEADER: IS_COLUMN_HEAD, CellTypes.SECTION_TITLE: IS_SECTION_TITLE_CELL, CellTypes.SUMMARY_CELL: IS_SUMMARY_CELL, CellTypes.FLOATING_TITLE: IS_TITLE_CELL, CellTypes.FLOATING_FOOTER: IS_FOOTER_CELL, } try: return cell_map[cell_atr] except: return "" def normalized_edit_distance(s1: str, s2: str): """ Returns the normalized edit distance from Lopresti et al. :param s1: First string :type s1: str :param s2: Second string :type s2: str """ dist = editdistance.eval(s1, s2) if min(len(s1), len(s2)) - dist == 0: return 0.0 return 1.0 / math.exp(dist / (min(len(s1), len(s2)) - dist))