import random
from datetime import datetime, timezone

# A prediction is auto-annotated only when its margin is strictly above this.
AUTOANNOTATION_THRESHOLD = 0.50
JOB_TYPE = "groundtruth/text-classification"

# Prefix of the label strings; the integer suffix is the class index.
_LABEL_PREFIX = "__label__"


class SimpleActiveLearning:
    """Margin-based auto-annotation and random selection for human labeling."""

    def __init__(self, job_name, label_category_name, label_names, max_selections):
        """
        Store the job configuration.

        job_name: written to the "job-name" metadata field.
        label_category_name: output key under which the label index is stored;
            the metadata is stored under "<label_category_name>-metadata".
        label_names: sequence indexed by label index, giving the class name.
        max_selections: upper bound on the number of ids returned by
            select_for_labeling.
        """
        self.job_name = job_name
        self.label_category_name = label_category_name
        self.label_names = label_names
        self.max_selections = max_selections

    def compute_margin(self, probabilities, labels):
        """
        Compute the margin and the best label for a probability distribution.

        The margin is the highest probability minus the second highest one
        (0.0 is used as the runner-up when there is only one probability).

        Returns a tuple (margin, best_label) where best_label is the entry of
        labels at the position of the highest probability. Ties resolve to the
        first occurrence. Raises ValueError when probabilities is empty.
        """
        max_probability = max(probabilities)
        max_prob_index = probabilities.index(max_probability)
        best_label = labels[max_prob_index]
        # Drop only the winning position so a tied duplicate still counts as
        # the runner-up (giving a margin of 0).
        remaining_probs = [
            prob for i, prob in enumerate(probabilities) if i != max_prob_index
        ]
        second_probability = max(remaining_probs, default=0.0)
        return max_probability - second_probability, best_label

    def get_label_index(self, inference_label_output) -> int:
        """
        Extract the integer suffix of a label string.

        inference_label_output is of the format "__label__0"; the text after
        the last underscore is parsed as an int, so "__label__0" returns 0.
        """
        return int(inference_label_output.split("_")[-1])

    def make_metadata(self, margin, best_label) -> dict:
        """
        Build the metadata dictionary that accompanies an auto-annotation.

        The margin is stored as "confidence" rounded to two decimals, and the
        class name is looked up in self.label_names by the label's index.
        """
        # Timezone-aware "now" in UTC; the format has no zone field, so the
        # rendered text is the plain UTC wall-clock time.
        now_utc = datetime.now(timezone.utc)
        return {
            "confidence": round(float(margin), 2),
            "job-name": self.job_name,
            "class-name": self.label_names[self.get_label_index(best_label)],
            "human-annotated": "no",
            # %M is minutes (%m would repeat the month in the minutes field).
            "creation-date": now_utc.strftime("%Y-%m-%dT%H:%M:%S.%f"),
            "type": JOB_TYPE,
        }

    def make_autoannotation(self, prediction, source, margin, best_label) -> dict:
        """
        Generate the final output record with the label index and metadata.

        Reads source["inputs"] and prediction["id"].
        """
        return {
            "source": source["inputs"],
            "id": prediction["id"],
            f"{self.label_category_name}": self.get_label_index(best_label),
            f"{self.label_category_name}-metadata": self.make_metadata(
                margin, best_label
            ),
        }

    def autoannotate(self, predictions, sources) -> list:
        """
        Auto-annotate every prediction whose margin exceeds the threshold.

        predictions: iterable of dicts with an "id" and an "outputs" entry
            whose first element is the list of class probabilities.
        sources: iterable of dicts with an "id" and an "inputs" entry.

        Returns the list of auto-annotation records, in prediction order, for
        predictions with margin strictly greater than AUTOANNOTATION_THRESHOLD.
        Raises KeyError when an auto-annotated prediction has no source with
        the same id.
        """
        sources_by_id = {source["id"]: source for source in sources}
        autoannotations = []
        for prediction in predictions:
            probabilities = prediction["outputs"][0]
            # One label per probability, so any number of classes is handled
            # rather than a fixed set of four.
            labels = [f"{_LABEL_PREFIX}{i}" for i in range(len(probabilities))]
            margin, best_label = self.compute_margin(probabilities, labels)
            if margin > AUTOANNOTATION_THRESHOLD:
                autoannotations.append(
                    self.make_autoannotation(
                        prediction,
                        sources_by_id[prediction["id"]],
                        margin,
                        best_label,
                    )
                )
        return autoannotations

    def select_for_labeling(self, predictions, autoannotations) -> list:
        """
        Select the next set of records to be labeled by humans.

        Picks, uniformly at random and without repetition, at most
        self.max_selections ids from the predictions that were not
        auto-annotated. Returns the list of selected ids.
        """
        initial_ids = {prediction["id"] for prediction in predictions}
        autoannotation_ids = {
            autoannotation["id"] for autoannotation in autoannotations
        }
        remaining_ids = initial_ids - autoannotation_ids
        # random.sample requires a sequence; passing a set raises TypeError
        # on Python 3.11+.
        candidates = list(remaining_ids)
        return random.sample(candidates, min(self.max_selections, len(candidates)))