# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import numpy as np
import tensorflow_hub as hub
from bert.tokenization import bert_tokenization

# Functions in this file are from the following two repositories:
# https://github.com/soutsios/pos-tagger-bert/blob/master/pos_tagger_bert.ipynb
# https://github.com/google-research/bert


class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it
    means the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
          guid: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
          text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
          label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


def create_tokenizer_from_hub_module(bert_path):
    """
    Get the vocab file and casing info from the Hub module.

    :param bert_path: (str) TF Hub URL or S3 location of the BERT module
    :return: a bert_tokenization.FullTokenizer built from the module's
        vocabulary file and casing setting
    """
    BertTokenizer = bert_tokenization.FullTokenizer
    bert_layer = hub.KerasLayer(bert_path, trainable=False)
    vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = BertTokenizer(vocabulary_file, to_lower_case)
    return tokenizer


def convert_single_example(tokenizer, example, tag2int, max_seq_length=256):
    """
    Converts a single `InputExample` into a single `InputFeatures`.

    :param tokenizer: tokenizer created by create_tokenizer_from_hub_module
    :param example: example created by convert_text_to_examples
    :param tag2int: (dict) mapping from tag strings to their integer ids
    :param max_seq_length: (int) length of an input example (input size of the BERT model)
    :return: input_ids, input_mask, segment_ids (all three are inputs to the BERT
        model) and label_ids (the true labels, useful for testing). At inference
        time we create placeholder label_ids (e.g. '-PAD-') that are not reused.
    """
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label_ids = [0] * max_seq_length
        return input_ids, input_mask, segment_ids, label_ids

    tokens_a = example.text_a
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    # Token map will be an int -> int mapping between the `orig_tokens` index and
    # the `bert_tokens` index.
# bert_tokens == ["[CLS]", "john", "johan", "##son", "'", "s", "house", "[SEP]"] # orig_to_tok_map == [1, 2, 4, 6] orig_to_tok_map = [] tokens = [] segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) orig_to_tok_map.append(len(tokens)-1) for token in tokens_a: orig_to_tok_map.append(len(tokens)) tokens.extend(tokenizer.tokenize(token)) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) orig_to_tok_map.append(len(tokens)-1) input_ids = tokenizer.convert_tokens_to_ids([tokens[i] for i in orig_to_tok_map]) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) label_ids = [] labels = example.label label_ids.append(0) label_ids.extend([tag2int[label] for label in labels]) label_ids.append(0) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) label_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length assert len(label_ids) == max_seq_length return input_ids, input_mask, segment_ids, label_ids def convert_examples_to_features(tokenizer, examples, tag2int, max_seq_length=256): """ Convert a set of `InputExample`s to a list of `InputFeatures`. :return: numpy arrays """ input_ids, input_masks, segment_ids, labels = [], [], [], [] for example in examples: input_id, input_mask, segment_id, label = convert_single_example( tokenizer, example, tag2int, max_seq_length ) input_ids.append(input_id) input_masks.append(input_mask) segment_ids.append(segment_id) labels.append(label) return ( np.array(input_ids), np.array(input_masks), np.array(segment_ids), np.array(labels), ) def convert_text_to_examples(texts, labels): """ Create InputExamples :param texts: (list of lists of str) list of sentences where each sentence is a list of words :param labels: (list of lists of str) list of lists of words/subwords labels :return: list of InputExample objects """ InputExamples = [] for text, label in zip(texts, labels): InputExamples.append( InputExample(guid=None, text_a=text, text_b=None, label=label) ) return InputExamples