# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Helper functions used mainly in train.py


def create_sentences_out_of_dataframe(data):
    """
    Create sentences out of a DataFrame of tagged data containing the columns 'Sentence #', 'Word' and 'Tag'.

    :param data: (pandas DataFrame) one row per word; 'Sentence #' is filled on the first word of each
        sentence and is NaN on the following rows, 'Word' holds the token and 'Tag' its label
    :return: (list of lists of tuples) tagged sentences, each sentence being a list of (word, tag) tuples
    """
    sentence_data = list(zip(data['Sentence #'], data['Word'], data['Tag']))
    tagged_sentences = []
    tag_sent = []
    for line in sentence_data:
        if line[0] == line[0]:  # 'Sentence #' is not NaN: this word starts a new sentence
            if tag_sent:  # flush the previous sentence (every sentence but the first)
                tagged_sentences.append(tag_sent)
                tag_sent = []
            tag_sent.append((line[1], line[2]))
        else:  # 'Sentence #' is NaN (NaN != NaN): the word continues the current sentence
            tag_sent.append((line[1], line[2]))
    if tag_sent:  # append the last sentence once all rows are consumed
        tagged_sentences.append(tag_sent)
    return tagged_sentences


# Some useful functions
def tag_sequence(sentences):
    """Extract the tag sequence of each tagged sentence."""
    return [[t for w, t in sentence] for sentence in sentences]


def text_sequence(sentences):
    """Extract the word sequence of each tagged sentence."""
    return [[w for w, t in sentence] for sentence in sentences]


def from_iob_to_io(sentences):
    """
    Transform the IOB tags in sentences (output of create_sentences_out_of_dataframe) into IO tags.

    :param sentences: (list of lists of tuples) sentences as (word, tag) tuples with IOB tags
    :return: (list of lists of tuples) the same sentences with the 'B-'/'I-' prefixes stripped from the tags
    """
    clean_sentences = []
    for desc in sentences:
        sublist = []
        for word, tag in desc:
            # Strip the IOB prefix (upper or lower case) so that only the entity type remains
            for prefix in ('B-', 'I-', 'b-', 'i-'):
                if prefix in tag:
                    tag = tag.replace(prefix, '')
                    break
            sublist.append((word, tag))
        clean_sentences.append(sublist)
    return clean_sentences


def split(sentences, max_len):
    """
    Split sentences (as a list of lists of tuples) into a list of lists of length max_len or less.
    """
    new = []
    for data in sentences:
        new.append([data[x:x + max_len] for x in range(0, len(data), max_len)])
    new = [val for sublist in new for val in sublist]
    return new


def split_and_duplicate_index(sentences, max_len):
    """
    Split sentences (as a list of lists of tuples) into a list of lists of length max_len or less,
    and keep track of each chunk's original sentence index (useful for inference).
    """
    new = []
    index = []
    for i, data in enumerate(sentences):
        new.append([data[x:x + max_len] for x in range(0, len(data), max_len)])
        index.append([i for x in range(0, len(data), max_len)])
    new = [val for sublist in new for val in sublist]
    index = [val for sublist in index for val in sublist]
    return new, index
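

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of train.py): shows the expected input layout
# and how the helpers above chain together. The column names 'Sentence #',
# 'Word' and 'Tag' are the ones read by create_sentences_out_of_dataframe;
# the toy rows and the chunk length of 2 are invented here for illustration
# only, and pandas is assumed available since the parser takes a DataFrame.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import pandas as pd

    demo = pd.DataFrame(
        {
            # 'Sentence #' is filled only on the first word of each sentence;
            # the remaining rows carry NaN, which is what the parser expects
            "Sentence #": ["Sentence: 1", float("nan"), float("nan"), "Sentence: 2", float("nan")],
            "Word": ["Paris", "is", "nice", "Hello", "world"],
            "Tag": ["B-geo", "O", "O", "O", "O"],
        }
    )

    sentences = create_sentences_out_of_dataframe(demo)
    # [[('Paris', 'B-geo'), ('is', 'O'), ('nice', 'O')], [('Hello', 'O'), ('world', 'O')]]

    io_sentences = from_iob_to_io(sentences)
    # 'B-geo' becomes 'geo'; tags without an IOB prefix are unchanged

    chunks, index = split_and_duplicate_index(io_sentences, 2)
    print(text_sequence(chunks))  # [['Paris', 'is'], ['nice'], ['Hello', 'world']]
    print(tag_sequence(chunks))   # [['geo', 'O'], ['O'], ['O', 'O']]
    print(index)                  # [0, 0, 1]: maps each chunk back to its original sentence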