# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import os
import logging
import argparse
import json
from itertools import chain

import jsonlines
import numpy as np
import pandas as pd
from pandarallel import pandarallel
import nltk
from sklearn.model_selection import train_test_split

from search_utils import helpers, search_preprocessing

nltk.download('punkt')
nltk.download('wordnet')

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-test-split-ratio', type=float, default=0.2)
    parser.add_argument('--total-nb-of-records', type=int, default=1000)
    args, _ = parser.parse_known_args()

    # Initialise pandarallel so the preprocessing helpers can run across all cores
    pandarallel.initialize(progress_bar=False, use_memory_fs=False)

    logger.info(f'Received arguments {args}')
    input_data_path = os.path.join('/opt/ml/processing/input', 'data.csv')

    logger.info("-------------------Reading and processing data----------------")
    logger.info(f'Reading input data from {input_data_path}')
    data = pd.read_csv(input_data_path, index_col=0)

    textual_columns = ["processed_title"]
    features_columns = ["id", "category_id", "text_information"]
    category_column = "product_category"
    MIN_NB_PRODUCTS_PER_CAT = 10

    data = search_preprocessing.pre_model_data_preprocessing(
        data, textual_columns, features_columns, category_column)

    logger.info("Splitting the data into training and test sets")
    pc_test = args.train_test_split_ratio
    # Stratify on the category so both splits keep the same category distribution
    train_data, test_data, train_cat, test_cat = train_test_split(
        data[features_columns], data[["category_id"]],
        test_size=pc_test, stratify=data[["category_id"]])

    logger.info("-------------------Generating positive and negative pairs----------------")
    limits = {"TOTAL_NB_OF_RECORDS": int(args.total_nb_of_records), "PC_POSITIVE": 0.5}
    (train_sentences_data_negative, train_sentences_data_positive,
     train_negative_indices, train_positive_indices) = \
        search_preprocessing.generate_sentence_pairs(train_data, limits)
    logger.info(f"Number of negative data points : {len(train_sentences_data_negative)}\n"
                f"Number of positive data points : {len(train_sentences_data_positive)}")

    # Scale the pair budget down to the test split's share of the data
    limits["TOTAL_NB_OF_RECORDS"] = int(pc_test * limits["TOTAL_NB_OF_RECORDS"])
    (test_sentences_data_negative, test_sentences_data_positive,
     test_negative_indices, test_positive_indices) = \
        search_preprocessing.generate_sentence_pairs(test_data, limits)
    logger.info(f"Number of negative data points : {len(test_sentences_data_negative)}\n"
                f"Number of positive data points : {len(test_sentences_data_positive)}")

    # Combine positive and negative pairs and shuffle the data in place
    training_records = train_sentences_data_negative + train_sentences_data_positive
    np.random.shuffle(training_records)
    test_records = test_sentences_data_negative + test_sentences_data_positive
    np.random.shuffle(test_records)

    logger.info(f"Training records count : {len(training_records)}")
    logger.info(f"Test records count : {len(test_records)}")

    logger.info("Done preprocessing, performing data leak check...")
    data_leak_from_train_to_test = search_preprocessing.check_data_leak(training_records, test_records)
    logger.info(f"Data leaked from train to test? {data_leak_from_train_to_test}")

    textual_train_data_output_path = os.path.join('/opt/ml/processing/train_textual', 'textual_train_data.jsonl')
    textual_test_data_output_path = os.path.join('/opt/ml/processing/test_textual', 'textual_test_data.jsonl')

    logger.info(f"Saving the textual training data to : {textual_train_data_output_path}")
    with jsonlines.open(textual_train_data_output_path, mode='w') as writer:
        for record in training_records:
            writer.write(record)

    logger.info(f"Saving the textual test data to : {textual_test_data_output_path}")
    with jsonlines.open(textual_test_data_output_path, mode='w') as writer:
        for record in test_records:
            writer.write(record)

    logger.info("-------------------Generating the vocabulary----------------")
    # The raw vocabulary, used for debugging and inspection
    raw_vocabulary_output_path = os.path.join("/opt/ml/processing/raw_vocab", "raw_vocab.json")
    # The final vocabulary, used for transforming text to ids
    vocabulary_output_path = os.path.join("/opt/ml/processing/vocab", "vocab.json")

    # Build the vocabulary over both splits so every token has an id
    data_iter = chain(
        helpers.read_jsonline(textual_train_data_output_path),
        helpers.read_jsonline(textual_test_data_output_path))

    raw_vocab, word_to_id = search_preprocessing.build_vocab_parallel(
        data_iter, num_words=1000000, min_count=1, use_reserved_symbols=False, sort=True)

    logger.info("Generated vocabulary, saving vocabulary...")
    with open(vocabulary_output_path, "w") as write_file:
        json.dump(word_to_id, write_file)
    with open(raw_vocabulary_output_path, "w") as write_file:
        json.dump(raw_vocab, write_file)

    logger.info("-----------------Converting textual data to integers using generated vocabulary-------------------")
    # This tokenizer will be used to process the textual data
    tokenizer = nltk.tokenize.TreebankWordTokenizer()

    numerical_train_data_output_path = os.path.join("/opt/ml/processing/train_numerical", "numerical_train_data.jsonl")
    numerical_test_data_output_path = os.path.join("/opt/ml/processing/test_numerical", "numerical_test_data.jsonl")

    logger.info("Converting training textual records to numerical records using the vocabulary")
    training_textual_records = pd.DataFrame(training_records)
    training_numerical_records = search_preprocessing.transform_textual_records_to_numerical(
        training_textual_records, tokenizer, word_to_id)
    with jsonlines.open(numerical_train_data_output_path, mode='w') as writer:
        for record in training_numerical_records:
            writer.write(record)

    logger.info("Converting test textual records to numerical records using the vocabulary")
    test_textual_records = pd.DataFrame(test_records)
    test_numerical_records = search_preprocessing.transform_textual_records_to_numerical(
        test_textual_records, tokenizer, word_to_id)
    with jsonlines.open(numerical_test_data_output_path, mode='w') as writer:
        for record in test_numerical_records:
            writer.write(record)

    logger.info("Converting to integers done")
    logger.info(f"Saved the numerical training data to : {numerical_train_data_output_path}")
    logger.info(f"Saved the numerical test data to : {numerical_test_data_output_path}")
    logger.info("Done preprocessing")