'''
####################################################################################
###### preprocess.py
####################################################################################
# (c) 2022 Amazon Web Services, Inc. or its affiliates. All Rights Reserved.
# This AWS Content is provided subject to the terms of the AWS Customer Agreement available at
# http://aws.amazon.com/agreement or other written agreement between Customer and either
# Amazon Web Services, Inc. or Amazon Web Services EMEA SARL or both.
####################################################################################
# May 2022
# Preprocess fn
# that uses the standard train/test split fn to prepare
# train, test and validation files
####################################################################################
'''
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import utils

# Seed handed to the train/test split so repeated runs yield the same partition.
SEED = 0


def print_df_shape(df: pd.DataFrame, df_name: str = 'data') -> None:
    """Print the shape of *df*, labelled with *df_name*."""
    print(f"Shape of {df_name} is:", df.shape)


def read_input_data(input_data_folder: str) -> pd.DataFrame:
    """Load ``dataset.csv`` from *input_data_folder* and report its shape.

    Args:
        input_data_folder: Directory that contains ``dataset.csv``.

    Returns:
        The CSV contents as a DataFrame.
    """
    input_data_path = os.path.join(input_data_folder, "dataset.csv")
    df = pd.read_csv(input_data_path)
    print_df_shape(df)
    return df


def run_sklearn_train_test_split(df: pd.DataFrame, test_size) -> tuple:
    """Split *df* into train and test sets and print the shape of each.

    Args:
        df: Data to split.
        test_size: Forwarded unchanged to sklearn's ``train_test_split``.

    Returns:
        A ``(train, test)`` pair of DataFrames.
    """
    # Pin the shuffle to the module-level SEED; without it the split is not
    # fixed by anything in this function and can differ between runs.
    train, test = train_test_split(df, test_size=test_size, random_state=SEED)
    print_df_shape(train, df_name='train set')
    print_df_shape(test, df_name='test set')
    return train, test


def get_train_set_metrics(df_train: pd.DataFrame) -> pd.DataFrame:
    """Summarise the training set as a one-row DataFrame.

    Expects *df_train* to have the columns ``'age'`` and ``'class of worker'``.

    Returns:
        A single-row DataFrame with columns ``num_rows``, ``num_feats``
        (the column count of *df_train*), ``avg_age`` (mean of ``'age'``) and
        ``num_classes`` (distinct values in ``'class of worker'``).
    """
    num_rows, num_features = df_train.shape
    avg_age = df_train['age'].mean()
    num_classes = len(df_train['class of worker'].unique())

    metrics = [[num_rows, num_features, avg_age, num_classes]]
    return pd.DataFrame(
        metrics,
        columns=['num_rows', 'num_feats', 'avg_age', 'num_classes'],
    )


def create_output_dirs(output_data_folder: str) -> None:
    """Try to create *output_data_folder*; report failure without raising.

    Creation is delegated to ``utils.mkpath_if_not_exist``. Any exception it
    raises is printed and swallowed, so this function is best-effort.
    """
    try:
        utils.mkpath_if_not_exist(output_data_folder)
        print(f"Successfully created directory {output_data_folder}")
    except Exception as e:
        # if the Processing call already creates these directories
        # (or directory otherwise cannot be created)
        print(e)
        print("Could not create directory")


def save_output_files(df: pd.DataFrame, output_data_folder: str,
                      file_name: str = 'train.csv') -> None:
    """Write *df* as CSV to ``<output_data_folder>/<file_name>``.

    A failure is reported on stdout and swallowed, so one bad write does not
    abort the caller.

    Args:
        df: Data to write.
        output_data_folder: Destination directory.
        file_name: Name of the CSV file inside that directory.
    """
    # Build the path the same way read_input_data does rather than by
    # hard-coding a "/" separator.
    output_path = os.path.join(output_data_folder, file_name)
    try:
        df.to_csv(output_path)
        print(f"Wrote {file_name} files successfully to {output_data_folder}")
    except Exception as e:
        print("Failed to write the files")
        print(e)