# Prepare your data for training

In this lab you will use a SageMaker Processing job to convert your raw data into a set of train, test, and validation datasets that can be used to train a model.

In [None]:
!pip install "sagemaker>=2.123.0"

In [None]:
import sagemaker
import boto3
import numpy as np 
import pandas as pd 
import os 
import time
from sagemaker import get_execution_role
from sagemaker.processing import ProcessingInput, ProcessingOutput

# SageMaker session and the default S3 bucket associated with it
session = sagemaker.Session()
default_bucket = session.default_bucket()
bucket_prefix = "mlops-workshop/"

# Region configured for the default boto3 session
region = boto3.Session().region_name

# Execution role that SageMaker jobs launched from this notebook will use
role = get_execution_role()

In [None]:
# Region-bound boto3 session, plus a SageMaker client created from it
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client("sagemaker", region_name=region)

In [None]:
# Create the folder that will hold the processing script. Pure Python instead
# of a shell call, so it does not depend on a POSIX `mkdir`; exist_ok=True
# keeps the cell safe to re-run.
os.makedirs("scripts", exist_ok=True)

In [None]:
%%writefile ./scripts/preprocessing.py
import argparse
import os

import numpy as np
import pandas as pd

def process(df, random_state=None):
    """Turn the raw marketing frame into a shuffled, model-ready feature frame.

    Steps, in order:
      1. add the indicator columns ``no_previous_contact`` (``pdays == 999``)
         and ``not_working`` (``job`` is student, retired or unemployed);
      2. drop the macro-economic columns listed in ``toremove`` (columns that
         are absent are skipped);
      3. one-hot encode the remaining non-numeric columns;
      4. move the ``y_yes`` target to the first column and drop ``y_no``;
      5. shuffle the rows and reset the index.

    Args:
        df: Raw DataFrame. Must contain the ``pdays``, ``job`` and ``y``
            columns, and ``y`` must take both the values ``yes`` and ``no``.
            The frame passed in is left unmodified.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            shuffle unseeded.

    Returns:
        A new DataFrame whose first column is ``y_yes``.

    Raises:
        KeyError: If a required column (or the ``y_yes`` / ``y_no`` dummy
            column) is missing.
    """
    # Work on a copy so the new indicator columns do not leak into the
    # caller's DataFrame.
    df = df.copy()

    # Add two new indicators
    df["no_previous_contact"] = (df["pdays"] == 999).astype(int)
    df["not_working"] = df["job"].isin(["student", "retired", "unemployed"]).astype(int)

    # Keeping only columns that we need
    toremove = ["emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
    df = df.drop(columns=toremove, errors="ignore")

    # One hot encode. The dtype is pinned so the dummies are written to CSV as
    # 0/1 on every pandas version (newer releases default to bool, which
    # would serialise as True/False).
    df = pd.get_dummies(df, dtype=np.uint8)

    # Target first, then the features
    df = pd.concat([df["y_yes"], df.drop(["y_no", "y_yes"], axis=1)], axis=1)

    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

if __name__ == "__main__":
    # Script entry point: read the raw CSV from <input-path>/input, run
    # process() on it and write headerless train/validation/test CSVs to
    # <input-path>/train, <input-path>/validation and <input-path>/test.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-path", type=str, default="/opt/ml/processing")
    # parse_known_args: ignore any extra arguments passed to the script
    args, _ = parser.parse_known_args()

    base_dir = args.input_path

    df = pd.read_csv(
        f"{base_dir}/input/bank-additional-full.csv",
        header=0
    )

    # Call the helper method
    df = process(df)

    # 70% / 15% / 15% positional split of the already-shuffled frame. Plain
    # iloc slices are used instead of np.split on a DataFrame, which raises a
    # deprecation warning on recent pandas releases.
    train_end = int(.7 * len(df))
    validation_end = int(.85 * len(df))
    splits = {
        "train": df.iloc[:train_end],
        "validation": df.iloc[train_end:validation_end],
        "test": df.iloc[validation_end:],
    }

    for name, subset in splits.items():
        # Create the output folder if needed, so the script also works when
        # --input-path points at a directory without these subfolders.
        os.makedirs(f"{base_dir}/{name}", exist_ok=True)
        subset.to_csv(f"{base_dir}/{name}/{name}.csv", header=False, index=False)

In [None]:
# Stage the raw CSV under the default bucket so the processing job can read it
local_data_path = "bank-additional-full.csv"
base_uri = f"s3://{default_bucket}/marketing"

# The value returned by upload() is what the processing job later takes as
# its input source.
input_data_uri = sagemaker.s3.S3Uploader.upload(
    desired_s3_uri=base_uri,
    local_path=local_data_path,
)

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Version of the scikit-learn processing container to run the script in
framework_version = "0.23-1"

# Single ml.m5.xlarge instance, running under the notebook's execution role
sklearn_processor = SKLearnProcessor(
    role=role,
    sagemaker_session=session,
    framework_version=framework_version,
    base_job_name="sklearn-marketing-process",
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

In [None]:
# Raw data: the uploaded CSV is made available under /opt/ml/processing/input,
# which is where the preprocessing script looks for it by default.
processing_inputs = [
    ProcessingInput(source=input_data_uri, destination="/opt/ml/processing/input"),
]

# One named output per split, matching the folders the script writes to
processing_outputs = [
    ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
    ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
    ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
]

sklearn_processor.run(
    code="scripts/preprocessing.py",
    inputs=processing_inputs,
    outputs=processing_outputs,
)

In [None]:
# Describe the most recent job launched through this processor
latest_processing_job = sklearn_processor.jobs[-1]
preprocessing_job_description = latest_processing_job.describe()

In [None]:
# Map every named output of the processing job to its S3 location. Looking
# the outputs up by OutputName (the names given to ProcessingOutput) instead
# of by list position keeps the three URIs correct even if the outputs are
# reordered or a new one is added.
output_uris = {
    output["OutputName"]: output["S3Output"]["S3Uri"]
    for output in preprocessing_job_description["ProcessingOutputConfig"]["Outputs"]
}

train_uri = output_uris["train"]
val_uri = output_uris["validation"]
test_uri = output_uris["test"]

In [None]:
# Persist the three output URIs with IPython's %store magic so they can be
# restored in another notebook/kernel with `%store -r`.
# NOTE(review): presumably consumed by the follow-up training notebook — confirm.
%store train_uri
%store val_uri
%store test_uri

### Let's view the processed data

Here we download the training dataset and view the first 10 rows

In [None]:
# Copy train.csv from the processing job's "train" output location to /tmp.
# {train_uri} is interpolated by IPython from the Python variable of that name.
!aws s3 cp {train_uri}/train.csv /tmp/train.csv

In [None]:
# The processing script writes its CSVs without a header row, so load the
# file as headerless (columns are numbered 0..n-1).
train_df = pd.read_csv("/tmp/train.csv", header=None)

In [None]:
# Show the first 10 rows, as announced in the section text (head() alone
# would only display 5).
train_df.head(10)

#### You can now move to the next section of the module `Train a model & track your experiments`

The notebook used in that section is `sagemaker-train.ipynb`