In [None]:
import pandas as pd
import sagemaker
# Report the library versions in use so a run can be tied to an environment.
for library in (pd, sagemaker):
    print(library.__version__)

# Do not truncate long text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# S3 key prefix under which the raw dataset is uploaded.
prefix = "dataset/appflow-sagemakerdemo"

# SageMaker session and its default bucket.
session = sagemaker.Session()
bucket = session.default_bucket()

# IAM role the processing job will run under.
role = sagemaker.get_execution_role()

In [None]:
# Upload the local ticket dataset to S3; the returned value is passed to the
# processing job as its input source.
local_dataset_path = 'dataset/all_tickets.csv'
input_data = session.upload_data(
    path=local_dataset_path,
    key_prefix=prefix,
)

# Processing script

In [None]:
%%writefile preprocessing.py
import argparse, os, subprocess, sys
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split


def install(package):
    """Install ``package`` with pip into the interpreter running this script.

    ``sys.executable -m pip`` is used so the package is installed for the same
    Python that will import it.

    Args:
        package: Requirement string handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # check_call (rather than call) makes a failed install stop the script
    # here instead of surfacing later as an unrelated ImportError.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

def remove_non_alphanumeric(row):
    """Return the tokens of ``row`` that consist solely of letters.

    NOTE: despite the name, the check is ``isalpha()``, so tokens containing
    digits (e.g. ``"abc123"`` or ``"42"``) are dropped as well as punctuation.

    Args:
        row: Iterable of string tokens.

    Returns:
        A new list with the purely alphabetic tokens, in their original order.
    """
    alphabetic_tokens = []
    for token in row:
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens



def category_label_map(num_categories=14):
    """Map each numeric category id to its ``__label__CategoryN__`` string.

    Generated rather than hand-written so every id gets its own label (the
    hand-written table mapped both 12 and 13 to ``__label__Category12__``).

    Args:
        num_categories: Number of consecutive category ids, starting at 0.

    Returns:
        dict of ``int -> str`` covering ``0 .. num_categories - 1``.
    """
    return {
        category: '__label__Category{}__'.format(category)
        for category in range(num_categories)
    }


def drop_stop_words(tokens, stop_words):
    """Lowercase ``tokens`` and drop every token contained in ``stop_words``.

    Lowercasing happens before the membership test so that capitalised stop
    words ("The", "And", ...) are removed too.

    Args:
        tokens: Iterable of string tokens.
        stop_words: Container of lowercase stop words (a set is recommended).

    Returns:
        List of lowercase tokens that are not stop words, in original order.
    """
    lowered = (token.lower() for token in tokens)
    return [token for token in lowered if token not in stop_words]


if __name__ == '__main__':
    # Install nltk at runtime, then import it and fetch the corpora it needs.
    install('nltk')
    import nltk
    from nltk.corpus import stopwords
    nltk.download('punkt')
    # The nltk version is unpinned: newer releases load the tokenizer from
    # 'punkt_tab' instead of 'punkt', so fetch both.
    nltk.download('punkt_tab')
    nltk.download('stopwords')

    parser = argparse.ArgumentParser()
    # Required: without it the input path cannot be built.
    parser.add_argument('--filename', type=str, required=True)
    parser.add_argument('--num-cases', type=int, default=20000)
    parser.add_argument('--split-ratio', type=float, default=0.1)
    # Seed for the train/validation split so reruns give the same partition.
    parser.add_argument('--seed', type=int, default=42)

    args, _ = parser.parse_known_args()

    print("Recieved arguments {}".format(args))

    filename = args.filename
    num_cases = args.num_cases
    split_ratio = args.split_ratio

    # Load dataset
    input_data_path = os.path.join('/opt/ml/processing/input', filename)
    print("Reading input data from {}".format(input_data_path))

    data = pd.read_csv(input_data_path)

    # Remove rows with a missing value in any column
    data = data.dropna()

    if num_cases is not None:
        data = data[:num_cases]

    # Keep only the columns we need; copy so later work never touches a view
    data = data[['category', 'body']].copy()

    # Numeric category -> label string (values missing from the map pass through)
    labels = data['category'].replace(category_label_map())

    # Tokenize the data
    print("Tokenizing the reviews")
    tokens = data['body'].apply(nltk.word_tokenize)

    # Keep purely alphabetic tokens (drops punctuation and digit-bearing tokens)
    tokens = tokens.apply(remove_non_alphanumeric)

    # Lowercase and remove stop words; the set is built once, not once per row
    stop_words = set(stopwords.words('english'))
    tokens = tokens.apply(lambda row: drop_stop_words(row, stop_words))

    # Final frame: label first, then the cleaned text as one string per row
    data = pd.DataFrame(
        {'label': labels, 'email_body': tokens.apply(' '.join)},
        columns=['label', 'email_body'],
    )

    # Split into training and validation sets
    print('Splitting data with ratio {}'.format(split_ratio))
    training, validation = train_test_split(
        data, test_size=split_ratio, random_state=args.seed
    )

    training_output_path = os.path.join('/opt/ml/processing/train', 'training.txt')
    validation_output_path = os.path.join('/opt/ml/processing/validation', 'validation.txt')

    # Make sure the output directories exist before writing into them
    for output_path in (training_output_path, validation_output_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # One "<label> <text>" line per row. UTF-8 is set explicitly because
    # savetxt defaults to latin-1, which cannot encode every alphabetic token.
    print('Saving training data to {}'.format(training_output_path))
    np.savetxt(training_output_path, training.values, fmt='%s', encoding='utf-8')

    print('Saving validation data to {}'.format(validation_output_path))
    np.savetxt(validation_output_path, validation.values, fmt='%s', encoding='utf-8')


In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [None]:
# scikit-learn processor that runs on a single instance under the role
# resolved earlier in the notebook.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version='0.20.0',
    instance_count=1,
    instance_type='ml.c5.2xlarge',
)

In [None]:
%%time

# Container-side paths below must match the ones hard-coded in preprocessing.py.
processing_inputs = [
    # Our data from S3
    ProcessingInput(source=input_data, destination='/opt/ml/processing/input'),
]
processing_outputs = [
    ProcessingOutput(output_name="training", source='/opt/ml/processing/train'),
    ProcessingOutput(output_name="validation", source='/opt/ml/processing/validation'),
]
# Command-line arguments forwarded to preprocessing.py.
script_arguments = [
    "--filename", "all_tickets.csv",
    "--num-cases", "35000",
    "--split-ratio", "0.05",
]

sklearn_processor.run(
    code='preprocessing.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=script_arguments,
)

In [None]:
# S3 URI of the first output in the job description (presumably "training",
# which is declared first in the run call).
job_outputs = sklearn_processor.latest_job.describe()['ProcessingOutputConfig']['Outputs']
job_outputs[0]['S3Output']['S3Uri']

In [None]:
# S3 URI of the second output in the job description (presumably "validation",
# which is declared second in the run call).
job_outputs = sklearn_processor.latest_job.describe()['ProcessingOutputConfig']['Outputs']
job_outputs[1]['S3Output']['S3Uri']

In [None]:
### The two S3 URIs printed above (training and validation outputs) will be used in the training notebook