# Import libraries and set the role and S3 buckets

In [None]:
%pip install nltk

In [None]:
import boto3
import sagemaker
import pandas as pd
import matplotlib.pyplot as plt
import sagemaker, boto3, json
from sagemaker import get_execution_role
import nltk
# Tokenizer data used by nltk.word_tokenize. Recent NLTK releases look up the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both to
# work with whichever version `%pip install nltk` resolved to. On older NLTK
# versions the extra download simply reports that the package is unknown.
nltk.download('punkt')
nltk.download('punkt_tab')
%matplotlib inline

In [None]:
def _s3_uri(bucket, *key_parts):
    """Join *bucket* and the non-empty *key_parts* into an ``s3://`` URI.

    Leading/trailing slashes are stripped from every part and empty parts are
    skipped, so an empty prefix does not produce ``s3://bucket//name`` (which
    addresses a different key than ``s3://bucket/name``).
    """
    segments = (part.strip('/') for part in key_parts)
    key = '/'.join(segment for segment in segments if segment)
    return f"s3://{bucket}/{key}"


# Execution role, SageMaker session and region used throughout the notebook
aws_role = get_execution_role()
sess = sagemaker.Session()
# Look the region up once; both names are kept because both were defined before
region = boto3.Session().region_name
aws_region = region

# Specify S3 bucket and prefix where you have uploaded email_dataset.csv
# (the prefix may be left empty when the file sits at the bucket root)
training_data_bucket = ""
training_data_prefix = ""

training_dataset_s3_path = _s3_uri(training_data_bucket, training_data_prefix, "email_dataset.csv")

# Output goes to the session's default bucket, under an optional prefix
output_bucket = sess.default_bucket()
output_prefix = ""

s3_output_location = _s3_uri(output_bucket, output_prefix, "output")

# Load Data

In [None]:
# Read the email dataset CSV from the S3 path configured above into a DataFrame
df = pd.read_csv(training_dataset_s3_path)
# Preview the first rows to confirm the file loaded as expected
df.head()


In [None]:
# Check if dataset is balanced or not: count the rows for each 'Category' value
df['Category'].value_counts()

# Prepare the Data

In [None]:
def _encode_category(label):
    """Return 1 for a spam label and 0 for anything else.

    The comparison ignores case and surrounding whitespace, and an already
    encoded ``1`` is kept as 1 so re-running this cell does not turn every
    row into ham. Any unrecognised label still falls back to 0, as before.
    """
    return 1 if str(label).strip().upper() in ('SPAM', '1') else 0


# Replace SPAM with 1 and HAM with 0
df['Category'] = df['Category'].map(_encode_category)
df.head()

In [None]:
def tokenize(message):
    """Return *message* as a lowercase, space-separated string of NLTK tokens.

    The input is converted to ``str``, commas and double quotation marks are
    removed, the text is lowercased, split with ``nltk.word_tokenize`` and the
    resulting tokens are joined back together with single spaces.
    """
    cleaned = str(message).replace(',', '').replace('"', '').lower()
    tokens = nltk.word_tokenize(cleaned)
    return ' '.join(str(tok) for tok in tokens)
 
def prepare_data(df):
 df['Category'] = df['Category'].map(lambda category : '__label__{}'.format(str(category).replace('__label__', '')))
 df['Message'] = df['Message'].map(lambda message : tokenize(message)) 
 return df

# Keep only the label and text columns, renumber the rows from zero and
# convert them to the BlazingText layout in a single step
df_final = prepare_data(df[['Category', 'Message']].reset_index(drop=True))
df_final.head()

In [None]:
# Split data into train (90%) and validation (10%), stratified on the label so
# both parts keep the same class ratio
from sklearn.model_selection import train_test_split
df_train, df_validation = train_test_split(
    df_final,
    test_size=0.10,
    stratify=df_final['Category'],
    # Fixed seed so re-running the notebook yields the same split and
    # therefore comparable training/validation metrics
    random_state=42,
)

In [None]:
def _write_blazingtext_file(frame, path):
    """Write one ``<label> <tokenized message>`` line per row of *frame* to *path*.

    The lines are written by hand rather than with ``DataFrame.to_csv(sep=' ')``:
    the CSV writer wraps every field that contains the separator in double
    quotes, so each multi-word message would be stored as ``"..."`` and the
    quote characters would end up glued to the first and last token.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        for label, message in zip(frame['Category'], frame['Message']):
            handle.write(f"{label} {message}\n")


# Write the transformed data to local files, then upload them to the S3 bucket
train_path = './train.csv'
_write_blazingtext_file(df_train, train_path)

validation_path = './validation.csv'
_write_blazingtext_file(df_validation, validation_path)

# Specify S3 bucket prefix
# NOTE(review): 'trainig' looks like a typo for 'training'; it is kept unchanged
# so the objects are uploaded to the same keys as before.
train_s3_uri = sess.upload_data(bucket=training_data_bucket, key_prefix='trainig', path=train_path)
validation_s3_uri = sess.upload_data(bucket=training_data_bucket, key_prefix='validation', path=validation_path)

# Train the Model

In [None]:
# Look up the BlazingText container image for the region in use
image_uri = sagemaker.image_uris.retrieve(framework='blazingtext', region=region)

In [None]:
# Training job definition: one ml.m5.large instance with a 30 GB volume and a
# two-hour (7200 s) runtime limit, profiler disabled.
estimator = sagemaker.estimator.Estimator(
    image_uri=image_uri,
    role=aws_role,
    instance_count=1,
    instance_type='ml.m5.large',
    volume_size=30,
    max_run=7200,
    # Store the job output under the location configured at the top of the
    # notebook; s3_output_location was defined there but never used.
    output_path=s3_output_location,
    disable_profiler=True,
    sagemaker_session=sess,
)

In [None]:
# Hyperparameters for the BlazingText training job (supervised mode)
estimator.set_hyperparameters(**{
    'mode': 'supervised',
    'epochs': 10,
    'learning_rate': 0.01,
    'min_count': 2,
    'vector_dim': 300,
    'word_ngrams': 3,
})

In [None]:
# Both channels share the same settings and differ only in the S3 URI, so
# build them from one template: plain-text files under an S3 prefix, fully
# replicated to the training instance.
train_data, validation_data = (
    sagemaker.inputs.TrainingInput(
        channel_uri,
        distribution='FullyReplicated',
        content_type='text/plain',
        s3_data_type='S3Prefix',
    )
    for channel_uri in (train_s3_uri, validation_s3_uri)
)

# Channel names passed to estimator.fit
data_channels = {'train': train_data, 'validation': validation_data}

In [None]:
# Start the training job on both channels and block until it has finished
estimator.fit(data_channels, wait=True)

In [None]:
# Show the metrics recorded for the finished training job as a DataFrame
# (per the original note: accuracy on the train and validation datasets)
estimator.training_job_analytics.dataframe()

# Deploy the Model

In [None]:
# Host the trained model on one ml.m5.large instance; requests are serialized
# to JSON and responses are parsed from JSON
text_classifier = estimator.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
    deserializer=sagemaker.deserializers.JSONDeserializer(),
    serializer=sagemaker.serializers.JSONSerializer(),
)
# Blank line first, then the endpoint name
print('', 'Endpoint name: {}'.format(text_classifier.endpoint_name), sep='\n')

# Test the Model

In [None]:
# Sample messages for a quick check of the deployed endpoint
messages = [
    # Spam
    'Click on below link, provide your details and win this award',
    'Best summer deal here',
    # Ham
    'See you in the office on Friday.',
]

# Preprocess exactly as the training data was (tokenize() lowercases and drops
# commas and double quotes); otherwise the model receives capitalised and
# punctuated tokens it never saw during training.
tokenized_message = [tokenize(message) for message in messages]
payload = {"instances": tokenized_message}
print(payload)

In [None]:
# Send the payload to the endpoint and print SPAM/HAM for every message
predictions = text_classifier.predict(data=payload)
label_prefix = '__label__'
for prediction in predictions:
    # The first entry of 'label' is the predicted class, e.g. '__label__1'
    predicted_class = prediction['label'][0]
    # Remove the prefix explicitly: str.lstrip('__label__') strips any leading
    # run of the characters '_', 'l', 'a', 'b', 'e' rather than the prefix.
    if predicted_class.startswith(label_prefix):
        predicted_class = predicted_class[len(label_prefix):]
    print('SPAM' if predicted_class == '1' else 'HAM')
 

# Delete the Model Endpoint

In [None]:
# Tear down the endpoint created by estimator.deploy() once testing is done
# NOTE(review): only the endpoint is removed here; confirm whether the model
# resource should be cleaned up as well.
text_classifier.delete_endpoint()