## Classifying news with HuggingFace and PyTorch on Amazon SageMaker

In [None]:
# make sure the Amazon SageMaker SDK is updated
!pip install "sagemaker" --upgrade

In [None]:
# import a few libraries that will be needed
import sagemaker
from sagemaker.huggingface import HuggingFace
import boto3
import pandas as pd
import os, time, tarfile

In [None]:
# SageMaker session plus the default bucket and key prefix used for the dataset
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
prefix = "news-hf"

# execution role handed to the training jobs and models, and the current region
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

This example uses the AG News dataset cited in the paper [Character-level Convolutional Networks for Text Classification](https://arxiv.org/abs/1509.01626) by Xiang Zhang and [Yann LeCun](https://twitter.com/ylecun). This dataset is available on the [AWS Open Data Registry](https://registry.opendata.aws/fast-ai-nlp/).

In [None]:
# download and extract our custom dataset
!wget -nc https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz
tf = tarfile.open('ag_news_csv.tgz')
tf.extractall()
!rm -fr ag_news_csv.tgz

In [None]:
# The AG News CSV files have no header row. header=None keeps the first record as data
# (letting pandas infer a header would silently drop one row from each file), and
# names= supplies the column names.
AG_NEWS_COLUMNS = ['label', 'title', 'description']

# read training data and add a header
train = pd.read_csv('./ag_news_csv/train.csv', header=None, names=AG_NEWS_COLUMNS)

# read testing data and add a header
test = pd.read_csv('./ag_news_csv/test.csv', header=None, names=AG_NEWS_COLUMNS)

# write the files with header
train.to_csv("ag_news_csv/ag-train.csv", index=False)
test.to_csv("ag_news_csv/ag-test.csv", index=False)

In [None]:
# take a look at the training data: the bare expression on the last line
# is what the notebook displays as this cell's output
train

In [None]:
# Upload the training and testing CSV files to Amazon S3,
# under <prefix>/train and <prefix>/test in the session's default bucket
inputs_train = sagemaker_session.upload_data(
    "ag_news_csv/ag-train.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/train",
)
inputs_test = sagemaker_session.upload_data(
    "ag_news_csv/ag-test.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/test",
)

# show what each upload returned, one per line
print(inputs_train, inputs_test, sep="\n")

In [None]:
# keep in mind the classes used in this dataset
# (classes.txt has no header row; its single column is named 'label' here)
classes = pd.read_csv('./ag_news_csv/classes.txt', header=None, names=['label'])
classes

----

## BERT large uncased
https://huggingface.co/bert-large-uncased
#### Fine-tuning

In [None]:
# Arguments handed to the fine-tuning script for BERT large uncased.
# The train/validation paths point at the CSV files inside the "train" and "test" input directories.
hyperparameters = {
    "model_name_or_path": "bert-large-uncased",
    "output_dir": "/opt/ml/model",
    "train_file": "/opt/ml/input/data/train/ag-train.csv",
    "validation_file": "/opt/ml/input/data/test/ag-test.csv",
    "do_train": True,
    "do_eval": True,
    "num_train_epochs": 1,
    "save_total_limit": 1,
    # add your remaining hyperparameters
    # more info here https://github.com/huggingface/transformers/tree/v4.6.1/examples/pytorch/text-classification
}

In [None]:
# git configuration to download our fine-tuning script:
# the Hugging Face transformers repository at the v4.6.1 tag
git_config = {
    "repo": "https://github.com/huggingface/transformers.git",
    "branch": "v4.6.1",
}

In [None]:
# creates the Hugging Face estimator used to fine-tune BERT
huggingface_estimator_bert = HuggingFace(
    # script to run: located in `source_dir` of the repository described by `git_config`
    entry_point='run_glue.py',
    source_dir='./examples/pytorch/text-classification',
    git_config=git_config,
    # training infrastructure and permissions
    instance_type='ml.g4dn.16xlarge',
    instance_count=1,
    role=role,
    # framework versions
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version='py36',
    # script arguments and job options
    hyperparameters=hyperparameters,
    disable_profiler=True,
)

In [None]:
# S3 prefixes holding the uploaded training and testing files
training_path = f's3://{bucket}/{prefix}/train'
testing_path = f's3://{bucket}/{prefix}/test'

# starting the train job; the channel names "train" and "test" match the directory
# names used by the train_file / validation_file hyperparameters
huggingface_estimator_bert.fit(
    {"train": training_path, "test": testing_path},
    wait=False,
)

In [None]:
# check the status of the training job
client = boto3.client("sagemaker")
training_job_name = huggingface_estimator_bert.latest_training_job.name
describe_response = client.describe_training_job(TrainingJobName=training_job_name)

print('Time - JobStatus - SecondaryStatus')
print('------------------------------')
print(time.strftime("%H:%M", time.localtime()), '-', describe_response['TrainingJobStatus'] + " - " + describe_response['SecondaryStatus'])

# uncomment this for monitoring the job status...
# (polls every 30 seconds until the job is Failed, Completed or Stopped; the pause is
#  written as time.sleep because only the `time` module is imported, not `sleep` itself)
#job_run_status = describe_response['TrainingJobStatus']
#while job_run_status not in ('Failed', 'Completed', 'Stopped'):
#    describe_response = client.describe_training_job(TrainingJobName=training_job_name)
#    job_run_status = describe_response['TrainingJobStatus']
#    print(time.strftime("%H:%M", time.localtime()), '-', describe_response['TrainingJobStatus'] + " - " + describe_response['SecondaryStatus'])
#    time.sleep(30)

**Important:** Make sure the training job is completed before running the "Inference" section below.

You can verify this by running the previous cell and getting JobStatus = "Completed".

#### Inference

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# create Hugging Face Model Class from the artifact of the BERT fine-tuning job
huggingface_model = HuggingFaceModel(
    model_data=huggingface_estimator_bert.model_data,
    role=role,
    # task passed to the serving container through its environment
    env={'HF_TASK': 'text-classification'},
    # same framework versions as the ones used for fine-tuning
    transformers_version="4.6.1",
    pytorch_version="1.7.1",
    py_version='py36',
)

In [None]:
# create SageMaker Endpoint with the HF model (a single ml.g4dn.xlarge instance);
# the returned predictor is used for the requests in the following cells
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

In [None]:
# example request (you always need to define "inputs"). You can try with your own news' titles here...
data = {
    "inputs": "Great match from Real Madrid tonight."
    # other titles to try (keep exactly one "inputs" entry active):
    #"inputs": "Armed robbery last night in the city."
    #"inputs": "Stocks went up 30% after yesterday's market closure."
    #"inputs": "There is a new chipset that outperforms current GPUs."
}

response = predictor.predict(data)

# the last character of the returned label is used as the row index into `classes`
predicted_label = response[0]['label']
print(response, classes['label'][int(predicted_label[-1:])])

In [None]:
# let us run a quick performance test
# The same constant drives the loop and the division, so the average is taken over
# exactly the number of requests that were sent (range(1, 1000) would send only 999).
n_requests_BERT = 1000
sum_BERT = 0
for _ in range(n_requests_BERT):
    a_time = time.time()
    result_BERT = predictor.predict(data)
    b_time = time.time()
    # accumulate the wall-clock time of this single request
    sum_BERT = sum_BERT + (b_time - a_time)
avg_BERT = sum_BERT / n_requests_BERT
print('BERT average inference time: {:.3f}'.format(avg_BERT), 'secs,')

-----

## Amazon's BORT
https://huggingface.co/amazon/bort
#### Fine-tuning

In [None]:
# Arguments handed to the fine-tuning script for Amazon's BORT.
# The train/validation paths point at the CSV files inside the "train" and "test" input directories.
hyperparameters_bort = {
    "model_name_or_path": "amazon/bort",
    "output_dir": "/opt/ml/model",
    "train_file": "/opt/ml/input/data/train/ag-train.csv",
    "validation_file": "/opt/ml/input/data/test/ag-test.csv",
    "do_train": True,
    "do_eval": True,
    "num_train_epochs": 1,
    "save_total_limit": 1,
    # add your remaining hyperparameters
    # more info here https://github.com/huggingface/transformers/tree/v4.6.1/examples/pytorch/text-classification
}

In [None]:
# git configuration to download our fine-tuning script:
# the Hugging Face transformers repository at the v4.6.1 tag
git_config = {
    "repo": "https://github.com/huggingface/transformers.git",
    "branch": "v4.6.1",
}

In [None]:
# creates the Hugging Face estimator used to fine-tune BORT
huggingface_estimator_bort = HuggingFace(
    # script to run: located in `source_dir` of the repository described by `git_config`
    entry_point='run_glue.py',
    source_dir='./examples/pytorch/text-classification',
    git_config=git_config,
    # training infrastructure and permissions
    instance_type='ml.g4dn.12xlarge',
    instance_count=1,
    role=role,
    # framework versions
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version='py36',
    # script arguments and job options
    hyperparameters=hyperparameters_bort,
    disable_profiler=True,
)

In [None]:
# S3 prefixes holding the uploaded training and testing files
training_path = f's3://{bucket}/{prefix}/train'
testing_path = f's3://{bucket}/{prefix}/test'

# starting the train job; the channel names "train" and "test" match the directory
# names used by the train_file / validation_file hyperparameters
huggingface_estimator_bort.fit(
    {"train": training_path, "test": testing_path},
    wait=False,
)

In [None]:
# check the status of the training job
client = boto3.client("sagemaker")
training_job_name = huggingface_estimator_bort.latest_training_job.name
describe_response = client.describe_training_job(TrainingJobName=training_job_name)

print('Time - JobStatus - SecondaryStatus')
print('------------------------------')
print(time.strftime("%H:%M", time.localtime()), '-', describe_response['TrainingJobStatus'] + " - " + describe_response['SecondaryStatus'])

# uncomment this for monitoring the job status...
# (polls every 30 seconds until the job is Failed, Completed or Stopped; the pause is
#  written as time.sleep because only the `time` module is imported, not `sleep` itself)
#job_run_status = describe_response['TrainingJobStatus']
#while job_run_status not in ('Failed', 'Completed', 'Stopped'):
#    describe_response = client.describe_training_job(TrainingJobName=training_job_name)
#    job_run_status = describe_response['TrainingJobStatus']
#    print(time.strftime("%H:%M", time.localtime()), '-', describe_response['TrainingJobStatus'] + " - " + describe_response['SecondaryStatus'])
#    time.sleep(30)

**Important:** Make sure the training job is completed before running the "Inference" section below.

You can verify this by running the previous cell and getting JobStatus = "Completed".

#### Inference

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# create Hugging Face Model Class from the artifact of the BORT fine-tuning job
huggingface_model_bort = HuggingFaceModel(
    model_data=huggingface_estimator_bort.model_data,
    role=role,
    # task passed to the serving container through its environment
    env={'HF_TASK': 'text-classification'},
    # same framework versions as the ones used for fine-tuning
    transformers_version="4.6.1",
    pytorch_version="1.7.1",
    py_version='py36',
)

In [None]:
# create SageMaker Endpoint with the HF model (a single ml.g4dn.xlarge instance);
# the returned predictor is used for the requests in the following cells
predictor_bort = huggingface_model_bort.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

In [None]:
# example request (you always need to define "inputs"). You can try with your own news' titles here...
data = {
    "inputs": "Stocks went up 30% after yesterday's market closure."
    # another title to try (keep exactly one "inputs" entry active):
    #"inputs": "There is a new chipset that outperforms current GPUs."
}

response = predictor_bort.predict(data)

# the last character of the returned label is used as the row index into `classes`
predicted_label = response[0]['label']
print(response, classes['label'][int(predicted_label[-1:])])

In [None]:
# let us run a quick performance test
# The same constant drives the loop and the division, so the average is taken over
# exactly the number of requests that were sent (range(1, 1000) would send only 999).
n_requests_BORT = 1000
sum_BORT = 0
for _ in range(n_requests_BORT):
    a_time = time.time()
    result_BORT = predictor_bort.predict(data)
    b_time = time.time()
    # accumulate the wall-clock time of this single request
    sum_BORT = sum_BORT + (b_time - a_time)
avg_BORT = sum_BORT / n_requests_BORT
print('BORT average inference time: {:.3f}'.format(avg_BORT), 'secs,')

-----

#### Clean-up

In [None]:
# uncomment for cleaning-up endpoints
# (the BERT endpoint's predictor was assigned to `predictor`, the BORT one to `predictor_bort`;
#  each predictor deletes its own endpoint)
#predictor.delete_endpoint()
#predictor_bort.delete_endpoint()