{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Classifying news with HuggingFace and PyTorch on Amazon SageMaker" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# make sure the Amazon SageMaker SDK is updated\n", "!pip install \"sagemaker\" --upgrade" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import a few libraries that will be needed\n", "import sagemaker\n", "from sagemaker.huggingface import HuggingFace\n", "import boto3\n", "import pandas as pd\n", "import os, time, tarfile" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# gets role for executing training job and set a few variables\n", "sagemaker_session = sagemaker.Session()\n", "bucket = sagemaker_session.default_bucket()\n", "prefix = \"news-hf\"\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This example uses the AG News dataset cited in the paper [Character-level Convolutional Networks for Text Classification](https://arxiv.org/abs/1509.01626) by Xiang Zhang and [Yann LeCun](https://twitter.com/ylecun). This dataset is available on the [AWS Open Data Registry](https://registry.opendata.aws/fast-ai-nlp/)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# download and extract our custom dataset\n", "!wget -nc https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz\n", "tf = tarfile.open('ag_news_csv.tgz')\n", "tf.extractall()\n", "!rm -fr ag_news_csv.tgz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read training data and add a header\n", "train = pd.read_csv('./ag_news_csv/train.csv')\n", "train.columns = ['label', 'title', 'description']\n", "\n", "# read testing data and add a header\n", "test = pd.read_csv('./ag_news_csv/test.csv')\n", "test.columns = ['label', 'title', 'description']\n", "\n", "# write the files with header\n", "train.to_csv(\"ag_news_csv/ag-train.csv\", index=False)\n", "test.to_csv(\"ag_news_csv/ag-test.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# take a look at the training data\n", "train" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# upload training and testing data to Amazon S3\n", "inputs_train = sagemaker_session.upload_data(\"ag_news_csv/ag-train.csv\", bucket=bucket, key_prefix='{}/train'.format(prefix))\n", "inputs_test = sagemaker_session.upload_data(\"ag_news_csv/ag-test.csv\", bucket=bucket, key_prefix='{}/test'.format(prefix))\n", "print(inputs_train)\n", "print(inputs_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# keep in mind the classes used in this dataset\n", "classes = pd.read_csv('./ag_news_csv/classes.txt', header=None)\n", "classes.columns = ['label']\n", "classes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## BERT large uncased\n", "https://huggingface.co/bert-large-uncased\n", "#### Fine-tuning" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hyperparameters = {\n", "\t'model_name_or_path':'bert-large-uncased',\n", 
"\t'output_dir':'/opt/ml/model',\n", " 'train_file':'/opt/ml/input/data/train/ag-train.csv',\n", " 'validation_file':'/opt/ml/input/data/test/ag-test.csv',\n", " 'do_train':True,\n", " 'do_eval':True,\n", " 'num_train_epochs': 1,\n", " 'save_total_limit': 1,\n", "\t# add your remaining hyperparameters\n", "\t# more info here https://github.com/huggingface/transformers/tree/v4.10.0/examples/pytorch/text-classification\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# git configuration to download our fine-tuning script\n", "git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.6.1'}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# creates Hugging Face estimator\n", "huggingface_estimator_bert = HuggingFace(\n", "\tentry_point='run_glue.py', # note we are pointing to the processing script in HF repo\n", "\tsource_dir='./examples/pytorch/text-classification',\n", "\tinstance_type='ml.g4dn.16xlarge',\n", "\tinstance_count=1,\n", "\trole=role,\n", "\tgit_config=git_config,\n", "\ttransformers_version='4.6.1',\n", "\tpytorch_version='1.7.1',\n", "\tpy_version='py36',\n", "\thyperparameters = hyperparameters,\n", " disable_profiler=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "training_path='s3://{}/{}/train'.format(bucket, prefix)\n", "testing_path='s3://{}/{}/test'.format(bucket, prefix)\n", "# starting the train job\n", "huggingface_estimator_bert.fit({\"train\": training_path, \"test\": testing_path}, wait=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# check the status of the training job\n", "client = boto3.client(\"sagemaker\")\n", "describe_response = client.describe_training_job(TrainingJobName=huggingface_estimator_bert.latest_training_job.name)\n", "\n", "print ('Time - JobStatus - SecondaryStatus')\n", "print('------------------------------')\n", "print (time.strftime(\"%H:%M\", time.localtime()), '-', describe_response['TrainingJobStatus'] + \" - \" + describe_response['SecondaryStatus'])\n", "\n", "# uncomment this for monitoring the job status...\n", "#job_run_status = describe_response['TrainingJobStatus']\n", "#while job_run_status not in ('Failed', 'Completed', 'Stopped'):\n", "# describe_response = client.describe_training_job(TrainingJobName=huggingface_estimator_bert.latest_training_job.name)\n", "# job_run_status = describe_response['TrainingJobStatus']\n", "# print (time.strftime(\"%H:%M\", time.localtime()), '-', describe_response['TrainingJobStatus'] + \" - \" + describe_response['SecondaryStatus'])\n", "# sleep(30)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Important:** Make sure the training job is completed before running the \"Inference\" section below.\n", "\n", "You can verify this by running the previous cell and getting JobStatus = \"Completed\"." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.huggingface.model import HuggingFaceModel\n", "\n", "# create Hugging Face Model Class\n", "huggingface_model = sagemaker.huggingface.HuggingFaceModel(\n", "env={ 'HF_TASK':'text-classification' },\n", "model_data=huggingface_estimator_bert.model_data,\n", "role=role,\n", "transformers_version=\"4.6.1\",\n", "pytorch_version=\"1.7.1\",\n", "py_version='py36',\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create SageMaker Endpoint with the HF model\n", "predictor = huggingface_model.deploy(\n", "initial_instance_count=1,\n", "instance_type=\"ml.g4dn.xlarge\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# example request (you always need to define \"inputs\"). You can try with your own news' titles here...\n", "data = {\n", " #\"inputs\": \"Armed robbery last night in the city.\"\n", " \"inputs\": \"Great match from Real Madrid tonight.\"\n", " #\"inputs\": \"Stocks went up 30% after yesterday's market closure.\"\n", " #\"inputs\": \"There is a new chipset that outperforms current GPUs.\"\n", "}\n", "\n", "response = predictor.predict(data)\n", "print(response, classes['label'][int(response[0]['label'][-1:])])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# let us run a quick performance test\n", "sum_BERT=0\n", "for i in range(1, 1000):\n", " a_time = float(time.time())\n", " result_BERT = predictor.predict(data)\n", " b_time = float(time.time())\n", " sum_BERT = sum_BERT + (b_time - a_time)\n", " #print(b_time - a_time)\n", "avg_BERT = sum_BERT/1000\n", "print('BERT average inference time: {:.3f}'.format(avg_BERT), 'secs,')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Amazon's BORT\n", "https://huggingface.co/amazon/bort\n", "#### Fine-tuning" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hyperparameters_bort = {\n", "\t'model_name_or_path':'amazon/bort',\n", "\t'output_dir':'/opt/ml/model',\n", " 'train_file':'/opt/ml/input/data/train/ag-train.csv',\n", " 'validation_file':'/opt/ml/input/data/test/ag-test.csv',\n", " 'do_train':True,\n", " 'do_eval':True,\n", " 'num_train_epochs': 1,\n", " 'save_total_limit': 1\n", " # add your remaining hyperparameters\n", "\t# more info here https://github.com/huggingface/transformers/tree/v4.6.1/examples/pytorch/text-classification\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# git configuration to download our fine-tuning script\n", "git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.6.1'}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# creates Hugging Face estimator\n", "huggingface_estimator_bort = HuggingFace(\n", "\tentry_point='run_glue.py', # note we are pointing to the processing script in HF repo\n", "\tsource_dir='./examples/pytorch/text-classification',\n", "\tinstance_type='ml.g4dn.12xlarge',\n", "\tinstance_count=1,\n", "\trole=role,\n", "\tgit_config=git_config,\n", "\ttransformers_version='4.6.1',\n", "\tpytorch_version='1.7.1',\n", "\tpy_version='py36',\n", "\thyperparameters = 
hyperparameters_bort,\n", " disable_profiler=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "training_path='s3://{}/{}/train'.format(bucket, prefix)\n", "testing_path='s3://{}/{}/test'.format(bucket, prefix)\n", "# start the training job (wait=False returns immediately)\n", "huggingface_estimator_bort.fit({\"train\": training_path, \"test\": testing_path}, wait=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# check the status of the training job\n", "client = boto3.client(\"sagemaker\")\n", "describe_response = client.describe_training_job(TrainingJobName=huggingface_estimator_bort.latest_training_job.name)\n", "\n", "print('Time - JobStatus - SecondaryStatus')\n", "print('------------------------------')\n", "print(time.strftime(\"%H:%M\", time.localtime()), '-', describe_response['TrainingJobStatus'] + \" - \" + describe_response['SecondaryStatus'])\n", "\n", "# uncomment this to monitor the job status...\n", "#job_run_status = describe_response['TrainingJobStatus']\n", "#while job_run_status not in ('Failed', 'Completed', 'Stopped'):\n", "#    describe_response = client.describe_training_job(TrainingJobName=huggingface_estimator_bort.latest_training_job.name)\n", "#    job_run_status = describe_response['TrainingJobStatus']\n", "#    print(time.strftime(\"%H:%M\", time.localtime()), '-', describe_response['TrainingJobStatus'] + \" - \" + describe_response['SecondaryStatus'])\n", "#    time.sleep(30)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Important:** Make sure the training job has completed before running the \"Inference\" section below.\n", "\n", "You can verify this by re-running the previous cell until JobStatus is \"Completed\"." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.huggingface.model import HuggingFaceModel\n", "\n", "# create a Hugging Face Model from the training artifacts\n", "huggingface_model_bort = sagemaker.huggingface.HuggingFaceModel(\n", "env={ 'HF_TASK':'text-classification' },\n", "model_data=huggingface_estimator_bort.model_data,\n", "role=role,\n", "transformers_version=\"4.6.1\",\n", "pytorch_version=\"1.7.1\",\n", "py_version='py36',\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create a SageMaker endpoint with the HF model\n", "predictor_bort = huggingface_model_bort.deploy(\n", "initial_instance_count=1,\n", "instance_type=\"ml.g4dn.xlarge\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# example request (you always need to define \"inputs\"). 
You can try your own news titles here...\n", "data = {\n", " \"inputs\": \"Stocks went up 30% after yesterday's market closure.\"\n", " #\"inputs\": \"There is a new chipset that outperforms current GPUs.\"\n", "}\n", "\n", "response = predictor_bort.predict(data)\n", "print(response, classes['label'][int(response[0]['label'][-1:])])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let us run a quick performance test over 1000 requests\n", "sum_BORT = 0\n", "for i in range(1000):\n", " a_time = time.time()\n", " result_BORT = predictor_bort.predict(data)\n", " b_time = time.time()\n", " sum_BORT = sum_BORT + (b_time - a_time)\n", " #print(b_time - a_time)\n", "avg_BORT = sum_BORT/1000\n", "print('BORT average inference time: {:.3f}'.format(avg_BORT), 'secs')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Clean-up" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# uncomment to clean up the endpoints\n", "#predictor.delete_endpoint()\n", "#predictor_bort.delete_endpoint()" ] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9" } }, "nbformat": 4, "nbformat_minor": 4 }