{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Classifying news with HuggingFace and PyTorch on Amazon SageMaker" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# make sure the Amazon SageMaker SDK is updated\n", "!pip install \"sagemaker\" --upgrade" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import a few libraries that will be needed\n", "import sagemaker\n", "from sagemaker.huggingface import HuggingFace\n", "import boto3\n", "import pandas as pd\n", "import os, time, tarfile" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# gets role for executing training job and set a few variables\n", "sagemaker_session = sagemaker.Session()\n", "bucket = sagemaker_session.default_bucket()\n", "prefix = \"news-hf\"\n", "role = sagemaker.get_execution_role()\n", "region = boto3.Session().region_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This example uses the AG News dataset cited in the paper [Character-level Convolutional Networks for Text Classification](https://arxiv.org/abs/1509.01626) by Xiang Zhang and [Yann LeCun](https://twitter.com/ylecun). This dataset is available on the [AWS Open Data Registry](https://registry.opendata.aws/fast-ai-nlp/)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# download and extract our custom dataset\n", "!wget -nc https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz\n", "tf = tarfile.open('ag_news_csv.tgz')\n", "tf.extractall()\n", "!rm -fr ag_news_csv.tgz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read training data and add a header\n", "train = pd.read_csv('./ag_news_csv/train.csv')\n", "train.columns = ['label', 'title', 'description']\n", "\n", "# read testing data and add a header\n", "test = pd.read_csv('./ag_news_csv/test.csv')\n", "test.columns = ['label', 'title', 'description']\n", "\n", "# write the files with header\n", "train.to_csv(\"ag_news_csv/ag-train.csv\", index=False)\n", "test.to_csv(\"ag_news_csv/ag-test.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# take a look at the training data\n", "train" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# upload training and testing data to Amazon S3\n", "inputs_train = sagemaker_session.upload_data(\"ag_news_csv/ag-train.csv\", bucket=bucket, key_prefix='{}/train'.format(prefix))\n", "inputs_test = sagemaker_session.upload_data(\"ag_news_csv/ag-test.csv\", bucket=bucket, key_prefix='{}/test'.format(prefix))\n", "print(inputs_train)\n", "print(inputs_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# keep in mind the classes used in this dataset\n", "classes = pd.read_csv('./ag_news_csv/classes.txt', header=None)\n", "classes.columns = ['label']\n", "classes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## BERT large uncased\n", "https://huggingface.co/bert-large-uncased\n", "#### Fine-tuning" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hyperparameters = {\n", "\t'model_name_or_path':'bert-large-uncased',\n", 
"\t'output_dir':'/opt/ml/model',\n", " 'train_file':'/opt/ml/input/data/train/ag-train.csv',\n", " 'validation_file':'/opt/ml/input/data/test/ag-test.csv',\n", " 'do_train':True,\n", " 'do_eval':True,\n", " 'num_train_epochs': 1,\n", " 'save_total_limit': 1,\n", "\t# add your remaining hyperparameters\n", "\t# more info here https://github.com/huggingface/transformers/tree/v4.10.0/examples/pytorch/text-classification\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# git configuration to download our fine-tuning script\n", "git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.6.1'}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# creates Hugging Face estimator\n", "huggingface_estimator_bert = HuggingFace(\n", "\tentry_point='run_glue.py', # note we are pointing to the processing script in HF repo\n", "\tsource_dir='./examples/pytorch/text-classification',\n", "\tinstance_type='ml.g4dn.16xlarge',\n", "\tinstance_count=1,\n", "\trole=role,\n", "\tgit_config=git_config,\n", "\ttransformers_version='4.6.1',\n", "\tpytorch_version='1.7.1',\n", "\tpy_version='py36',\n", "\thyperparameters = hyperparameters,\n", " disable_profiler=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "training_path='s3://{}/{}/train'.format(bucket, prefix)\n", "testing_path='s3://{}/{}/test'.format(bucket, prefix)\n", "# starting the train job\n", "huggingface_estimator_bert.fit({\"train\": training_path, \"test\": testing_path}, wait=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# check the status of the training job\n", "client = boto3.client(\"sagemaker\")\n", "describe_response = client.describe_training_job(TrainingJobName=huggingface_estimator_bert.latest_training_job.name)\n", "\n", "print ('Time - JobStatus - SecondaryStatus')\n", "print('------------------------------')\n", "print (time.strftime(\"%H:%M\", time.localtime()), '-', describe_response['TrainingJobStatus'] + \" - \" + describe_response['SecondaryStatus'])\n", "\n", "# uncomment this for monitoring the job status...\n", "#job_run_status = describe_response['TrainingJobStatus']\n", "#while job_run_status not in ('Failed', 'Completed', 'Stopped'):\n", "# describe_response = client.describe_training_job(TrainingJobName=huggingface_estimator_bert.latest_training_job.name)\n", "# job_run_status = describe_response['TrainingJobStatus']\n", "# print (time.strftime(\"%H:%M\", time.localtime()), '-', describe_response['TrainingJobStatus'] + \" - \" + describe_response['SecondaryStatus'])\n", "# sleep(30)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Important:** Make sure the training job is completed before running the \"Inference\" section below.\n", "\n", "You can verify this by running the previous cell and getting JobStatus = \"Completed\"." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.huggingface.model import HuggingFaceModel\n", "\n", "# create Hugging Face Model Class\n", "huggingface_model = sagemaker.huggingface.HuggingFaceModel(\n", "env={ 'HF_TASK':'text-classification' },\n", "model_data=huggingface_estimator_bert.model_data,\n", "role=role,\n", "transformers_version=\"4.6.1\",\n", "pytorch_version=\"1.7.1\",\n", "py_version='py36',\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create SageMaker Endpoint with the HF model\n", "predictor = huggingface_model.deploy(\n", "initial_instance_count=1,\n", "instance_type=\"ml.g4dn.xlarge\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# example request (you always need to define \"inputs\"). You can try with your own news' titles here...\n", "data = {\n", " #\"inputs\": \"Armed robbery last night in the city.\"\n", " \"inputs\": \"Great match from Real Madrid tonight.\"\n", " #\"inputs\": \"Stocks went up 30% after yesterday's market closure.\"\n", " #\"inputs\": \"There is a new chipset that outperforms current GPUs.\"\n", "}\n", "\n", "response = predictor.predict(data)\n", "print(response, classes['label'][int(response[0]['label'][-1:])])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# let us run a quick performance test\n", "sum_BERT=0\n", "for i in range(1, 1000):\n", " a_time = float(time.time())\n", " result_BERT = predictor.predict(data)\n", " b_time = float(time.time())\n", " sum_BERT = sum_BERT + (b_time - a_time)\n", " #print(b_time - a_time)\n", "avg_BERT = sum_BERT/1000\n", "print('BERT average inference time: {:.3f}'.format(avg_BERT), 'secs,')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Amazon's BORT\n", "https://huggingface.co/amazon/bort\n", "#### Fine-tuning" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hyperparameters_bort = {\n", "\t'model_name_or_path':'amazon/bort',\n", "\t'output_dir':'/opt/ml/model',\n", " 'train_file':'/opt/ml/input/data/train/ag-train.csv',\n", " 'validation_file':'/opt/ml/input/data/test/ag-test.csv',\n", " 'do_train':True,\n", " 'do_eval':True,\n", " 'num_train_epochs': 1,\n", " 'save_total_limit': 1\n", " # add your remaining hyperparameters\n", "\t# more info here https://github.com/huggingface/transformers/tree/v4.6.1/examples/pytorch/text-classification\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# git configuration to download our fine-tuning script\n", "git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.6.1'}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# creates Hugging Face estimator\n", "huggingface_estimator_bort = HuggingFace(\n", "\tentry_point='run_glue.py', # note we are pointing to the processing script in HF repo\n", "\tsource_dir='./examples/pytorch/text-classification',\n", "\tinstance_type='ml.g4dn.12xlarge',\n", "\tinstance_count=1,\n", "\trole=role,\n", "\tgit_config=git_config,\n", "\ttransformers_version='4.6.1',\n", "\tpytorch_version='1.7.1',\n", "\tpy_version='py36',\n", "\thyperparameters = 
hyperparameters_bort,\n", " disable_profiler=True\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "training_path='s3://{}/{}/train'.format(bucket, prefix)\n", "testing_path='s3://{}/{}/test'.format(bucket, prefix)\n", "# start the training job (wait=False returns immediately)\n", "huggingface_estimator_bort.fit({\"train\": training_path, \"test\": testing_path}, wait=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# check the status of the training job\n", "client = boto3.client(\"sagemaker\")\n", "describe_response = client.describe_training_job(TrainingJobName=huggingface_estimator_bort.latest_training_job.name)\n", "\n", "print('Time - JobStatus - SecondaryStatus')\n", "print('------------------------------')\n", "print(time.strftime(\"%H:%M\", time.localtime()), '-', describe_response['TrainingJobStatus'] + \" - \" + describe_response['SecondaryStatus'])\n", "\n", "# uncomment this to monitor the job status...\n", "#job_run_status = describe_response['TrainingJobStatus']\n", "#while job_run_status not in ('Failed', 'Completed', 'Stopped'):\n", "#    describe_response = client.describe_training_job(TrainingJobName=huggingface_estimator_bort.latest_training_job.name)\n", "#    job_run_status = describe_response['TrainingJobStatus']\n", "#    print(time.strftime(\"%H:%M\", time.localtime()), '-', describe_response['TrainingJobStatus'] + \" - \" + describe_response['SecondaryStatus'])\n", "#    time.sleep(30)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Important:** Make sure the training job has completed before running the \"Inference\" section below.\n", "\n", "You can verify this by re-running the previous cell until JobStatus is \"Completed\"." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.huggingface.model import HuggingFaceModel\n", "\n", "# create a Hugging Face Model from the training artifacts\n", "huggingface_model_bort = sagemaker.huggingface.HuggingFaceModel(\n", "env={ 'HF_TASK':'text-classification' },\n", "model_data=huggingface_estimator_bort.model_data,\n", "role=role,\n", "transformers_version=\"4.6.1\",\n", "pytorch_version=\"1.7.1\",\n", "py_version='py36',\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# create a SageMaker endpoint with the HF model\n", "predictor_bort = huggingface_model_bort.deploy(\n", "initial_instance_count=1,\n", "instance_type=\"ml.g4dn.xlarge\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# example request (you always need to define \"inputs\"). 
You can try your own news titles here...\n", "data = {\n", " \"inputs\": \"Stocks went up 30% after yesterday's market closure.\"\n", " #\"inputs\": \"There is a new chipset that outperforms current GPUs.\"\n", "}\n", "\n", "response = predictor_bort.predict(data)\n", "print(response, classes['label'][int(response[0]['label'][-1:])])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# let us run a quick performance test over 1000 requests\n", "sum_BORT = 0\n", "for i in range(1000):\n", " a_time = time.time()\n", " result_BORT = predictor_bort.predict(data)\n", " b_time = time.time()\n", " sum_BORT = sum_BORT + (b_time - a_time)\n", " #print(b_time - a_time)\n", "avg_BORT = sum_BORT/1000\n", "print('BORT average inference time: {:.3f}'.format(avg_BORT), 'secs')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "-----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Clean-up" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# uncomment to clean up the endpoints\n", "#predictor.delete_endpoint()\n", "#predictor_bort.delete_endpoint()" ] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.9" } }, "nbformat": 4, "nbformat_minor": 4 }