{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b6c2a95f-18e2-46f9-8167-b47796aa58d1",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Import libraries and set the role and S3 buckets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a60a79dd-b31d-4530-a8ef-5af7b883cc41",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68616df4-ca90-49ef-94ff-b523c91d8ce2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "import boto3\n",
    "import matplotlib.pyplot as plt\n",
    "import nltk\n",
    "import pandas as pd\n",
    "import sagemaker\n",
    "from sagemaker import get_execution_role\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# word_tokenize needs the Punkt sentence models. Older NLTK releases load the\n",
    "# 'punkt' resource and newer ones load 'punkt_tab', so fetch both.\n",
    "for resource in ('punkt', 'punkt_tab'):\n",
    "    nltk.download(resource)\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5ee9122-9348-481e-9af3-75ce9618c51d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "SEED = 42  # fixed seed so the train/validation split is reproducible\n",
    "\n",
    "\n",
    "def s3_uri(bucket, *parts):\n",
    "    \"\"\"Build an s3:// URI from a bucket and key parts.\n",
    "\n",
    "    Empty parts are skipped and surrounding slashes are trimmed, so an empty\n",
    "    prefix does not produce a double slash in the resulting key.\n",
    "    \"\"\"\n",
    "    key = '/'.join(part.strip('/') for part in parts if part and part.strip('/'))\n",
    "    return f's3://{bucket}/{key}' if key else f's3://{bucket}'\n",
    "\n",
    "\n",
    "aws_role = get_execution_role()\n",
    "aws_region = boto3.Session().region_name\n",
    "sess = sagemaker.Session()\n",
    "region = aws_region\n",
    "\n",
    "# Specify S3 bucket and prefix where you have uploaded email_dataset.csv\n",
    "training_data_bucket = \"\"\n",
    "training_data_prefix = \"\"\n",
    "\n",
    "# Fail here with a clear message instead of later with an opaque S3 error.\n",
    "if not training_data_bucket:\n",
    "    raise ValueError('Set training_data_bucket to the S3 bucket that holds email_dataset.csv')\n",
    "\n",
    "training_dataset_s3_path = s3_uri(training_data_bucket, training_data_prefix, 'email_dataset.csv')\n",
    "\n",
    "output_bucket = sess.default_bucket()\n",
    "output_prefix = \"\"\n",
    "\n",
    "s3_output_location = s3_uri(output_bucket, output_prefix, 'output')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f44da972-06f4-48df-983d-094914142547",
   "metadata": {},
   "source": [
    "# Load Data\n",
    "\n",
    "Read `email_dataset.csv` from S3. The cells below use its `Category` and `Message` columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e458c9cc-2a06-4544-b70d-a701977891de",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load data\n",
    "df = pd.read_csv(training_dataset_s3_path)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "356612c8-fb12-49bc-9b0c-002730430c9b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Check if dataset is balanced or not\n",
    "df['Category'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24150ce4-ee02-4dff-973e-e460088cf970",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Prepare the Data\n",
    "\n",
    "Encode the labels, tokenize the messages, split into train and validation sets, and upload both files to S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bced5b50-bfdf-460e-9575-0165aba739ee",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Replace SPAM with 1 and HAM with 0 (any value other than 'SPAM' becomes 0).\n",
    "# A new frame is built instead of overwriting df, so re-running this cell\n",
    "# does not turn every label into 0.\n",
    "df_labeled = df.assign(Category=(df['Category'] == 'SPAM').astype(int))\n",
    "df_labeled.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c11b2f52-91b9-4eb8-96ba-7747a0c369fc",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def tokenize(message):\n",
    "    \"\"\"Normalize one message into a single space-separated token string.\n",
    "\n",
    "    Commas and double quotes are removed, the text is lower-cased and split\n",
    "    with nltk.word_tokenize, and the tokens are joined back with single spaces.\n",
    "    \"\"\"\n",
    "    cleaned = str(message).replace(',', '').replace('\"', '').lower()\n",
    "    return ' '.join(nltk.word_tokenize(cleaned))\n",
    "\n",
    "\n",
    "def prepare_data(df):\n",
    "    \"\"\"Return a new frame with '__label__'-prefixed categories and tokenized messages.\n",
    "\n",
    "    The input frame is left unchanged.\n",
    "    \"\"\"\n",
    "    return df.assign(\n",
    "        Category=df['Category'].map(\n",
    "            lambda category: '__label__{}'.format(str(category).replace('__label__', ''))\n",
    "        ),\n",
    "        Message=df['Message'].map(tokenize),\n",
    "    )\n",
    "\n",
    "\n",
    "df_final = prepare_data(df_labeled[['Category', 'Message']].reset_index(drop=True))\n",
    "df_final.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fdc7afe-3d35-4fce-92a4-f3a7023d844d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Split data into train and validation\n",
    "df_train, df_validation = train_test_split(\n",
    "    df_final,\n",
    "    test_size=0.10,\n",
    "    stratify=df_final['Category'],\n",
    "    random_state=SEED,\n",
    ")\n",
    "assert len(df_train) + len(df_validation) == len(df_final)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79dd1c81-ea7d-4f86-8925-8117104e626b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def write_blazingtext_file(frame, path):\n",
    "    \"\"\"Write one line per row: the label, a space, then the tokenized message.\n",
    "\n",
    "    The lines are written by hand because DataFrame.to_csv(sep=' ') wraps every\n",
    "    field that contains the separator in double quotes, which would glue a\n",
    "    quote character onto the first and last token of each message.\n",
    "    \"\"\"\n",
    "    with open(path, 'w', encoding='utf-8', newline='\\n') as handle:\n",
    "        for label, message in zip(frame['Category'], frame['Message']):\n",
    "            handle.write(f'{label} {message}\\n')\n",
    "\n",
    "\n",
    "# Write the transformed data locally, then upload it to the S3 bucket\n",
    "train_path = './train.csv'\n",
    "write_blazingtext_file(df_train, train_path)\n",
    "\n",
    "validation_path = './validation.csv'\n",
    "write_blazingtext_file(df_validation, validation_path)\n",
    "\n",
    "# Specify S3 bucket prefix\n",
    "train_s3_uri = sess.upload_data(bucket=training_data_bucket, key_prefix='training', path=train_path)\n",
    "validation_s3_uri = sess.upload_data(bucket=training_data_bucket, key_prefix='validation', path=validation_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e16151b-a92a-4677-9d61-6da4b59921cc",
   "metadata": {},
   "source": [
    "# Train the Model\n",
    "\n",
    "Train the SageMaker BlazingText algorithm in `supervised` mode on the uploaded train and validation files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63ec9fe1-8c8e-45f6-8ce3-ffa9df4da14e",
   "metadata": {},
   "outputs": [],
   "source": [
    "image_uri = sagemaker.image_uris.retrieve(\n",
    "    region=region,\n",
    "    framework='blazingtext'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5f14cdf-fe4c-4d68-95ca-e812d71b65d1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "estimator = sagemaker.estimator.Estimator(\n",
    "    image_uri=image_uri,\n",
    "    role=aws_role,\n",
    "    instance_count=1,\n",
    "    instance_type='ml.m5.large',\n",
    "    volume_size=30,\n",
    "    max_run=7200,\n",
    "    disable_profiler=True,\n",
    "    # Store the training output under the location configured at the top of the notebook.\n",
    "    output_path=s3_output_location,\n",
    "    sagemaker_session=sess,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a068d35d-3aa9-4954-b614-d3fed8aef996",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Hyperparameters\n",
    "estimator.set_hyperparameters(\n",
    "    mode='supervised',\n",
    "    epochs=10,\n",
    "    learning_rate=0.01,\n",
    "    min_count=2,\n",
    "    vector_dim=300,\n",
    "    word_ngrams=3,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac5d47bf-1b0b-44d6-a428-75a3a222a87c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "train_data = sagemaker.inputs.TrainingInput(\n",
    "    train_s3_uri,\n",
    "    distribution='FullyReplicated',\n",
    "    content_type='text/plain',\n",
    "    s3_data_type='S3Prefix'\n",
    ")\n",
    "validation_data = sagemaker.inputs.TrainingInput(\n",
    "    validation_s3_uri,\n",
    "    distribution='FullyReplicated',\n",
    "    content_type='text/plain',\n",
    "    s3_data_type='S3Prefix'\n",
    ")\n",
    "\n",
    "data_channels = {\n",
    "    'train': train_data,\n",
    "    'validation': validation_data\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "240bb9d4-a93d-493f-840b-6dc16d925517",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "estimator.fit(\n",
    "    inputs=data_channels,\n",
    "    wait=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa513998-3af9-4359-bc67-141eacfc3af4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Get the accuracy of the train and validation dataset\n",
    "estimator.training_job_analytics.dataframe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42393eed-c951-4954-935b-7ed8f81286ea",
   "metadata": {},
   "source": [
    "# Deploy the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae248cb8-faf6-449a-8a48-b087b9a8df06",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "text_classifier = estimator.deploy(\n",
    "    initial_instance_count=1,\n",
    "    instance_type='ml.m5.large',\n",
    "    serializer=sagemaker.serializers.JSONSerializer(),\n",
    "    deserializer=sagemaker.deserializers.JSONDeserializer(),\n",
    ")\n",
    "print()\n",
    "print('Endpoint name: {}'.format(text_classifier.endpoint_name))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86a8618e-1636-4bf8-9670-96928b8b11d0",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Test the Model\n",
    "\n",
    "Send a few hand-written messages to the endpoint. They go through the same `tokenize` function as the training data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6193943-3a56-491e-b9e9-282e79f289ee",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "messages = [\n",
    "    # Spam\n",
    "    'Click on below link, provide your details and win this award',\n",
    "    'Best summer deal here',\n",
    "    # Ham\n",
    "    'See you in the office on Friday.',\n",
    "]\n",
    "\n",
    "# Reuse the training-time tokenize() so the endpoint sees text normalized the\n",
    "# same way as the training data (lower-cased, commas and quotes removed).\n",
    "tokenized_messages = [tokenize(message) for message in messages]\n",
    "payload = {\"instances\": tokenized_messages}\n",
    "print(payload)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a59c993f-ac26-4d8b-b1b7-971b207a12fb",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "LABEL_PREFIX = '__label__'\n",
    "\n",
    "predictions = text_classifier.predict(data=payload)\n",
    "for message, prediction in zip(messages, predictions):\n",
    "    label = prediction['label'][0]\n",
    "    # str.lstrip() removes a set of characters rather than a prefix, so slice the prefix off explicitly.\n",
    "    predicted_class = label[len(LABEL_PREFIX):] if label.startswith(LABEL_PREFIX) else label\n",
    "    print('{}: {}'.format('SPAM' if predicted_class == '1' else 'HAM', message))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38a94267-ff39-4ead-bade-6b4d8cce5527",
   "metadata": {},
   "source": [
    "# Delete the Model Endpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ffe090f-1406-4ee9-bdcb-2384e302b665",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "text_classifier.delete_endpoint()"
   ]
  }
 ], "metadata": { "availableInstances": [ { "_defaultOrder": 0, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 4, "name": "ml.t3.medium", "vcpuNum": 2 }, { "_defaultOrder": 1, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.t3.large", "vcpuNum": 2 }, { "_defaultOrder": 2, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.t3.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 3, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.t3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 4, "_isFastLaunch": true, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.m5.large", "vcpuNum": 2 }, { "_defaultOrder": 5, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.m5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 6, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.m5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 7, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.m5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 8,
"_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.m5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 9, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.m5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 10, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.m5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 11, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.m5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 12, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.m5d.large", "vcpuNum": 2 }, { "_defaultOrder": 13, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.m5d.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 14, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.m5d.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 15, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.m5d.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 16, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.m5d.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 17, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.m5d.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 18, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.m5d.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 19, "_isFastLaunch": false, 
"category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.m5d.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 20, "_isFastLaunch": false, "category": "General purpose", "gpuNum": 0, "hideHardwareSpecs": true, "memoryGiB": 0, "name": "ml.geospatial.interactive", "supportedImageNames": [ "sagemaker-geospatial-v1-0" ], "vcpuNum": 0 }, { "_defaultOrder": 21, "_isFastLaunch": true, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 4, "name": "ml.c5.large", "vcpuNum": 2 }, { "_defaultOrder": 22, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 8, "name": "ml.c5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 23, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.c5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 24, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.c5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 25, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 72, "name": "ml.c5.9xlarge", "vcpuNum": 36 }, { "_defaultOrder": 26, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 96, "name": "ml.c5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 27, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 144, "name": "ml.c5.18xlarge", "vcpuNum": 72 }, { "_defaultOrder": 28, "_isFastLaunch": false, "category": "Compute optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.c5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 29, "_isFastLaunch": true, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.g4dn.xlarge", "vcpuNum": 4 }, { 
"_defaultOrder": 30, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.g4dn.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 31, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.g4dn.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 32, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.g4dn.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 33, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.g4dn.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 34, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.g4dn.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 35, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 61, "name": "ml.p3.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 36, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 244, "name": "ml.p3.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 37, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 488, "name": "ml.p3.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 38, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.p3dn.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 39, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.r5.large", "vcpuNum": 2 }, { "_defaultOrder": 40, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 32, "name": 
"ml.r5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 41, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.r5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 42, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.r5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 43, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 256, "name": "ml.r5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 44, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.r5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 45, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 512, "name": "ml.r5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 46, "_isFastLaunch": false, "category": "Memory Optimized", "gpuNum": 0, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.r5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 47, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 16, "name": "ml.g5.xlarge", "vcpuNum": 4 }, { "_defaultOrder": 48, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 32, "name": "ml.g5.2xlarge", "vcpuNum": 8 }, { "_defaultOrder": 49, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 64, "name": "ml.g5.4xlarge", "vcpuNum": 16 }, { "_defaultOrder": 50, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 128, "name": "ml.g5.8xlarge", "vcpuNum": 32 }, { "_defaultOrder": 51, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 1, "hideHardwareSpecs": false, "memoryGiB": 256, "name": 
"ml.g5.16xlarge", "vcpuNum": 64 }, { "_defaultOrder": 52, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 192, "name": "ml.g5.12xlarge", "vcpuNum": 48 }, { "_defaultOrder": 53, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 4, "hideHardwareSpecs": false, "memoryGiB": 384, "name": "ml.g5.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 54, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 768, "name": "ml.g5.48xlarge", "vcpuNum": 192 }, { "_defaultOrder": 55, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 1152, "name": "ml.p4d.24xlarge", "vcpuNum": 96 }, { "_defaultOrder": 56, "_isFastLaunch": false, "category": "Accelerated computing", "gpuNum": 8, "hideHardwareSpecs": false, "memoryGiB": 1152, "name": "ml.p4de.24xlarge", "vcpuNum": 96 } ], "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 5 }