{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import sagemaker\n", "print(pd.__version__)\n", "print(sagemaker.__version__)\n", "pd.set_option('display.max_colwidth', None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "session = sagemaker.Session()\n", "role = sagemaker.get_execution_role()\n", "bucket = session.default_bucket()\n", "prefix = \"dataset/appflow-sagemakerdemo\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#upload dataset\n", "input_data = session.upload_data(path='dataset/all_tickets.csv', key_prefix=prefix)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Processing script" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%writefile preprocessing.py\n", "import argparse, os, subprocess, sys\n", "import pandas as pd\n", "import numpy as np\n", "import re\n", "import string\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "def install(package):\n", " subprocess.call([\n", " sys.executable, \"-m\",\"pip\", \"install\", package\n", " ])\n", "\n", "def remove_non_alphanumeric(row):\n", " row = [word for word in row if word.isalpha()]\n", " return row\n", "\n", "\n", "\n", "if __name__ == '__main__':\n", " install('nltk')\n", " import nltk\n", " from nltk.corpus import stopwords\n", " nltk.download('punkt')\n", " nltk.download('stopwords')\n", " parser = argparse.ArgumentParser()\n", " parser.add_argument('--filename', type=str)\n", " parser.add_argument('--num-cases', type=int, default=20000)\n", " parser.add_argument('--split-ratio', type=float, default=0.1)\n", "\n", " args, _ = parser.parse_known_args()\n", "\n", " print(\"Recieved arguments {}\".format(args))\n", "\n", " filename = args.filename\n", " num_cases = args.num_cases\n", " split_ratio = args.split_ratio\n", " \n", " #load dataset\n", "\n", " input_data_path = os.path.join('/opt/ml/processing/input', filename)\n", " print(\"Reading input data from {}\".format(input_data_path))\n", "\n", " data = pd.read_csv(input_data_path)\n", "\n", " #remove lines with missing values\n", " data.dropna(inplace=True)\n", "\n", " if num_cases is not None:\n", " data = data[:num_cases]\n", "\n", " #drop unwanted columns\n", " data = data[['category', 'body']]\n", "\n", " data['label'] = data.category.replace({\n", " 0: '__label__Category0__',\n", " 1: '__label__Category1__',\n", " 2: '__label__Category2__',\n", " 3: '__label__Category3__',\n", " 4: '__label__Category4__',\n", " 5: '__label__Category5__',\n", " 6: '__label__Category6__',\n", " 7: '__label__Category7__',\n", " 8: '__label__Category8__',\n", " 9: '__label__Category9__',\n", " 10: '__label__Category10__',\n", " 11: '__label__Category11__',\n", " 12: '__label__Category12__',\n", " 13: '__label__Category12__'\n", " }\n", " ) \n", " data = data.drop(['category'], axis=1)\n", "\n", " #move the label column to the front\n", " data = data[['label', 'body']]\n", "\n", " #tokenize the data\n", " print(\"Tokenizing the reviews\")\n", "\n", " data['body'] = data['body'].apply(nltk.word_tokenize)\n", "\n", " #remove none alpanumeric chars\n", " data['body'] = data['body'].apply(remove_non_alphanumeric)\n", "\n", " #remove punctuation\n", " #data['body'] = data['body'].apply(remove_punctuation)\n", "\n", " #remove stop words\n", " def remove_stop_words(row):\n", " stop_words = set(stopwords.words('english'))\n", " words = [w for w in row if not w in stop_words]\n", " return words\n", " \n", " data['body'] = data['body'].apply(remove_stop_words)\n", "\n", " #convert all text to lowercase\n", " data['email_body'] = data.apply(lambda row: \" \".join(row['body']).lower(), axis=1)\n", "\n", " #drop unwanted columns\n", " data = data.drop(['body'], axis=1)\n", "\n", " # Process data\n", " print('Splitting data with ratio {}'.format(split_ratio))\n", " training, validation = train_test_split(data, test_size=split_ratio)\n", "\n", " training_output_path = os.path.join('/opt/ml/processing/train', 'training.txt')\n", " validation_output_path = os.path.join('/opt/ml/processing/validation', 'validation.txt')\n", "\n", " print('Saving training data to {}'.format(training_output_path))\n", " np.savetxt(training_output_path, training.values, fmt='%s')\n", "\n", " print('Saving validation data to {}'.format(validation_output_path))\n", " np.savetxt(validation_output_path, validation.values, fmt='%s')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sklearn_processor = SKLearnProcessor(\n", " framework_version='0.20.0',\n", " role=role,\n", " instance_type='ml.c5.2xlarge',\n", " instance_count=1\n", " \n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "\n", "sklearn_processor.run(\n", " code='preprocessing.py',\n", " inputs=[ProcessingInput(source=input_data, # Our data from s3\n", " destination='/opt/ml/processing/input')],\n", " outputs=[\n", " ProcessingOutput(output_name=\"training\", \n", " source='/opt/ml/processing/train'),\n", " ProcessingOutput(output_name=\"validation\", \n", " source='/opt/ml/processing/validation')\n", " ],\n", " arguments=[\n", " \"--filename\", \"all_tickets.csv\",\n", " \"--num-cases\", \"35000\",\n", " \"--split-ratio\", \"0.05\"\n", " ]\n", " \n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sklearn_processor.latest_job.describe()['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sklearn_processor.latest_job.describe()['ProcessingOutputConfig']['Outputs'][1]['S3Output']['S3Uri']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### The output above will be used in the training notebook" ] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 4 }