{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Prepare your data for training\n", "\n", "In this lab you will use a SageMaker Processing job to convert your raw data into a set of train, test, and validation datasets that can be used to train a model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install \"sagemaker>=2.123.0\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sagemaker\n", "import boto3\n", "import numpy as np \n", "import pandas as pd \n", "import os \n", "import time\n", "from sagemaker import get_execution_role\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "\n", "# Get default bucket\n", "session = sagemaker.Session()\n", "default_bucket = session.default_bucket()\n", "bucket_prefix = \"mlops-workshop/\"\n", "# Get SageMaker Execution Role\n", "role = get_execution_role()\n", "region = boto3.Session().region_name" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define boto session and SageMaker Client\n", "\n", "boto_session = boto3.Session(region_name=region)\n", "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!mkdir -p scripts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%writefile ./scripts/preprocessing.py\n", "import argparse\n", "import os\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "def process(df):\n", " # Add two new indicators\n", " df[\"no_previous_contact\"] = (df[\"pdays\"] == 999).astype(int)\n", " df[\"not_working\"] = df[\"job\"].isin([\"student\", \"retired\", \"unemployed\"]).astype(int)\n", " columns = list(df.columns)\n", " \n", " toremove = [\"emp.var.rate\", \"cons.price.idx\", \"cons.conf.idx\", 
\"euribor3m\", \"nr.employed\"]\n", " columns = [x for x in columns if x not in toremove]\n", " \n", " # Keeping only columns that we need\n", " df = df[columns]\n", " \n", " # One hot encode\n", " df=pd.get_dummies(df)\n", " df = pd.concat([df['y_yes'], df.drop(['y_no', 'y_yes'], axis=1)], axis=1)\n", " df = df.sample(frac=1).reset_index(drop=True)\n", " return df\n", "\n", "if __name__ == \"__main__\":\n", " parser = argparse.ArgumentParser()\n", " parser.add_argument(\"--input-path\", type=str, default=\"/opt/ml/processing\")\n", " args, _ = parser.parse_known_args()\n", " \n", " base_dir = args.input_path\n", "\n", " df = pd.read_csv(\n", " f\"{base_dir}/input/bank-additional-full.csv\",\n", " header=0\n", " )\n", " \n", " # Call the helper method\n", " df = process(df)\n", " \n", " train, validation, test = np.split(df, [int(.7*len(df)), int(.85*len(df))])\n", "\n", " train.to_csv(f\"{base_dir}/train/train.csv\", header=False, index=False)\n", " validation.to_csv(f\"{base_dir}/validation/validation.csv\", header=False, index=False)\n", " test.to_csv(f\"{base_dir}/test/test.csv\", header=False, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Upload the raw dataset to S3 so that it can be used in the processing job\n", "\n", "local_data_path = \"bank-additional-full.csv\"\n", "\n", "base_uri = f\"s3://{default_bucket}/marketing\"\n", "input_data_uri = sagemaker.s3.S3Uploader.upload(\n", " local_path=local_data_path, \n", " desired_s3_uri=base_uri,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", "framework_version = \"0.23-1\"\n", "\n", "sklearn_processor = SKLearnProcessor(\n", " framework_version=framework_version,\n", " instance_type=\"ml.m5.xlarge\",\n", " instance_count=1,\n", " base_job_name=\"sklearn-marketing-process\",\n", " role=role,\n", " 
sagemaker_session=session\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sklearn_processor.run(\n", " inputs=[\n", " ProcessingInput(source=input_data_uri, destination=\"/opt/ml/processing/input\"), \n", " ],\n", " outputs=[\n", " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n", " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\")\n", " ],\n", " code=\"scripts/preprocessing.py\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "preprocessing_job_description = sklearn_processor.jobs[-1].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_uri = preprocessing_job_description['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']\n", "val_uri = preprocessing_job_description['ProcessingOutputConfig']['Outputs'][1]['S3Output']['S3Uri']\n", "test_uri = preprocessing_job_description['ProcessingOutputConfig']['Outputs'][2]['S3Output']['S3Uri']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%store train_uri\n", "%store val_uri\n", "%store test_uri" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's view the processed data\n", "\n", "Here we download the training dataset and view the first 10 rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!aws s3 cp {train_uri}/train.csv /tmp/train.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df = pd.read_csv('/tmp/train.csv', header = None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### You can 
now move to the next section of the module `Train a model & track your experiments`\n", "\n", "The notebook used in that section is `sagemaker-train.ipynb`" ] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }