{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Prepare your data for training\n", "\n", "In this lab you will use a SageMaker Processing job to convert your raw data into a set of train, test, and validation datasets that can be used to train a model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install \"sagemaker>=2.123.0\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sagemaker\n", "import boto3\n", "import numpy as np \n", "import pandas as pd \n", "import os \n", "import time\n", "from sagemaker import get_execution_role\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "\n", "# Get default bucket\n", "session = sagemaker.Session()\n", "default_bucket = session.default_bucket()\n", "bucket_prefix = \"mlops-workshop/\"\n", "# Get SageMaker Execution Role\n", "role = get_execution_role()\n", "region = boto3.Session().region_name" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define boto session and SageMaker Client\n", "\n", "boto_session = boto3.Session(region_name=region)\n", "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!mkdir -p scripts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%writefile ./scripts/preprocessing.py\n", "import argparse\n", "import os\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "def process(df):\n", " # Add two new indicators\n", " df[\"no_previous_contact\"] = (df[\"pdays\"] == 999).astype(int)\n", " df[\"not_working\"] = df[\"job\"].isin([\"student\", \"retired\", \"unemployed\"]).astype(int)\n", " columns = list(df.columns)\n", " \n", " toremove = [\"emp.var.rate\", \"cons.price.idx\", \"cons.conf.idx\", 
\"euribor3m\", \"nr.employed\"]\n", " columns = [x for x in columns if x not in toremove]\n", " \n", " # Keeping only columns that we need\n", " df = df[columns]\n", " \n", " # One hot encode\n", " df=pd.get_dummies(df)\n", " df = pd.concat([df['y_yes'], df.drop(['y_no', 'y_yes'], axis=1)], axis=1)\n", " df = df.sample(frac=1).reset_index(drop=True)\n", " return df\n", "\n", "if __name__ == \"__main__\":\n", " parser = argparse.ArgumentParser()\n", " parser.add_argument(\"--input-path\", type=str, default=\"/opt/ml/processing\")\n", " args, _ = parser.parse_known_args()\n", " \n", " base_dir = args.input_path\n", "\n", " df = pd.read_csv(\n", " f\"{base_dir}/input/bank-additional-full.csv\",\n", " header=0\n", " )\n", " \n", " # Call the helper method\n", " df = process(df)\n", " \n", " train, validation, test = np.split(df, [int(.7*len(df)), int(.85*len(df))])\n", "\n", " train.to_csv(f\"{base_dir}/train/train.csv\", header=False, index=False)\n", " validation.to_csv(f\"{base_dir}/validation/validation.csv\", header=False, index=False)\n", " test.to_csv(f\"{base_dir}/test/test.csv\", header=False, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Upload the raw dataset to S3 so that it can be used in the processing job\n", "\n", "local_data_path = \"bank-additional-full.csv\"\n", "\n", "base_uri = f\"s3://{default_bucket}/marketing\"\n", "input_data_uri = sagemaker.s3.S3Uploader.upload(\n", " local_path=local_data_path, \n", " desired_s3_uri=base_uri,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.sklearn.processing import SKLearnProcessor\n", "\n", "framework_version = \"0.23-1\"\n", "\n", "sklearn_processor = SKLearnProcessor(\n", " framework_version=framework_version,\n", " instance_type=\"ml.m5.xlarge\",\n", " instance_count=1,\n", " base_job_name=\"sklearn-marketing-process\",\n", " role=role,\n", " 
sagemaker_session=session\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sklearn_processor.run(\n", " inputs=[\n", " ProcessingInput(source=input_data_uri, destination=\"/opt/ml/processing/input\"), \n", " ],\n", " outputs=[\n", " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n", " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\")\n", " ],\n", " code=\"scripts/preprocessing.py\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "preprocessing_job_description = sklearn_processor.jobs[-1].describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_uri = preprocessing_job_description['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']\n", "val_uri = preprocessing_job_description['ProcessingOutputConfig']['Outputs'][1]['S3Output']['S3Uri']\n", "test_uri = preprocessing_job_description['ProcessingOutputConfig']['Outputs'][2]['S3Output']['S3Uri']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%store train_uri\n", "%store val_uri\n", "%store test_uri" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's view the processed data\n", "\n", "Here we download the training dataset and view the first 10 rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!aws s3 cp {train_uri}/train.csv /tmp/train.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df = pd.read_csv('/tmp/train.csv', header = None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### You can 
now move to the next section of the module `Train a model & track your experiments`\n", "\n", "The notebook used in that section is `sagemaker-train.ipynb`" ] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }