{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Model training\n", "\n", "This notebook shows how to train a XGBoost model to predict the likelyhood of a bid on an ad request. It is a binary classification problem, the model predicts bid / no_bid given the details of an ad request\n" ] }, { "cell_type": "markdown", "id": "60525699", "metadata": {}, "source": [ "This notebook is designed to be run with `Python 3 (Data Science)` kernel." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import boto3\n", "import os\n", "import sagemaker\n", "import re\n", "from sagemaker import get_execution_role\n", "\n", "sagemaker_sess = sagemaker.Session()\n", "role = get_execution_role()\n", "role" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define input and output paths" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "boto_session = boto3.Session()\n", "ssm= boto_session.client('ssm')\n", "s3_client = boto3.client(\"s3\")\n", "bucket = ssm.get_parameter(Name=\"/aik/data-bucket\")[\"Parameter\"][\"Value\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "INPUT_BUCKET_NAME = bucket\n", "OUTPUT_BUCKET_NAME = bucket\n", "DATA_PREFIX = 'processed/sample'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def getTrainFiles(file_type='libsvm', bucket_name=INPUT_BUCKET_NAME, prefix=DATA_PREFIX):\n", " items = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=f\"{prefix}/train/\")\n", " files = []\n", " for itm in items['Contents']:\n", " if itm['Key'].endswith(file_type):\n", " files.append(itm['Key'])\n", " return files\n", "\n", "def getValidationFiles(file_type='libsvm', bucket_name=INPUT_BUCKET_NAME, prefix=DATA_PREFIX):\n", " items = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=f\"{prefix}/valid/\")\n", " files = []\n", " for itm in items['Contents']:\n", " if itm['Key'].endswith(file_type):\n", " files.append(itm['Key'])\n", " return files\n", "\n", "def getTestFiles(file_type='libsvm', bucket_name=INPUT_BUCKET_NAME, prefix=DATA_PREFIX):\n", " items = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=f\"{prefix}/test/\")\n", " files = []\n", " for itm in items['Contents']:\n", " if itm['Key'].endswith(file_type):\n", " files.append(itm['Key'])\n", " return files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file_type = 'parquet'\n", "train_files = getTrainFiles(file_type=file_type)\n", "test_files = getTestFiles(file_type=file_type)\n", "valid_files = getValidationFiles(file_type=file_type)\n", "print(f\"Train files: {len(train_files)}\")\n", "print(f\"Test files: {len(test_files)}\")\n", "print(f\"Valid files: {len(valid_files)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Overview of sample data\n", "\n", "Let's have a look into the data by downloading sample train/test/valid files from S3 and store it locally in temp folder" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "from sklearn.datasets import load_svmlight_file\n", "import pandas as pd\n", "\n", "if not os.path.exists(os.path.join(\"./temp\")):\n", " os.makedirs(os.path.join(\"./temp\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download single train/valid/test file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "file_type = 'parquet'\n", "s3_client.download_file(INPUT_BUCKET_NAME, train_files[0], f\"./temp/training_set.{file_type}\")\n", "s3_client.download_file(INPUT_BUCKET_NAME, valid_files[0], f\"temp/validation_set.{file_type}\")\n", "s3_client.download_file(INPUT_BUCKET_NAME, test_files[0], f\"temp/test_set.{file_type}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's only look into first 100k rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_pd = pd.read_parquet(f\"temp/training_set.{file_type}\")[:100000]\n", "X_train_pd.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_pd.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ensure we have only numeric values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_pd.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check the ratio of bid/no bid" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_pd[\"label\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vizualize Correlation matrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_corrs = X_train_pd.corr()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize=(40,20))\n", "sns.set(font_scale=1.5)\n", "sns.heatmap(train_corrs, annot = True, vmin=-1, vmax=1, center= 0, cmap= 'coolwarm')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that at this stage `device_type_id` has a high correlation to our target `label` (bid/no_bid). This is quite normal as we are only using a few features from the dataset. Also there is no value for features `dow` and `IndexAdvertiserID`, this is because the sample data has only one value for those column. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Distribution of feature `device_type_id` with respect to `label`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.set(font_scale=1.1)\n", "ax = X_train_pd[['device_type_id', 'label']].hist(by='label', sharey=True)\n", "ax[0].set_title(\"0 (no_bid)\")\n", "ax[1].set_title(\"1 (bid)\")\n", "plt.xticks(X_train_pd['device_type_id'].unique())\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see the values of device_type_id when label is `0` (left chart)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_pd[X_train_pd[\"label\"]==0][\"device_type_id\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When the label is `1` (right chart), we have different distribution for `device_type_id` as shown below" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_pd[X_train_pd[\"label\"]==1][\"device_type_id\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vizualise distribution of other features with respect to the label" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sns.set(font_scale=1.)\n", "for column in X_train_pd.select_dtypes(include=['object']).columns:\n", " if column != 'label':\n", " display(pd.crosstab(index=X_train_pd[column], columns=X_train_pd['label'], normalize='columns'))\n", "\n", "for column in X_train_pd.select_dtypes(exclude=['object']).columns:\n", " if column != 'label':\n", " print(column)\n", " hist = X_train_pd[[column, 'label']].hist(by='label', bins=30, sharey=True)\n", " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## XGBoost Training\n", "We are now ready to train a first simple XGboost model using the features data prepared in EMR." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "training_path = f\"{DATA_PREFIX}/train/\"\n", "validation_path = f\"{DATA_PREFIX}/valid/\"\n", "test_path = f\"{DATA_PREFIX}/test/\"\n", "training_path, validation_path, test_path " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Define train input and validation input\n", "\n", "`Pipe mode` is available for parquet format, so we will use it to stream the data from S3 directly to training instances. \n", "\n", "We are also sharding the training data into multiple instances by activating `ShardedByS3Key` option, this will allow us to train the model using multiple instances, where each instance will recieve a portion of the data. This is a must for training with 10s of GB of data on multiple instances." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sagemaker.inputs import TrainingInput\n", "s3_input_train = TrainingInput(s3_data='s3://{}/{}'.format(INPUT_BUCKET_NAME, training_path), content_type='application/x-parquet', distribution=\"ShardedByS3Key\", input_mode='Pipe') # \n", "s3_input_validation = TrainingInput(s3_data='s3://{}/{}'.format(INPUT_BUCKET_NAME, validation_path), content_type='application/x-parquet', input_mode='Pipe')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Selecting XGboost version\n", "We are using SageMaker built-in XGboost algorithm version 1.2-1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "container = sagemaker.image_uris.retrieve('xgboost', boto_session.region_name, '1.2-1')\n", "display(container)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "from time import gmtime, strftime\n", "\n", "prefix = 'sagemaker/xgb_bid_filtering'\n", "\n", "# JOB_TS = time.strftime('%Y-%m-%d-%H-%M-%S', time.gmtime())\n", "base_job_name = f'sample-single-cpu-parquet-7-features-training1st'\n", "base_job_name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### On-demand instances vs Spot intances for training" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will use on-demand instances to train, however we could use spot instances to save costs but this will add some delays in training time. Spot intances might not be available during certain times. \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "use_spot_instances = False\n", "max_run = 5400 # max 90 mins run\n", "max_wait = 7200 if use_spot_instances else None\n", "checkpoint_s3_uri = (\n", " \"s3://{}/{}/checkpoints/{}\".format(OUTPUT_BUCKET_NAME, prefix, base_job_name) if use_spot_instances else None\n", ")\n", "print(\"Checkpoint path:\", checkpoint_s3_uri)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "output_path='s3://{}/{}/output'.format(OUTPUT_BUCKET_NAME, prefix)\n", "output_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create XGBoost model and define hyperparameters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "xgb = sagemaker.estimator.Estimator(container,\n", " role,\n", " # volume_size=150, # default 30GB \n", " base_job_name=base_job_name,\n", " instance_count=1,\n", " instance_type='ml.m5.2xlarge', # Other alternatives for CPU ml.m5.12xlarge. 
For GPU ml.g4dn.4xlarge, ml.g4dn.xlarge, ml.p3.2xlarge\n", "    output_path='s3://{}/{}/output'.format(OUTPUT_BUCKET_NAME, prefix),\n", "    sagemaker_session=sagemaker_sess,\n", "    enable_sagemaker_metrics=True,\n", "    use_spot_instances=use_spot_instances, # used for managed spot training\n", "    max_run=max_run, # used for managed spot training\n", "    max_wait=max_wait, # used for managed spot training\n", "    checkpoint_s3_uri=checkpoint_s3_uri, # used for managed spot training\n", ")\n", "xgb.set_hyperparameters(max_depth=5,\n", "    # tree_method='gpu_hist', # Required when a GPU instance is chosen\n", "    eta=0.2,\n", "    gamma=4,\n", "    min_child_weight=6,\n", "    subsample=0.8,\n", "    # silent=0,\n", "    objective='binary:logistic',\n", "    early_stopping_rounds=20,\n", "    num_round=50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Start the training asynchronously" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "xgb.fit(inputs={'train': s3_input_train, 'validation': s3_input_validation},\n", "    wait=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Wait until the training job is completed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "status = sagemaker_sess.describe_training_job(xgb._current_job_name)['TrainingJobStatus']\n", "while status not in [\"Completed\", \"Failed\", \"Stopped\"]:\n", "    time.sleep(30)\n", "    status = sagemaker_sess.describe_training_job(xgb._current_job_name)['TrainingJobStatus']\n", "    print(status)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load trained model\n", "\n", "Once the training job is completed, let's load the model locally and perform evaluation on the test set." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "TRAINING_JOB_NAME = xgb._current_job_name\n", "TRAINING_JOB_NAME" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s3_model_path = xgb.model_data\n", "s3_model_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download model from S3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "model_tar_file_local_path = f\"./trained_model/{TRAINING_JOB_NAME}/model.tar.gz\"\n", "\n", "# download the trained model locally\n", "if not os.path.exists(os.path.dirname(model_tar_file_local_path)):\n", "    os.makedirs(os.path.dirname(model_tar_file_local_path))\n", "\n", "s3_client.download_file(Bucket=OUTPUT_BUCKET_NAME,\n", "    Key=s3_model_path.replace(f\"s3://{OUTPUT_BUCKET_NAME}/\", \"\"),\n", "    Filename=model_tar_file_local_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Extract model locally" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tarfile\n", "# Extract the model tar file and retrieve the model pickle file\n", "with tarfile.open(model_tar_file_local_path, \"r:gz\") as tar:\n", "    tar.extractall(path=f\"./trained_model/{TRAINING_JOB_NAME}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -q xgboost" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pickle as pkl\n", "import xgboost as xgblib\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.datasets import load_svmlight_file\n", "\n", "def model_fn(model_dir):\n", "    with 
open(os.path.join(model_dir, \"xgboost-model\"), \"rb\") as f:\n", "        booster = pkl.load(f)\n", "    return booster\n", "\n", "def local_predict(xgb_local, test_libsvm_file):\n", "    t_mat = xgblib.DMatrix(test_libsvm_file)\n", "    preds = xgb_local.predict(t_mat)\n", "    return preds\n", "\n", "def local_predict_nparray(xgb_local, np_array):\n", "    t_mat = xgblib.DMatrix(np_array)\n", "    preds = xgb_local.predict(t_mat)\n", "    return preds" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download test dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file_type = 'parquet'\n", "# Download a test file matching the model\n", "model_folder = f\"./trained_model/{TRAINING_JOB_NAME}\"\n", "test_file_s3_path = test_files[0]\n", "test_file_local_path = f\"./trained_model/{TRAINING_JOB_NAME}/test_set.{file_type}\"\n", "\n", "s3_client.download_file(INPUT_BUCKET_NAME, test_file_s3_path, test_file_local_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "xgb_local = model_fn(model_folder)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if file_type == 'parquet':\n", "    X_test_pd = pd.read_parquet(test_file_local_path)\n", "    X_test = X_test_pd.drop([\"label\"], axis=1)\n", "    y_test = X_test_pd[\"label\"].to_numpy()\n", "elif file_type == 'libsvm':\n", "    X_test, y_test = load_svmlight_file(test_file_local_path, zero_based=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Inference on test set" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "if file_type == 'parquet':\n", "    local_preds = local_predict_nparray(xgb_local, X_test)\n", "    local_y_vals = np.round(local_preds)\n", "elif file_type == 'libsvm':\n", "    local_preds = local_predict(xgb_local, test_file_local_path)\n", "    local_y_vals = np.round(local_preds)\n", "local_preds, local_y_vals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Confusion matrix" ] }, { "cell_type": "code", "execution_count": null, "id": "176438e3", "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.metrics import confusion_matrix\n", "\n", "def show_confusion_matrix(y_true, y_preds, threshold=0.5):\n", "    y_vals = np.where(y_preds > threshold, 1, 0)\n", "    cf_matrix = confusion_matrix(y_true, y_vals)\n", "    group_names = ['True Neg','False Pos','False Neg','True Pos']\n", "    group_counts = [\"{0:0.0f}\".format(value) for value in cf_matrix.flatten()]\n", "    labels = [f\"{v1}\\n{v2}\" for v1, v2 in zip(group_names, group_counts)]\n", "    labels = np.asarray(labels).reshape(2,2)\n", "\n", "    # plt.figure(figsize=(9,7))\n", "    ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')\n", "\n", "    ax.set_title('Confusion Matrix \\n\\n')\n", "    ax.set_xlabel('\\nPredicted Values')\n", "    ax.set_ylabel('Actual Values')\n", "\n", "    ax.xaxis.set_ticklabels(['No Bid','Bid'])\n", "    ax.yaxis.set_ticklabels(['No Bid','Bid'])\n", "\n", "    plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "show_confusion_matrix(y_test, local_preds)" ] }, { "cell_type": "markdown", "id": "35ae6d12", "metadata": {}, "source": [ "### Evaluation metrics\n", "\n", "Compute different evaluation metrics to assess the model performance."
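] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As a point of comparison (a sketch assuming `y_test` and `local_preds` from the cells above), ROC AUC can also be computed on the raw predicted probabilities, which makes it independent of the chosen cut-off threshold:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import roc_auc_score\n", "\n", "# AUC on the raw probabilities is threshold-independent, unlike the thresholded metrics computed below\n", "print(round(roc_auc_score(y_test, local_preds), 5))"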
] }, { "cell_type": "code", "execution_count": null, "id": "19a24219", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef\n", "\n", "def collect_eval_metrics(true_values, predicted_values, threshold=0.5):\n", " metric_df = pd.DataFrame({\"test_roc_auc_score\":[round(roc_auc_score(true_values, (predicted_values > threshold)), 5)],\n", " \"test_accuracy\":[round(accuracy_score(true_values,(predicted_values > threshold)) ,5)],\n", " \"test_recall\":[round(recall_score(true_values, (predicted_values > threshold)), 5)],\n", " \"test_precision\":[round(precision_score(true_values, (predicted_values > threshold)),5)],\n", " \"test_f1_score\":[round(f1_score(true_values, (predicted_values > threshold)),5)],\n", " \"test_matthews_corrcoef\":[round(matthews_corrcoef(true_values, (predicted_values > threshold)),5)]})\n", " return metric_df" ] }, { "cell_type": "code", "execution_count": null, "id": "d17af2db", "metadata": {}, "outputs": [], "source": [ "evaluation_metrics_df = collect_eval_metrics(y_test, local_preds)\n", "evaluation_metrics_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Histograms of probabilities" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "plt.hist(local_preds)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The above chart shows the distribution of predicted probabilities. \n", "\n", "We are using np.round() to convert the proability into the class 0 (no_bid) and 1 (bid).\n", "In this case the cut off threshold is 0.5, however for bid prediction use-case false negatives are much more important than false positives. We can change the cut-off threshold to minimize the false negatives." ] }, { "cell_type": "markdown", "id": "9e79d05d", "metadata": {}, "source": [ "### Change cut off value\n", "\n", "We will now change the default cut off value (0.5) to a different value and see the impact on confusion matrix and evaluation metrics like precision, recall, etc." 
] }, { "cell_type": "code", "execution_count": null, "id": "7016195a", "metadata": {}, "outputs": [], "source": [ "threshold = 0.1" ] }, { "cell_type": "code", "execution_count": null, "id": "12cddf61", "metadata": {}, "outputs": [], "source": [ "show_confusion_matrix(y_test, local_preds, threshold)" ] }, { "cell_type": "code", "execution_count": null, "id": "b7612ff5", "metadata": {}, "outputs": [], "source": [ "evaluation_metrics_df = collect_eval_metrics(y_test, local_preds, threshold)\n", "evaluation_metrics_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualize ROC curve" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn import metrics\n", "\n", "def show_roc_curve(y_test, y_preds):\n", " fpr1, tpr1, _ = metrics.roc_curve(y_test, y_preds)\n", " auc_title = plt.title(\"ROC Curve\")\n", " auc_full_model = plt.plot(fpr1, tpr1,\n", " color = 'blue',\n", " label = \"full model\")\n", " auc_legend = plt.legend(loc = 'lower right')\n", " random_guess = plt.plot([0,1],[0,1],'r--')\n", " xlim = plt.xlim([-0.1,1.1])\n", " ylim = plt.ylim([-0.1,1.1])\n", " ylabel = plt.ylabel('True Positive Rate')\n", " xlabel = plt.xlabel('False Positive Rate')\n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "show_roc_curve(y_test, local_preds)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save model in Binary format for inference in ECS" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save locally in the notebook" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "xgb_model = xgb_local\n", "xgb_binary_model_path = \"xgboost.bin\"\n", "\n", "xgb_model.save_model(xgb_binary_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Upload to S3 so the inference application can use it in ECS" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "binary_model_dir = ssm.get_parameter(Name=\"/aik/xgboost/path\")[\"Parameter\"][\"Value\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "binary_model_path = ssm.get_parameter(Name=\"/aik/xgboost/path\")[\"Parameter\"][\"Value\"]\n", "binary_model_path = binary_model_path.replace(\"s3://\" + bucket + \"/\", \"\")\n", "s3_client.upload_file(xgb_binary_model_path, bucket, binary_model_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save the Schema as json for inference in ECS" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%writefile schema.json\n", "{\n", " \"BidID\": \"StringType\",\n", " \"dow\": \"IntegerType\",\n", " \"hour\": \"StringType\",\n", " \"RegionID\": \"StringType\",\n", " \"CityID\": \"StringType\",\n", " \"Domain\": \"StringType\",\n", " \"AdvertiserID\": \"StringType\",\n", " \"BiddingPrice\": \"LongType\",\n", " \"PayingPrice\": \"LongType\",\n", " \"UserAgent\": \"StringType\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inference_schema= ssm.get_parameter(Name=\"/aik/pipelineModelArtifactSchemaPath\")[\"Parameter\"][\"Value\"]\n", "inference_schema_path = inference_schema.replace(\"s3://\" + bucket + \"/\", \"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s3_client.upload_file(\"schema.json\", bucket, inference_schema_path)\n" ] } ], 
"metadata": { "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3.9.2 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" }, "vscode": { "interpreter": { "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" } } }, "nbformat": 4, "nbformat_minor": 5 }