{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Input Redshift Cluster Endpoint and User\n",
    "Please input your redshift cluster endpoint and existing user on that cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "REDSHIFT_ENDPOINT = 'redshift-cluster.xxxxxxxxxx.us-east-1.redshift.amazonaws.com:5439/dev'\n",
    "REDSHIFT_USER=\"awsuser\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Setup Run SQL function using Redshift Data API to get SQL query output directly into pandas dataframe\n",
    "In this step, we are creating function run_sql, which we will use to get SQL query output directly into pandas dataframe. We will also use this function to run DDL statements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import boto3\n",
    "import time\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "session = boto3.session.Session()\n",
    "region = session.region_name\n",
    "\n",
    "\n",
    "def run_sql(sql_text):\n",
    "    client = boto3.client(\"redshift-data\")\n",
    "    res = client.execute_statement(Database=REDSHIFT_ENDPOINT.split('/')[1], DbUser=REDSHIFT_USER, Sql=sql_text,\n",
    "                                   ClusterIdentifier=REDSHIFT_ENDPOINT.split('.')[0])\n",
    "    query_id = res[\"Id\"]\n",
    "    done = False\n",
    "    while not done:\n",
    "        time.sleep(1)\n",
    "        status_description = client.describe_statement(Id=query_id)\n",
    "        status = status_description[\"Status\"]\n",
    "        if status == \"FAILED\":\n",
    "            raise Exception('SQL query failed:' + query_id + \": \" + status_description[\"Error\"])\n",
    "        elif status == \"FINISHED\":\n",
    "            if status_description['ResultRows']>0:\n",
    "                results = client.get_statement_result(Id=query_id)\n",
    "                column_labels = []\n",
    "                for i in range(len(results[\"ColumnMetadata\"])): column_labels.append(results[\"ColumnMetadata\"][i]['label'])\n",
    "                records = []\n",
    "                for record in results.get('Records'):\n",
    "                    records.append([list(rec.values())[0] for rec in record])\n",
    "                df = pd.DataFrame(np.array(records), columns=column_labels)\n",
    "                return df\n",
    "            else:\n",
    "                return query_id\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Preparation Script\n",
    "Data preparation script to be run on Redshift"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "setup_script = \"\"\"\n",
    "\n",
    "--train table \n",
    "CREATE TABLE banknoteauthentication_train(\n",
    "variance FLOAT,\n",
    "skewness FLOAT,\n",
    "curtosis FLOAT,\n",
    "entrophy FLOAT,\n",
    "class INT);\n",
    "\n",
    "--Load \n",
    "COPY banknoteauthentication_train FROM 's3://redshiftbucket-ml-sagemaker/banknote_authentication/train_data/' IAM_ROLE '<your-amazon-redshift-sagemaker-iam-role-arn>' REGION 'us-west-2' IGNOREHEADER 1 CSV;\n",
    "\n",
    "--test table \n",
    "CREATE TABLE banknoteauthentication_test(\n",
    "variance FLOAT,\n",
    "skewness FLOAT,\n",
    "curtosis FLOAT,\n",
    "entrophy FLOAT,\n",
    "class INT);\n",
    "\n",
    "--Load \n",
    "COPY banknoteauthentication_test FROM 's3://redshiftbucket-ml-sagemaker/banknote_authentication/test_data/' IAM_ROLE '<your-amazon-redshift-sagemaker-iam-role-arn>' REGION 'us-west-2' IGNOREHEADER 1 CSV;\n",
    " \n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run data preparation script in Redshift"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "sql_stmt = setup_script.split(\";\")\n",
    "for sql_text in sql_stmt[:-1]:\n",
    "    run_sql(sql_text);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Create Model statement to create a new ML model with Redshift ML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "run_sql(\"\"\"\n",
    "\n",
    "--Create model \n",
    "CREATE MODEL model_banknoteauthentication_xgboost_binary FROM banknoteauthentication_train \n",
    "TARGET class \n",
    "FUNCTION func_model_banknoteauthentication_xgboost_binary \n",
    "IAM_ROLE '<<your-amazon-redshift-sagemaker-iam-role-arn>>' \n",
    "AUTO OFF \n",
    "MODEL_TYPE xgboost \n",
    "OBJECTIVE 'binary:logistic' \n",
    "PREPROCESSORS 'none' \n",
    "HYPERPARAMETERS DEFAULT EXCEPT(NUM_ROUND '100') \n",
    "SETTINGS(S3_BUCKET '<<your-amazon-s3-bucket-name>>');\n",
    "\"\"\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### View Model Progress "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = run_sql(\"SHOW MODEL model_banknoteauthentication_xgboost_binary\")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare accuracy of the model \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df = run_sql(\"\"\"\n",
    "-- check accuracy \n",
    "WITH infer_data AS (\n",
    "SELECT class AS label,\n",
    "func_model_banknoteauthentication_xgboost_binary (variance, skewness, curtosis, entrophy) AS predicted,\n",
    "CASE \n",
    "   WHEN label IS NULL\n",
    "       THEN 0\n",
    "   ELSE label\n",
    "   END AS actual,\n",
    "CASE \n",
    "   WHEN actual = predicted\n",
    "       THEN 1::INT\n",
    "   ELSE 0::INT\n",
    "   END AS correct\n",
    "FROM banknoteauthentication_test),\n",
    "aggr_data AS (\n",
    "SELECT SUM(correct) AS num_correct,\n",
    "COUNT(*) AS total\n",
    "FROM infer_data) \n",
    "SELECT (num_correct::FLOAT / total::FLOAT) AS accuracy FROM aggr_data;\n",
    "\"\"\");\n",
    "df\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict the count of original vs. counterfeit banknotes\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df = run_sql(\"\"\"\n",
    "--check prediction \n",
    "WITH infer_data AS (\n",
    "    SELECT func_model_banknoteauthentication_xgboost_binary(variance, skewness, curtosis, entrophy) AS predicted\n",
    "    FROM banknoteauthentication_test\n",
    "    )SELECT CASE \n",
    "        WHEN predicted = '0'\n",
    "            THEN 'Original banknote'\n",
    "        WHEN predicted = '1'\n",
    "            THEN 'Counterfeit banknote'\n",
    "        ELSE 'NA'\n",
    "        END AS banknote_authentication\n",
    "    ,COUNT(1) AS count FROM infer_data GROUP BY 1;\n",
    "\"\"\");\n",
    "df\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}