{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "### Input Redshift Cluster Endpoint and User\n",
    "Please input your redshift cluster endpoint and existing user on that cluster."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "REDSHIFT_ENDPOINT = 'redshift-cluster.xxxxxxxxxx.us-east-1.redshift.amazonaws.com:5439/dev'\n",
    "REDSHIFT_USER=\"awsuser\""
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Setup Run SQL function using Redshift Data API to get SQL query output directly into pandas dataframe\n",
    "In this step, we are creating function run_sql, which we will use to get SQL query output directly into pandas dataframe. We will also use this function to run DDL statements"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "\n",
    "import boto3\n",
    "import time\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "session = boto3.session.Session()\n",
    "region = session.region_name\n",
    "\n",
    "\n",
    "def run_sql(sql_text):\n",
    "    client = boto3.client(\"redshift-data\")\n",
    "    res = client.execute_statement(Database=REDSHIFT_ENDPOINT.split('/')[1], DbUser=REDSHIFT_USER, Sql=sql_text,\n",
    "                                   ClusterIdentifier=REDSHIFT_ENDPOINT.split('.')[0])\n",
    "    query_id = res[\"Id\"]\n",
    "    done = False\n",
    "    while not done:\n",
    "        time.sleep(1)\n",
    "        status_description = client.describe_statement(Id=query_id)\n",
    "        status = status_description[\"Status\"]\n",
    "        if status == \"FAILED\":\n",
    "            raise Exception('SQL query failed:' + query_id + \": \" + status_description[\"Error\"])\n",
    "        elif status == \"FINISHED\":\n",
    "            if status_description['ResultRows']>0:\n",
    "                results = client.get_statement_result(Id=query_id)\n",
    "                column_labels = []\n",
    "                for i in range(len(results[\"ColumnMetadata\"])): column_labels.append(results[\"ColumnMetadata\"][i]['label'])\n",
    "                records = []\n",
    "                for record in results.get('Records'):\n",
    "                    records.append([list(rec.values())[0] for rec in record])\n",
    "                df = pd.DataFrame(np.array(records), columns=column_labels)\n",
    "                return df\n",
    "            else:\n",
    "                return query_id\n"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Data Preparation Script\n",
    "Data preparation script to be run on Redshift"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "source": [
    "setup_script = \"\"\"\n",
    "\n",
    "DROP TABLE IF EXISTS ecommerce_sales CASCADE;\n",
    "DROP TABLE IF EXISTS ecommerce_sales_data CASCADE;\n",
    "DROP TABLE IF EXISTS ecommerce_sales_training CASCADE;\n",
    "DROP TABLE IF EXISTS ecommerce_sales_validation CASCADE;\n",
    "DROP TABLE IF EXISTS ecommerce_sales_prediction CASCADE;\n",
    "\n",
    "\n",
    "CREATE TABLE IF NOT EXISTS ecommerce_sales\n",
    "(\n",
    "  invoiceno VARCHAR(30)    \n",
    "  ,stockcode VARCHAR(30)   \n",
    "  ,description VARCHAR(60)    \n",
    "  ,quantity DOUBLE PRECISION   \n",
    "  ,invoicedate VARCHAR(30)    \n",
    "  ,unitprice    DOUBLE PRECISION\n",
    "  ,customerid BIGINT    \n",
    "  ,country VARCHAR(25)    \n",
    ")\n",
    ";\n",
    " \n",
    "Copy ecommerce_sales\n",
    "From 's3://redshift-ml-multiclass/ecommerce_data.txt'\n",
    "iam_role 'arn:aws:iam::<accountid>:role/RedshiftML' delimiter '\\t' IGNOREHEADER 1 region 'us-east-1' maxerror 100;\n",
    "\n",
    "create table ecommerce_sales_data as (\n",
    "  select\n",
    "    t1.stockcode,\n",
    "    t1.description,\n",
    "    t1.invoicedate,\n",
    "    t1.customerid,\n",
    "    t1.country,\n",
    "    t1.sales_amt,\n",
    "    cast(random() * 100 as int) as data_group_id\n",
    "  from\n",
    "    (\n",
    "      select\n",
    "        stockcode,\n",
    "        description,\n",
    "        invoicedate,\n",
    "        customerid,\n",
    "        country,\n",
    "        sum(quantity * unitprice) as sales_amt\n",
    "      from\n",
    "        ecommerce_sales\n",
    "      group by\n",
    "        1,\n",
    "        2,\n",
    "        3,\n",
    "        4,\n",
    "        5\n",
    "    ) t1\n",
    ");\n",
    "\n",
    "\n",
    "create table ecommerce_sales_training as (\n",
    "  select\n",
    "    a.customerid,\n",
    "    a.country,\n",
    "    a.stockcode,\n",
    "    a.description,\n",
    "    a.invoicedate,\n",
    "    a.sales_amt,\n",
    "    (b.nbr_months_active) as nbr_months_active\n",
    "  from\n",
    "    ecommerce_sales_data a\n",
    "    inner join (\n",
    "      select\n",
    "        customerid,\n",
    "        count(\n",
    "          distinct(\n",
    "            DATE_PART(y, cast(invoicedate as date)) || '-' || LPAD(\n",
    "              DATE_PART(mon, cast(invoicedate as date)),\n",
    "              2,\n",
    "              '00'\n",
    "            )\n",
    "          )\n",
    "        ) as nbr_months_active\n",
    "      from\n",
    "        ecommerce_sales_data\n",
    "      group by\n",
    "        1\n",
    "    ) b on a.customerid = b.customerid\n",
    "  where\n",
    "    a.data_group_id < 80\n",
    ");\n",
    "\n",
    "create table ecommerce_sales_validation as (\n",
    "  select\n",
    "    a.customerid,\n",
    "    a.country,\n",
    "    a.stockcode,\n",
    "    a.description,\n",
    "    a.invoicedate,\n",
    "    a.sales_amt,\n",
    "    (b.nbr_months_active) as nbr_months_active\n",
    "  from\n",
    "    ecommerce_sales_data a\n",
    "    inner join (\n",
    "      select\n",
    "        customerid,\n",
    "        count(\n",
    "          distinct(\n",
    "            DATE_PART(y, cast(invoicedate as date)) || '-' || LPAD(\n",
    "              DATE_PART(mon, cast(invoicedate as date)),\n",
    "              2,\n",
    "              '00'\n",
    "            )\n",
    "          )\n",
    "        ) as nbr_months_active\n",
    "      from\n",
    "        ecommerce_sales_data\n",
    "      group by\n",
    "        1\n",
    "    ) b on a.customerid = b.customerid\n",
    "  where\n",
    "    a.data_group_id between 80\n",
    "    and 90\n",
    ");\n",
    "\n",
    "create table ecommerce_sales_prediction as (\n",
    "  select\n",
    "    customerid,\n",
    "    country,\n",
    "    stockcode,\n",
    "    description,\n",
    "    invoicedate,\n",
    "    sales_amt\n",
    "  from\n",
    "    ecommerce_sales_data\n",
    "  where\n",
    "    data_group_id > 90);  \n",
    "\"\"\""
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Run data preparation script in Redshift"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "source": [
    "sql_stmt = setup_script.split(\";\")\n",
    "for sql_text in sql_stmt[:-1]:\n",
    "    run_sql(sql_text);"
   ],
   "outputs": [],
   "metadata": {
    "scrolled": true
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Run Create Model statement to create a new ML model with Redshift ML"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "source": [
    "run_sql(\"\"\"\n",
    "\n",
    "create model ecommerce_customer_activity\n",
    "from\n",
    "  (\n",
    "select   \n",
    "  customerid,\n",
    "  country,\n",
    "  stockcode,\n",
    "  description,\n",
    "  invoicedate,\n",
    "  sales_amt,\n",
    "  nbr_months_active  \n",
    " from ecommerce_sales_training)\n",
    " TARGET nbr_months_active FUNCTION predict_customer_activity\n",
    " IAM_ROLE 'arn:aws:iam::<accountid>:role/RedshiftML'\n",
    " problem_type MULTICLASS_CLASSIFICATION  \n",
    "  SETTINGS (\n",
    "    S3_BUCKET ''redshiftml-<your-account-id>'',\n",
    "    S3_GARBAGE_COLLECT OFF\n",
    "  );\n",
    "\"\"\");"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### View Model Progress "
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "source": [
    "df = run_sql(\"SHOW MODEL ecommerce_customer_activity\")\n",
    "df"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "                         Key  \\\n",
       "0                 Model Name   \n",
       "1                Schema Name   \n",
       "2                      Owner   \n",
       "3              Creation Time   \n",
       "4                Model State   \n",
       "5        Training Job Status   \n",
       "6        validation:accuracy   \n",
       "7             Estimated Cost   \n",
       "8                              \n",
       "9             TRAINING DATA:   \n",
       "10                     Query   \n",
       "11                             \n",
       "12             Target Column   \n",
       "13                             \n",
       "14               PARAMETERS:   \n",
       "15                Model Type   \n",
       "16              Problem Type   \n",
       "17                 Objective   \n",
       "18           AutoML Job Name   \n",
       "19             Function Name   \n",
       "20       Function Parameters   \n",
       "21  Function Parameter Types   \n",
       "22                  IAM Role   \n",
       "23                 S3 Bucket   \n",
       "24               Max Runtime   \n",
       "\n",
       "                                                Value  \n",
       "0                         ecommerce_customer_activity  \n",
       "1                                              public  \n",
       "2                                                demo  \n",
       "3                            Mon, 09.08.2021 14:44:06  \n",
       "4                                               READY  \n",
       "5                          MaxAutoMLJobRuntimeReached  \n",
       "6                                            0.994640  \n",
       "7                                            4.523011  \n",
       "8                                                      \n",
       "9                                                      \n",
       "10  SELECT CUSTOMERID, COUNTRY, STOCKCODE, DESCRIP...  \n",
       "11                      FROM ECOMMERCE_SALES_TRAINING  \n",
       "12                                  NBR_MONTHS_ACTIVE  \n",
       "13                                                     \n",
       "14                                                     \n",
       "15                                               auto  \n",
       "16                           MulticlassClassification  \n",
       "17                                           Accuracy  \n",
       "18                    2021-08-09-14-44-06-713069-auto  \n",
       "19                          predict_customer_activity  \n",
       "20  customerid country stockcode description invoi...  \n",
       "21       int8 varchar varchar varchar varchar float8   \n",
       "22  arn:aws:iam::845897987212:role/RedshiftDemo-Re...  \n",
       "23                           redshift-ml-845897987212  \n",
       "24                                               5400  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Key</th>\n",
       "      <th>Value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Model Name</td>\n",
       "      <td>ecommerce_customer_activity</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Schema Name</td>\n",
       "      <td>public</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Owner</td>\n",
       "      <td>demo</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Creation Time</td>\n",
       "      <td>Mon, 09.08.2021 14:44:06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Model State</td>\n",
       "      <td>READY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Training Job Status</td>\n",
       "      <td>MaxAutoMLJobRuntimeReached</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>validation:accuracy</td>\n",
       "      <td>0.994640</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Estimated Cost</td>\n",
       "      <td>4.523011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>TRAINING DATA:</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Query</td>\n",
       "      <td>SELECT CUSTOMERID, COUNTRY, STOCKCODE, DESCRIP...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td></td>\n",
       "      <td>FROM ECOMMERCE_SALES_TRAINING</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Target Column</td>\n",
       "      <td>NBR_MONTHS_ACTIVE</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>PARAMETERS:</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Model Type</td>\n",
       "      <td>auto</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Problem Type</td>\n",
       "      <td>MulticlassClassification</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Objective</td>\n",
       "      <td>Accuracy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>AutoML Job Name</td>\n",
       "      <td>2021-08-09-14-44-06-713069-auto</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Function Name</td>\n",
       "      <td>predict_customer_activity</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Function Parameters</td>\n",
       "      <td>customerid country stockcode description invoi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Function Parameter Types</td>\n",
       "      <td>int8 varchar varchar varchar varchar float8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>IAM Role</td>\n",
       "      <td>arn:aws:iam::845897987212:role/RedshiftDemo-Re...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>S3 Bucket</td>\n",
       "      <td>redshift-ml-845897987212</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Max Runtime</td>\n",
       "      <td>5400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 24
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Compare actual vs predicted\n"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "source": [
    "df = run_sql(\"\"\"\n",
    "select \n",
    " cast(sum(t1.match)as decimal(7,2)) as predicted_matches\n",
    ",cast(sum(t1.nonmatch) as decimal(7,2)) as predicted_non_matches\n",
    ",cast(sum(t1.match + t1.nonmatch) as decimal(7,2))  as total_predictions\n",
    ",predicted_matches / total_predictions as pct_accuracy\n",
    "from \n",
    "(select   \n",
    "  customerid,\n",
    "  country,\n",
    "  stockcode,\n",
    "  description,\n",
    "  invoicedate,\n",
    "  sales_amt,\n",
    "  nbr_months_active,\n",
    "  predict_customer_activity(customerid, country, stockcode, description, invoicedate, sales_amt) as predicted_months_active,\n",
    "  case when nbr_months_active = predicted_months_active then 1\n",
    "      else 0 end as match,\n",
    "  case when nbr_months_active <> predicted_months_active then 1\n",
    "    else 0 end as nonmatch\n",
    "  from ecommerce_sales_validation\n",
    "  )t1;\n",
    "\"\"\");\n",
    "df\n"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "  predicted_matches predicted_non_matches total_predictions pct_accuracy\n",
       "0          35121.00               8373.00          43494.00   0.80749068"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>predicted_matches</th>\n",
       "      <th>predicted_non_matches</th>\n",
       "      <th>total_predictions</th>\n",
       "      <th>pct_accuracy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>35121.00</td>\n",
       "      <td>8373.00</td>\n",
       "      <td>43494.00</td>\n",
       "      <td>0.80749068</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "metadata": {
    "scrolled": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Predict how many customers will qualify for loyalty program\n"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "source": [
    "df = run_sql(\"\"\"\n",
    " select \n",
    "  customerid,  \n",
    "  predict_customer_activity(customerid, country, stockcode, description, invoicedate, sales_amt) as predicted_months_active\n",
    "  from ecommerce_sales_prediction\n",
    " where predicted_months_active >=7\n",
    " group by 1,2\n",
    " limit 10;\n",
    "\"\"\");\n",
    "df\n"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "   customerid  predicted_months_active\n",
       "0       14031                       12\n",
       "1       14849                       11\n",
       "2       15044                       11\n",
       "3       14156                       12\n",
       "4       18017                       10\n",
       "5       16145                       11\n",
       "6       16670                        9\n",
       "7       14944                        8\n",
       "8       17512                        7\n",
       "9       14051                       13"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customerid</th>\n",
       "      <th>predicted_months_active</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>14031</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>14849</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15044</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>14156</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>18017</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>16145</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>16670</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>14944</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>17512</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>14051</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "metadata": {
    "scrolled": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}