{ "cells": [ { "cell_type": "markdown", "source": [ "### Input Redshift Cluster Endpoint and User\n", "Please input your redshift cluster endpoint and existing user on that cluster." ], "metadata": {} }, { "cell_type": "code", "execution_count": 1, "source": [ "REDSHIFT_ENDPOINT = 'redshift-cluster.xxxxxxxxxx.us-east-1.redshift.amazonaws.com:5439/dev'\n", "REDSHIFT_USER=\"awsuser\"" ], "outputs": [], "metadata": {} }, { "cell_type": "markdown", "source": [ "### Setup Run SQL function using Redshift Data API to get SQL query output directly into pandas dataframe\n", "In this step, we are creating function run_sql, which we will use to get SQL query output directly into pandas dataframe. We will also use this function to run DDL statements" ], "metadata": {} }, { "cell_type": "code", "execution_count": 2, "source": [ "\n", "import boto3\n", "import time\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "session = boto3.session.Session()\n", "region = session.region_name\n", "\n", "\n", "def run_sql(sql_text):\n", " client = boto3.client(\"redshift-data\")\n", " res = client.execute_statement(Database=REDSHIFT_ENDPOINT.split('/')[1], DbUser=REDSHIFT_USER, Sql=sql_text,\n", " ClusterIdentifier=REDSHIFT_ENDPOINT.split('.')[0])\n", " query_id = res[\"Id\"]\n", " done = False\n", " while not done:\n", " time.sleep(1)\n", " status_description = client.describe_statement(Id=query_id)\n", " status = status_description[\"Status\"]\n", " if status == \"FAILED\":\n", " raise Exception('SQL query failed:' + query_id + \": \" + status_description[\"Error\"])\n", " elif status == \"FINISHED\":\n", " if status_description['ResultRows']>0:\n", " results = client.get_statement_result(Id=query_id)\n", " column_labels = []\n", " for i in range(len(results[\"ColumnMetadata\"])): column_labels.append(results[\"ColumnMetadata\"][i]['label'])\n", " records = []\n", " for record in results.get('Records'):\n", " records.append([list(rec.values())[0] for rec in record])\n", " df = pd.DataFrame(np.array(records), columns=column_labels)\n", " return df\n", " else:\n", " return query_id\n" ], "outputs": [], "metadata": {} }, { "cell_type": "markdown", "source": [ "## Data Preparation Script\n", "Data preparation script to be run on Redshift" ], "metadata": {} }, { "cell_type": "code", "execution_count": 32, "source": [ "setup_script = \"\"\"\n", "\n", "DROP TABLE IF EXISTS ecommerce_sales CASCADE;\n", "DROP TABLE IF EXISTS ecommerce_sales_data CASCADE;\n", "DROP TABLE IF EXISTS ecommerce_sales_training CASCADE;\n", "DROP TABLE IF EXISTS ecommerce_sales_validation CASCADE;\n", "DROP TABLE IF EXISTS ecommerce_sales_prediction CASCADE;\n", "\n", "\n", "CREATE TABLE IF NOT EXISTS ecommerce_sales\n", "(\n", " invoiceno VARCHAR(30) \n", " ,stockcode VARCHAR(30) \n", " ,description VARCHAR(60) \n", " ,quantity DOUBLE PRECISION \n", " ,invoicedate VARCHAR(30) \n", " ,unitprice DOUBLE PRECISION\n", " ,customerid BIGINT \n", " ,country VARCHAR(25) \n", ")\n", ";\n", " \n", "Copy ecommerce_sales\n", "From 's3://redshift-ml-multiclass/ecommerce_data.txt'\n", "iam_role 'arn:aws:iam:::role/RedshiftML' delimiter '\\t' IGNOREHEADER 1 region 'us-east-1' maxerror 100;\n", "\n", "create table ecommerce_sales_data as (\n", " select\n", " t1.stockcode,\n", " t1.description,\n", " t1.invoicedate,\n", " t1.customerid,\n", " t1.country,\n", " t1.sales_amt,\n", " cast(random() * 100 as int) as data_group_id\n", " from\n", " (\n", " select\n", " stockcode,\n", " description,\n", " invoicedate,\n", " customerid,\n", " country,\n", " sum(quantity * unitprice) as sales_amt\n", " from\n", " ecommerce_sales\n", " group by\n", " 1,\n", " 2,\n", " 3,\n", " 4,\n", " 5\n", " ) t1\n", ");\n", "\n", "\n", "create table ecommerce_sales_training as (\n", " select\n", " a.customerid,\n", " a.country,\n", " a.stockcode,\n", " a.description,\n", " a.invoicedate,\n", " a.sales_amt,\n", " (b.nbr_months_active) as nbr_months_active\n", " from\n", " ecommerce_sales_data a\n", " inner join (\n", " select\n", " customerid,\n", " count(\n", " distinct(\n", " DATE_PART(y, cast(invoicedate as date)) || '-' || LPAD(\n", " DATE_PART(mon, cast(invoicedate as date)),\n", " 2,\n", " '00'\n", " )\n", " )\n", " ) as nbr_months_active\n", " from\n", " ecommerce_sales_data\n", " group by\n", " 1\n", " ) b on a.customerid = b.customerid\n", " where\n", " a.data_group_id < 80\n", ");\n", "\n", "create table ecommerce_sales_validation as (\n", " select\n", " a.customerid,\n", " a.country,\n", " a.stockcode,\n", " a.description,\n", " a.invoicedate,\n", " a.sales_amt,\n", " (b.nbr_months_active) as nbr_months_active\n", " from\n", " ecommerce_sales_data a\n", " inner join (\n", " select\n", " customerid,\n", " count(\n", " distinct(\n", " DATE_PART(y, cast(invoicedate as date)) || '-' || LPAD(\n", " DATE_PART(mon, cast(invoicedate as date)),\n", " 2,\n", " '00'\n", " )\n", " )\n", " ) as nbr_months_active\n", " from\n", " ecommerce_sales_data\n", " group by\n", " 1\n", " ) b on a.customerid = b.customerid\n", " where\n", " a.data_group_id between 80\n", " and 90\n", ");\n", "\n", "create table ecommerce_sales_prediction as (\n", " select\n", " customerid,\n", " country,\n", " stockcode,\n", " description,\n", " invoicedate,\n", " sales_amt\n", " from\n", " ecommerce_sales_data\n", " where\n", " data_group_id > 90); \n", "\"\"\"" ], "outputs": [], "metadata": {} }, { "cell_type": "markdown", "source": [ "### Run data preparation script in Redshift" ], "metadata": {} }, { "cell_type": "code", "execution_count": 33, "source": [ "sql_stmt = setup_script.split(\";\")\n", "for sql_text in sql_stmt[:-1]:\n", " run_sql(sql_text);" ], "outputs": [], "metadata": { "scrolled": true } }, { "cell_type": "markdown", "source": [ "### Run Create Model statement to create a new ML model with Redshift ML" ], "metadata": {} }, { "cell_type": "code", "execution_count": 13, "source": [ "run_sql(\"\"\"\n", "\n", "create model ecommerce_customer_activity\n", "from\n", " (\n", "select \n", " customerid,\n", " country,\n", " stockcode,\n", " description,\n", " invoicedate,\n", " sales_amt,\n", " nbr_months_active \n", " from ecommerce_sales_training)\n", " TARGET nbr_months_active FUNCTION predict_customer_activity\n", " IAM_ROLE 'arn:aws:iam:::role/RedshiftML'\n", " problem_type MULTICLASS_CLASSIFICATION \n", " SETTINGS (\n", " S3_BUCKET ''redshiftml-'',\n", " S3_GARBAGE_COLLECT OFF\n", " );\n", "\"\"\");" ], "outputs": [], "metadata": {} }, { "cell_type": "markdown", "source": [ "### View Model Progress " ], "metadata": {} }, { "cell_type": "code", "execution_count": 24, "source": [ "df = run_sql(\"SHOW MODEL ecommerce_customer_activity\")\n", "df" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Key \\\n", "0 Model Name \n", "1 Schema Name \n", "2 Owner \n", "3 Creation Time \n", "4 Model State \n", "5 Training Job Status \n", "6 validation:accuracy \n", "7 Estimated Cost \n", "8 \n", "9 TRAINING DATA: \n", "10 Query \n", "11 \n", "12 Target Column \n", "13 \n", "14 PARAMETERS: \n", "15 Model Type \n", "16 Problem Type \n", "17 Objective \n", "18 AutoML Job Name \n", "19 Function Name \n", "20 Function Parameters \n", "21 Function Parameter Types \n", "22 IAM Role \n", "23 S3 Bucket \n", "24 Max Runtime \n", "\n", " Value \n", "0 ecommerce_customer_activity \n", "1 public \n", "2 demo \n", "3 Mon, 09.08.2021 14:44:06 \n", "4 READY \n", "5 MaxAutoMLJobRuntimeReached \n", "6 0.994640 \n", "7 4.523011 \n", "8 \n", "9 \n", "10 SELECT CUSTOMERID, COUNTRY, STOCKCODE, DESCRIP... \n", "11 FROM ECOMMERCE_SALES_TRAINING \n", "12 NBR_MONTHS_ACTIVE \n", "13 \n", "14 \n", "15 auto \n", "16 MulticlassClassification \n", "17 Accuracy \n", "18 2021-08-09-14-44-06-713069-auto \n", "19 predict_customer_activity \n", "20 customerid country stockcode description invoi... \n", "21 int8 varchar varchar varchar varchar float8 \n", "22 arn:aws:iam::845897987212:role/RedshiftDemo-Re... \n", "23 redshift-ml-845897987212 \n", "24 5400 " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KeyValue
0Model Nameecommerce_customer_activity
1Schema Namepublic
2Ownerdemo
3Creation TimeMon, 09.08.2021 14:44:06
4Model StateREADY
5Training Job StatusMaxAutoMLJobRuntimeReached
6validation:accuracy0.994640
7Estimated Cost4.523011
8
9TRAINING DATA:
10QuerySELECT CUSTOMERID, COUNTRY, STOCKCODE, DESCRIP...
11FROM ECOMMERCE_SALES_TRAINING
12Target ColumnNBR_MONTHS_ACTIVE
13
14PARAMETERS:
15Model Typeauto
16Problem TypeMulticlassClassification
17ObjectiveAccuracy
18AutoML Job Name2021-08-09-14-44-06-713069-auto
19Function Namepredict_customer_activity
20Function Parameterscustomerid country stockcode description invoi...
21Function Parameter Typesint8 varchar varchar varchar varchar float8
22IAM Rolearn:aws:iam::845897987212:role/RedshiftDemo-Re...
23S3 Bucketredshift-ml-845897987212
24Max Runtime5400
\n", "
" ] }, "metadata": {}, "execution_count": 24 } ], "metadata": {} }, { "cell_type": "markdown", "source": [ "## Compare actual vs predicted\n" ], "metadata": {} }, { "cell_type": "code", "execution_count": 7, "source": [ "df = run_sql(\"\"\"\n", "select \n", " cast(sum(t1.match)as decimal(7,2)) as predicted_matches\n", ",cast(sum(t1.nonmatch) as decimal(7,2)) as predicted_non_matches\n", ",cast(sum(t1.match + t1.nonmatch) as decimal(7,2)) as total_predictions\n", ",predicted_matches / total_predictions as pct_accuracy\n", "from \n", "(select \n", " customerid,\n", " country,\n", " stockcode,\n", " description,\n", " invoicedate,\n", " sales_amt,\n", " nbr_months_active,\n", " predict_customer_activity(customerid, country, stockcode, description, invoicedate, sales_amt) as predicted_months_active,\n", " case when nbr_months_active = predicted_months_active then 1\n", " else 0 end as match,\n", " case when nbr_months_active <> predicted_months_active then 1\n", " else 0 end as nonmatch\n", " from ecommerce_sales_validation\n", " )t1;\n", "\"\"\");\n", "df\n" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " predicted_matches predicted_non_matches total_predictions pct_accuracy\n", "0 35121.00 8373.00 43494.00 0.80749068" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
predicted_matchespredicted_non_matchestotal_predictionspct_accuracy
035121.008373.0043494.000.80749068
\n", "
" ] }, "metadata": {}, "execution_count": 7 } ], "metadata": { "scrolled": false } }, { "cell_type": "markdown", "source": [ "## Predict how many customers will qualify for loyalty program\n" ], "metadata": {} }, { "cell_type": "code", "execution_count": 8, "source": [ "df = run_sql(\"\"\"\n", " select \n", " customerid, \n", " predict_customer_activity(customerid, country, stockcode, description, invoicedate, sales_amt) as predicted_months_active\n", " from ecommerce_sales_prediction\n", " where predicted_months_active >=7\n", " group by 1,2\n", " limit 10;\n", "\"\"\");\n", "df\n" ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " customerid predicted_months_active\n", "0 14031 12\n", "1 14849 11\n", "2 15044 11\n", "3 14156 12\n", "4 18017 10\n", "5 16145 11\n", "6 16670 9\n", "7 14944 8\n", "8 17512 7\n", "9 14051 13" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customeridpredicted_months_active
01403112
11484911
21504411
31415612
41801710
51614511
6166709
7149448
8175127
91405113
\n", "
" ] }, "metadata": {}, "execution_count": 8 } ], "metadata": { "scrolled": false } } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }