{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Input Redshift Cluster Endpoint and User\n", "Please input your redshift cluster endpoint and existing user on that cluster." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "REDSHIFT_ENDPOINT = 'redshift-cluster.xxxxxxxxxx.us-east-1.redshift.amazonaws.com:5439/dev'\n", "REDSHIFT_USER=\"awsuser\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup Run SQL function using Redshift Data API to get SQL query output directly into pandas dataframe\n", "In this step, we are creating function run_sql, which we will use to get SQL query output directly into pandas dataframe. We will also use this function to run DDL statements" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "import boto3\n", "import time\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "session = boto3.session.Session()\n", "region = session.region_name\n", "\n", "\n", "def run_sql(sql_text):\n", " client = boto3.client(\"redshift-data\")\n", " res = client.execute_statement(Database=REDSHIFT_ENDPOINT.split('/')[1], DbUser=REDSHIFT_USER, Sql=sql_text,\n", " ClusterIdentifier=REDSHIFT_ENDPOINT.split('.')[0])\n", " query_id = res[\"Id\"]\n", " done = False\n", " while not done:\n", " time.sleep(1)\n", " status_description = client.describe_statement(Id=query_id)\n", " status = status_description[\"Status\"]\n", " if status == \"FAILED\":\n", " raise Exception('SQL query failed:' + query_id + \": \" + status_description[\"Error\"])\n", " elif status == \"FINISHED\":\n", " if status_description['ResultRows']>0:\n", " results = client.get_statement_result(Id=query_id)\n", " column_labels = []\n", " for i in range(len(results[\"ColumnMetadata\"])): column_labels.append(results[\"ColumnMetadata\"][i]['label'])\n", " records = []\n", " for record in results.get('Records'):\n", " records.append([list(rec.values())[0] for rec in record])\n", " df = pd.DataFrame(np.array(records), columns=column_labels)\n", " return df\n", " else:\n", " return query_id\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Preparation Script\n", "Data preparation script to be run on Redshift" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "setup_script = \"\"\"\n", "\n", "--train table \n", "CREATE TABLE banknoteauthentication_train(\n", "variance FLOAT,\n", "skewness FLOAT,\n", "curtosis FLOAT,\n", "entrophy FLOAT,\n", "class INT);\n", "\n", "--Load \n", "COPY banknoteauthentication_train FROM 's3://redshiftbucket-ml-sagemaker/banknote_authentication/train_data/' IAM_ROLE '' REGION 'us-west-2' IGNOREHEADER 1 CSV;\n", "\n", "--test table \n", "CREATE TABLE banknoteauthentication_test(\n", "variance FLOAT,\n", "skewness FLOAT,\n", "curtosis FLOAT,\n", "entrophy FLOAT,\n", "class INT);\n", "\n", "--Load \n", "COPY banknoteauthentication_test FROM 's3://redshiftbucket-ml-sagemaker/banknote_authentication/test_data/' IAM_ROLE '' REGION 'us-west-2' IGNOREHEADER 1 CSV;\n", " \n", "\"\"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run data preparation script in Redshift" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "sql_stmt = setup_script.split(\";\")\n", "for sql_text in sql_stmt[:-1]:\n", " run_sql(sql_text);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Run Create Model statement to create a new ML model with Redshift ML" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "run_sql(\"\"\"\n", "\n", "--Create model \n", "CREATE MODEL model_banknoteauthentication_xgboost_binary FROM banknoteauthentication_train \n", "TARGET class \n", "FUNCTION func_model_banknoteauthentication_xgboost_binary \n", "IAM_ROLE '<>' \n", "AUTO OFF \n", "MODEL_TYPE xgboost \n", "OBJECTIVE 'binary:logistic' \n", "PREPROCESSORS 'none' \n", "HYPERPARAMETERS DEFAULT EXCEPT(NUM_ROUND '100') \n", "SETTINGS(S3_BUCKET '<>');\n", "\"\"\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### View Model Progress " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = run_sql(\"SHOW MODEL model_banknoteauthentication_xgboost_binary\")\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compare accuracy of the model \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "df = run_sql(\"\"\"\n", "-- check accuracy \n", "WITH infer_data AS (\n", "SELECT class AS label,\n", "func_model_banknoteauthentication_xgboost_binary (variance, skewness, curtosis, entrophy) AS predicted,\n", "CASE \n", " WHEN label IS NULL\n", " THEN 0\n", " ELSE label\n", " END AS actual,\n", "CASE \n", " WHEN actual = predicted\n", " THEN 1::INT\n", " ELSE 0::INT\n", " END AS correct\n", "FROM banknoteauthentication_test),\n", "aggr_data AS (\n", "SELECT SUM(correct) AS num_correct,\n", "COUNT(*) AS total\n", "FROM infer_data) \n", "SELECT (num_correct::FLOAT / total::FLOAT) AS accuracy FROM aggr_data;\n", "\"\"\");\n", "df\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict the count of original vs. counterfeit banknotes\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "df = run_sql(\"\"\"\n", "--check prediction \n", "WITH infer_data AS (\n", " SELECT func_model_banknoteauthentication_xgboost_binary(variance, skewness, curtosis, entrophy) AS predicted\n", " FROM banknoteauthentication_test\n", " )SELECT CASE \n", " WHEN predicted = '0'\n", " THEN 'Original banknote'\n", " WHEN predicted = '1'\n", " THEN 'Counterfeit banknote'\n", " ELSE 'NA'\n", " END AS banknote_authentication\n", " ,COUNT(1) AS count FROM infer_data GROUP BY 1;\n", "\"\"\");\n", "df\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.0" } }, "nbformat": 4, "nbformat_minor": 4 }