{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fraud Detection\n",
    "\n",
    "This notebook shows how to use Amazon Sagemaker Processsing to prepare the data. \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sagemaker Initial Setup\n",
    "The below code is used to get the S3 Bucket name configured for SageMaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install -qU sagemaker\n",
    "import sagemaker\n",
    "from sagemaker import get_execution_role\n",
    "\n",
    "role = get_execution_role()\n",
    "sess = sagemaker.Session()\n",
    "bucket = sess.default_bucket() \n",
    "print(\"S3 bucket name: \", bucket)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "The below code is used to get the pre-built Docker Image URL for scikit-learn container stored in Amazon Elastic Container Registry (Amazon ECR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "\n",
    "region = boto3.session.Session().region_name\n",
    "accountRegionMap =\t{\n",
    "  \"us-west-1\": \"746614075791\",\"us-west-2\": \"246618743249\",\"us-east-1\": \"683313688378\",\n",
    "  \"us-east-2\": \"257758044811\",\"ap-northeast-1\": \"354813040037\",\"ap-northeast-2\": \"354813040037\",\n",
    "  \"ap-southeast-1\": \"121021644041\",\"ap-southeast-2\": \"783357654285\",\"ap-south-1\": \"720646828776\",\n",
    "  \"eu-west-1\": \"141502667606\",\"eu-west-2\": \"764974769150\",\"eu-central-1\": \"492215442770\",\"ca-central-1\": \"341280168497\"\n",
    "}\n",
    "\n",
    "account_id = accountRegionMap[region]\n",
    "ecr_repository = \"sagemaker-scikit-learn\"\n",
    "tag = \":0.23-1-cpu-py3\"\n",
    "\n",
    "uri_suffix = \"amazonaws.com\"\n",
    "if region in [\"cn-north-1\", \"cn-northwest-1\"]:\n",
    "    uri_suffix = \"amazonaws.com.cn\"\n",
    "ecr_repository_uri = \"{}.dkr.ecr.{}.{}/{}\".format(\n",
    "    account_id, region, uri_suffix, ecr_repository + tag\n",
    ")\n",
    "\n",
    "print(\"SageMaker processing repository uri: \", ecr_repository_uri)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Amazon Sagemaker Data Preprocessing "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The \"source\" field is used for the dataset and the \"destination\" is used to store the prepared data\n",
    "source = 's3://'+bucket+'/data-preparation-using-amazon-sagemaker-and-glue-databrew/DataSet/insurance_claims.csv'\n",
    "destination = 's3://'+bucket+'/data-preparation-using-amazon-sagemaker-and-glue-databrew/Results/DataProcessing'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile AutoInsuranceFraudProcessing.py\n",
    "#This block of code generates a file \"AutoInsuranceFraudProcessing.py\" which has the code to process the data\n",
    "\n",
    "import argparse\n",
    "import os\n",
    "import warnings\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, KBinsDiscretizer\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from sklearn.compose import make_column_transformer\n",
    "\n",
    "from sklearn.exceptions import DataConversionWarning\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "warnings.filterwarnings(action=\"ignore\", category=DataConversionWarning)\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    #get arguments\n",
    "    parser = argparse.ArgumentParser()\n",
    "    args, _ = parser.parse_known_args()\n",
    "    print(\"Received arguments {}\".format(args))\n",
    "    \n",
    "    #get the input data\n",
    "    input_data_path = os.path.join(\"/opt/ml/processing/input\", \"insurance_claims.csv\")\n",
    "    print(\"Reading input data from {}\".format(input_data_path))\n",
    "    df = pd.read_csv(input_data_path)\n",
    "    df = pd.DataFrame(data=df)\n",
    "    print(df.head())\n",
    "\n",
    "    #replacing ? with nan for the columns\n",
    "    df['police_report_available']=df['police_report_available'].replace('?',np.nan)\n",
    "    #dropping the unnecessary rows\n",
    "    df=df.dropna(subset=['police_report_available'])\n",
    "    \n",
    "    #drop unnecessary columns\n",
    "    df=df.drop(['months_as_customer'],axis=1)\n",
    "    \n",
    "    #now deal with the categorical features\n",
    "    #for the columns insured_sex and fraud_reported\n",
    "    le=LabelEncoder()\n",
    "    for i in list(df.columns):\n",
    "        if df[i].dtype=='object':\n",
    "            df[i]=le.fit_transform(df[i])\n",
    "    \n",
    "    #final preprocessed data\n",
    "    print(df.head())\n",
    "    train_features_output_path = os.path.join(\"/opt/ml/processing/output\", \"preprocessed_data.csv\")\n",
    "    df.to_csv(train_features_output_path, index=False)\n",
    "    print(\"done\")\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Preprocessing job with Amazon SageMaker Processing\n",
    "\n",
    "The script we have defined at `AutoInsuranceFraudProcessing.py` performs data preprocessing transformations on the raw data. The preproceesing involes replacing values, droping rows, dropping columns and categorical encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput\n",
    "\n",
    "script_processor = ScriptProcessor(command=['python3'],\n",
    "                                   image_uri=ecr_repository_uri,\n",
    "                                   role=role,\n",
    "                                   instance_count=1,\n",
    "                                   instance_type='ml.m4.2xlarge')\n",
    "\n",
    "script_processor.run(code='AutoInsuranceFraudProcessing.py',\n",
    "                     inputs=[ProcessingInput(source=source,\n",
    "                                             destination='/opt/ml/processing/input')],\n",
    "                     outputs=[ProcessingOutput(output_name=\"preprocessed_data.csv\", destination=destination,\n",
    "                                               source='/opt/ml/processing/output')])\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### View Results of Data Preprocessing\n",
    "\n",
    "Once the preprocessing job is complete, we can take a look at the contents of the S3 bucket to see the transformed data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessing_job_description = script_processor.jobs[-1].describe()\n",
    "\n",
    "output_config = preprocessing_job_description[\"ProcessingOutputConfig\"]\n",
    "for output in output_config[\"Outputs\"]:\n",
    "    print(output)\n",
    "    preprocessed_data = output[\"S3Output\"][\"S3Uri\"]\n",
    "    print(preprocessed_data)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once the training is completed, the training instances are automatically saved and SageMaker stores the trained model and evaluation results to a location in S3."
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:ap-south-1:394103062818:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}