{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature engineering & selection"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "**Jupyter Kernel**:\n",
    "\n",
    "\n",
    "* If you are in SageMaker Studio, make sure that you use the **PyTorch 1.10 Python 3.8 CPU Optimized** environment.\n",
    "* Make sure that you are using one of the following instance types: `ml.m5.large` or `ml.g4dn.xlarge`.\n",
    "\n",
    "**Run All**: \n",
    "\n",
    "* If you are in SageMaker Studio, you can choose **Run All Cells** from the **Run** tab dropdown menu to run the entire notebook at once."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Install dependencies that will be used in this notebook.\n",
    "# `%pip` installs into the environment of the running kernel, whereas `!pip3`\n",
    "# runs whichever pip3 is first on the shell PATH, which may be another interpreter.\n",
    "%pip install -r ./utils/requirements.in -q"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "This solution relies on a config file that describes the provisioned AWS resources. Run the cells below to generate that file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import os\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Find the Service Catalog provisioned product this notebook belongs to: its name\n",
    "# is the path component that directly follows 'S3Downloads' in the working directory.\n",
    "client = boto3.client('servicecatalog')\n",
    "cwd = os.getcwd().split('/')\n",
    "i = cwd.index('S3Downloads')\n",
    "pp_name = cwd[i + 1]\n",
    "\n",
    "# Fetch the record of the last successful provisioning run\n",
    "pp = client.describe_provisioned_product(Name=pp_name)\n",
    "record_id = pp['ProvisionedProductDetail']['LastSuccessfulProvisioningRecordId']\n",
    "record = client.describe_record(Id=record_id)\n",
    "\n",
    "# Keep only the outputs that carry BOTH a key and a value.\n",
    "# `'OutputKey' and 'OutputValue' in x` evaluates to `'OutputValue' in x` alone\n",
    "# (a non-empty string is always truthy), so an output without 'OutputKey'\n",
    "# would raise KeyError; each membership test has to be spelled out.\n",
    "stack_output = {\n",
    "    x['OutputKey']: x['OutputValue']\n",
    "    for x in record['RecordOutputs']\n",
    "    if 'OutputKey' in x and 'OutputValue' in x\n",
    "}\n",
    "\n",
    "# Persist the stack outputs so the next cell can read them back\n",
    "with open(f'/root/S3Downloads/{pp_name}/stack_outputs.json', 'w') as f:\n",
    "    json.dump(stack_output, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the stack outputs; the context manager closes the file handle right away\n",
    "# instead of leaving the open file to the garbage collector.\n",
    "# NOTE(review): this path is relative to the working directory, while the cell above\n",
    "# writes to an absolute path — they coincide only when the notebook runs from there.\n",
    "with open(\"stack_outputs.json\") as config_file:\n",
    "    sagemaker_config = json.load(config_file)\n",
    "\n",
    "# Values taken from the provisioned stack\n",
    "SOLUTION_BUCKET = sagemaker_config[\"SolutionS3Bucket\"]\n",
    "AWS_REGION = sagemaker_config[\"AWSRegion\"]\n",
    "SOLUTION_NAME = sagemaker_config[\"SolutionName\"]\n",
    "AWS_S3_BUCKET = sagemaker_config[\"S3Bucket\"]\n",
    "\n",
    "# S3 keys and key prefixes used by the solution\n",
    "KEY_YIELD_CURVE = \"data/raw/yield_curve_field_dt.csv\"\n",
    "SPATIAL_FILES_KEY = \"data/spatial-files\"\n",
    "FIPS_STATS_KEY = \"data/fips-stats/fips_county_stats.csv\"\n",
    "FIPS_POLYGONS_KEY = \"data/fips-stats/geojson-counties-fips.json\"\n",
    "SENTINEL_2_SHAPEFILE_KEY = \"data/sentinel-2-shapefiles\"\n",
    "CROPS_MASK_KEY = \"data/crop_mask/raw\"\n",
    "REQUEST_MANIFESTS_KEY = \"request_manifests/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set up the environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import numpy as np\n",
    "import json\n",
    "import datetime\n",
    "import seaborn as sns\n",
    "import boto3\n",
    "import io\n",
    "import os\n",
    "import s3fs\n",
    "import itertools as it\n",
    "import time\n",
    "import spyndex\n",
    "import dask.dataframe as dd\n",
    "from typing import List\n",
    "import warnings\n",
    "import cmapy\n",
    "import random\n",
    "\n",
    "from sklearn.preprocessing import RobustScaler\n",
    "from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
    "\n",
    "from matplotlib.cm import get_cmap\n",
    "from matplotlib.colors import Colormap, CenteredNorm, rgb2hex\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly.express as px\n",
    "\n",
    "import xgboost\n",
    "import shap\n",
    "\n",
    "import sagemaker\n",
    "\n",
    "from utils.helper_functions import (\n",
    "    download_s3_folder,\n",
    "    convert_dates,\n",
    "    heatmap_pandas,\n",
    "    centered_gradient,\n",
    "    global_shap_importance\n",
    "    \n",
    ")\n",
    "\n",
    "warnings.simplefilter('ignore')\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Define a few variables to use throughout the notebook\n",
    "\n",
    "YEAR = 2018 # crop year\n",
    "CROP_TYPE = 'corn' # crop type\n",
    "CROP_REGION = '2-Central' # Illinois region\n",
    "FEATURES_SELECTION_QUANTILE = .75 # upper quantile used with the statistical-based feature selection"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download spatial files locally."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "download_s3_folder(AWS_S3_BUCKET,SPATIAL_FILES_KEY, \"tmp/spatial-files\")\n",
    "download_s3_folder(AWS_S3_BUCKET,SENTINEL_2_SHAPEFILE_KEY, \"tmp/Sentinel-2-Shapefile-Index\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the dataset that includes the output of the simulations at the field level (`id_10` and `id_field`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "data = pd.read_csv(\n",
    "    f\"s3://{AWS_S3_BUCKET}/{KEY_YIELD_CURVE}\",\n",
    "    index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the `geojson` file to recover the cell IDs selected in `00 Geospatial Processing.ipynb`. This file includes FIPS, COUNTIES, CELLS, and the FIELDS mapping."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "gp_cells_ids = gpd.read_file(\"tmp/zonal_fips_stats.geojson\")\n",
    "cells_ids = list(gp_cells_ids.id_10.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# filter by YEAR and REGION\n",
    "df = data[(data.year == YEAR) & (data.region == CROP_REGION)]\n",
    "\n",
    "# filter by the selected cells IDs\n",
    "df = df[df.id_10.isin(cells_ids)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### Read zonal statistics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> **Note**: The files read in this section were produced in the `00 Geospatial Processing.ipynb` notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Read every zonal-statistics CSV stored for the configured crop and year.\n",
    "# `include_path_column` records each row's source file in a `path` column,\n",
    "# which a later cell parses to recover the isoweek.\n",
    "df_stats = dd.read_csv(\n",
    "    f\"s3://{AWS_S3_BUCKET}/data/zonal-statistics-allbands/\"\n",
    "    f\"{CROP_TYPE}/{YEAR}/*/*\",\n",
    "    dtype={\"id_10\": object},\n",
    "    include_path_column=True\n",
    ").compute()\n",
    "\n",
    "# Label the rows with the crop that was actually loaded instead of a hard-coded\n",
    "# 'corn', so the label stays correct if CROP_TYPE is changed.\n",
    "df_stats['crop_type'] = CROP_TYPE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Retrieve isoweek from path\n",
    "df_stats['isoweek'] = df_stats['path'].str.split('/').str[-2].str.split(\"-\").str[-1].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Regular expression matching the columns to keep: the mean statistics plus\n",
    "# the identifier / grouping columns\n",
    "cols = \"mean|id_10|FIPS|crop_type|isoweek\"\n",
    "\n",
    "keep_mask = df_stats.columns.str.contains(cols)\n",
    "\n",
    "# Keep the matching columns, then discard every row holding inf, -inf or NaN\n",
    "df_stats_mean = (\n",
    "    df_stats.loc[:, keep_mask]\n",
    "    .replace([np.inf, -np.inf], np.nan)\n",
    "    .dropna(axis=0)\n",
    ")\n",
    "\n",
    "# id_10 was read with object dtype; store it as an integer from here on\n",
    "df_stats_mean['id_10'] = df_stats_mean['id_10'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "numerical_cols = list(df_stats_mean.columns[df_stats_mean.columns.str.contains(\"mean\")])\n",
    "\n",
    "# Rescale the numerical features\n",
    "scaler = RobustScaler()\n",
    "df_stats_mean[numerical_cols] = scaler.fit_transform(df_stats_mean[numerical_cols])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### SVIs signature variations (Grouped by isoweek)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for crop in df_stats_mean.crop_type.unique():\n",
    "    for week in df_stats_mean.isoweek.unique():\n",
    "        df_px = df_stats_mean[(df_stats_mean.isoweek == week) & (df_stats_mean.crop_type == crop)] \\\n",
    "        .groupby(by=[\"FIPS\"])[numerical_cols].mean().T\n",
    "        \n",
    "        columns = df_px.columns\n",
    "        \n",
    "        fig = px.line(df_px,\n",
    "                      x=df_px.index,\n",
    "                      y=columns,\n",
    "                      line_shape=\"spline\",\n",
    "                      render_mode=\"svg\",\n",
    "                      title=f\"week: {week}; crop type: {crop}\")\n",
    "        fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### SVIs signature variations (Grouped by FIPS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "for crop in df_stats_mean.crop_type.unique():\n",
    "    for fips in df_stats_mean.FIPS.unique():\n",
    "        df_px = df_stats_mean[(df_stats_mean.FIPS == fips) & (df_stats_mean.crop_type == crop)] \\\n",
    "        .groupby(by=[\"isoweek\"])[numerical_cols].mean().T\n",
    "        \n",
    "        columns = df_px.columns\n",
    "        \n",
    "        fig = px.line(df_px,\n",
    "                      x=df_px.index,\n",
    "                      y=columns,\n",
    "                      line_shape=\"spline\",\n",
    "                      render_mode=\"svg\",\n",
    "                      title=f\" FIPS: {fips}; crop type: {crop}\")\n",
    "        fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### Reshape data frame and create a mapping file for the corn phenology cycle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Reshape the frame to convert [isoweek, crop_type] elements into individual columns\n",
    "df_stats_mean_groups = df_stats_mean.groupby(['isoweek','crop_type'])\n",
    "week_groups = []\n",
    "\n",
    "# Suffix each numerical column with its group's crop type and isoweek\n",
    "# (`<column>_<crop_type>_<isoweek>`); the other columns keep their names.\n",
    "for name, group in df_stats_mean_groups:\n",
    "    group = group.rename(columns={c: f'{c}_{str(name[1])}_{str(name[0])}' for c in group.columns if c in numerical_cols})\n",
    "    week_groups.append(group)\n",
    "    \n",
    "# NOTE(review): concatenating on axis=1 aligns the groups on their row index, not on\n",
    "# id_10 — confirm that the index of df_stats_mean makes this alignment meaningful.\n",
    "df_stats_mean_pivoted = pd.concat(week_groups,axis=1)\n",
    "# Columns that were not renamed repeat once per group; keep the first occurrence of each name\n",
    "df_stats_mean_pivoted = df_stats_mean_pivoted.loc[:, ~df_stats_mean_pivoted.columns.duplicated()]\n",
    "df_stats_mean_pivoted = df_stats_mean_pivoted.drop(columns={'isoweek'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "df_merged = pd.merge(df,df_stats_mean_pivoted, on='id_10')\n",
    "df_merged = df_merged.drop(columns = {'Y_corn_lt_avg','region','year','lat','long'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Define the crop phenology stages for **CORN** according to the [database characterization](https://www.sciencedirect.com/science/article/pii/S2352340921010283#tbl0001)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Within the following section, we learn the crop phenology graph (DAG), which is a collection of nodes and edges, where the `nodes` are various indicators of crop growth, soil characteristics, and atmospheric conditions, and the `edges` between them represent temporal-causal relationships. `Parent nodes` are the field-related parameters (including the day of sowing and area planted), whereas the `child nodes` are the yield, nitrogen uptake, and nitrogen leaching targets."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We now construct a mapping file that assigns a **level** index to each variable that corresponds to a crop phenology stage."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Corn phenology cycle\n",
    "\n",
    "* Vegetative stages\n",
    "    * VE – emergence, coleoptile breaks through the soil surface\n",
    "\n",
    "    * V1 - one leaf collar is visible\n",
    "\n",
    "    * V2 - second leaf collar is visible\n",
    "\n",
    "    * V3 - third leaf collar is visible, plant begins to photosynthesize and rely on nodal root system\n",
    "\n",
    "    * V4 - fourth leaf collar is visible\n",
    "\n",
    "    * V5-V6 - fifth to sixth leaf collars are visible, growing point is above the soil surface, critical period of nitrogen uptake begins, and kernel row numbers are determined\n",
    "    * V7-V(n) - seventh to nth leaf collars are visible, period of very rapid growth\n",
    "\n",
    "    * VT – Tasselling – tassel has emerged, transitioning to the reproductive phase\n",
    "    \n",
    "* Reproductive stages\n",
    "    * R1 Silking – silks emerge from husks\n",
    "\n",
    "    * R2 Blister – kernels are white on outside and inner fluid is clear\n",
    "\n",
    "    * R3 Milk - kernels are yellow on the outside and inner fluid is milky-white\n",
    "\n",
    "    * R4 Dough - milky inner fluid thickens from starch accumulation\n",
    "\n",
    "    * R5 Dent – more than 50% of kernels are dented\n",
    "\n",
    "    * R6 Physiological maturity – black layer formed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Variables available at planting (level 0)\n",
    "CHILD_LIST = [\n",
    "    'P', 'sand_40cm', 'om_40cm', 'clay_40cm', 'dul_dep', 'day_sow', 'll15_dep',\n",
    "    'restriction', 'whc', 'id_10', 'station', 'id_field', 'FIPS',\n",
    "]\n",
    "# Variables available at harvest (level 5)\n",
    "PARENTS_LIST = [\n",
    "    'Y_corn', 'n_uptake', 'LAI_max', 'rain_annual', 'Y_soy', 'L1', 'L2', 'L',\n",
    "]\n",
    "\n",
    "\n",
    "def staging(row):\n",
    "    \"\"\"Return the phenology level assigned to the variable name `row`.\n",
    "\n",
    "    Exact names are looked up first (CHILD_LIST, PARENTS_LIST, 'N_fert'); any\n",
    "    other name is classified by the text after its last underscore.\n",
    "    Levels run from 0 to 5; -1 means the variable belongs to no stage.\n",
    "    \"\"\"\n",
    "    postfix = row.split('_')[-1]\n",
    "\n",
    "    # --- exact variable names --------------------------------------------\n",
    "    if row in CHILD_LIST:       # available at planting\n",
    "        return 0\n",
    "    if row in PARENTS_LIST:     # available at harvest\n",
    "        return 5\n",
    "    if row == 'N_fert':         # N fertiliser\n",
    "        return 1\n",
    "\n",
    "    # --- textual suffixes -------------------------------------------------\n",
    "    if postfix == 'v5':         # v5\n",
    "        return 2\n",
    "    if postfix == 'fw':         # R6 - R8 (NOTE(review): the corn stages listed in this notebook end at R6 — confirm the label)\n",
    "        return 3\n",
    "    if not postfix.isdigit():\n",
    "        return -1\n",
    "\n",
    "    # --- numeric suffixes -------------------------------------------------\n",
    "    number = int(postfix)\n",
    "\n",
    "    # Suffixes 1 to 6 identify a period of the season\n",
    "    period_levels = {\n",
    "        1: 0,    # 1 Jan. to planting\n",
    "        2: 1,    # planting to v5\n",
    "        3: 2,    # v5 - R1\n",
    "        4: 3,    # R1 - R3\n",
    "        5: 4,    # R3 - R6\n",
    "        6: -1,   # harvest - Dec 31\n",
    "    }\n",
    "    if number in period_levels:\n",
    "        return period_levels[number]\n",
    "\n",
    "    # Larger suffixes — presumably the isoweek appended to the satellite statistics\n",
    "    if 20 <= number < 26:       # v5 - R1\n",
    "        return 2\n",
    "    if 26 <= number < 33:       # R1 - R3\n",
    "        return 3\n",
    "    if number >= 33:            # R3 - R6\n",
    "        return 4\n",
    "\n",
    "    # 0 and 7 to 19 are not mapped to any stage\n",
    "    return -1\n",
    "\n",
    "\n",
    "df_mapping = pd.DataFrame(data=df_merged.columns, columns=['variable'])\n",
    "df_mapping['level'] = df_mapping.apply(\n",
    "    lambda row: staging(row['variable']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "REGION = CROP_REGION.replace(\"-\",\"_\")\n",
    "\n",
    "df_merged.to_csv(\n",
    "    f\"s3://{AWS_S3_BUCKET}/data/enhanced/\"\n",
    "    f\"enhanced_dataset_{YEAR}_{REGION}.csv\",\n",
    "    index=False,\n",
    "    )\n",
    "\n",
    "df_mapping.to_csv(\n",
    "    f\"s3://{AWS_S3_BUCKET}/data/enhanced/\"\n",
    "    f\"stage_mapping_{YEAR}_{REGION}.csv\",\n",
    "    index=False,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## Multicollinearity study\n",
    "\n",
    "Compute the correlation matrix and the [variance inflation factor](https://www.statsmodels.org/dev/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html), for each **corn** phenology stage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Leave the identifier columns out of the multicollinearity study\n",
    "df_eda = df_mapping[~df_mapping.variable.isin(\n",
    "    ['id_10', 'station', 'id_field', 'FIPS'])]\n",
    "\n",
    "for level in sorted(df_eda.level.unique()):\n",
    "\n",
    "    # Level -1 holds the variables that were not assigned to a phenology stage\n",
    "    if level == -1:\n",
    "        continue\n",
    "\n",
    "    print(f\" Correlation Matrix for Stage: {level}\")\n",
    "\n",
    "    level_mapping = df_eda[df_eda.level == level]\n",
    "    cols = list(level_mapping.variable.unique())\n",
    "\n",
    "    # Slice the stage's columns once and reuse the result for both analyses\n",
    "    stage_frame = df_merged[cols]\n",
    "\n",
    "    corr = stage_frame.corr()\n",
    "\n",
    "    heatmap_pandas(corr)\n",
    "\n",
    "    print(\"\\n\")\n",
    "    print(f\"Variance Inflation Factor (VIF) for Stage: {level}\")\n",
    "\n",
    "    # Build the design matrix a single time; it used to be re-extracted from\n",
    "    # df_merged for every feature inside the comprehension.\n",
    "    design = stage_frame.values\n",
    "\n",
    "    vif = pd.DataFrame()\n",
    "    vif[\"features\"] = cols\n",
    "    vif[\"vif\"] = [variance_inflation_factor(design, i)\n",
    "                  for i in range(design.shape[1])]\n",
    "    vif = vif.set_index('features')\n",
    "    # Features whose VIF is infinite or NaN are left out of the displayed table\n",
    "    vif = vif.replace([np.inf, -np.inf], np.nan).dropna(axis=0)\n",
    "    display(vif.style.apply(centered_gradient, cmap=get_cmap('Reds')))\n",
    "\n",
    "    print(\"\\n\")\n",
    "    print(\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "## Feature selection \n",
    "\n",
    "Compute global feature importance using [SHAP (SHapley Additive exPlanations)](https://github.com/slundberg/shap), for each **corn** phenology stage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# One list of selected feature names per phenology stage\n",
    "selected_features_corn = []\n",
    "\n",
    "for level in sorted(df_mapping.level.unique()):\n",
    "\n",
    "    # Variables without a phenology stage (level -1) take no part in the selection\n",
    "    if level == -1:\n",
    "        continue\n",
    "\n",
    "    print(f\" [CORN] Shap Analysis for Stage: {level}\")\n",
    "\n",
    "    # Predictors: every variable mapped to this stage except the target itself\n",
    "    # NOTE(review): identifier columns mapped to a stage (e.g. id_10, FIPS) end up\n",
    "    # among the predictors — confirm this is intended.\n",
    "    level_mapping = df_mapping[df_mapping.level == level]\n",
    "    cols = [name for name in level_mapping.variable.unique() if name != 'Y_corn']\n",
    "\n",
    "    X = df_merged[cols]\n",
    "    y = df_merged['Y_corn']\n",
    "\n",
    "    # Fit a gradient-boosted regressor with the library defaults\n",
    "    model = xgboost.XGBRegressor().fit(X, y)\n",
    "\n",
    "    # Global SHAP importance of each predictor\n",
    "    feature_importance = global_shap_importance(model, X)\n",
    "\n",
    "    # Shift the importances so that the smallest becomes 0, then scale them to sum to 1\n",
    "    raw_importance = feature_importance['importance'].values\n",
    "    shifted = raw_importance - raw_importance.min()\n",
    "    feature_importance['importance_scaled'] = shifted / shifted.sum()\n",
    "\n",
    "    # Keep the features strictly above the configured quantile, strongest first\n",
    "    threshold = feature_importance['importance_scaled'].quantile(\n",
    "        FEATURES_SELECTION_QUANTILE)\n",
    "    above_threshold = feature_importance['importance_scaled'] > threshold\n",
    "    feature_importance = feature_importance.loc[above_threshold].sort_values(\n",
    "        by=['importance_scaled'], ascending=False)\n",
    "\n",
    "    selected_features_corn.append(list(feature_importance.features.unique()))\n",
    "\n",
    "    feature_importance = feature_importance.set_index('features')\n",
    "\n",
    "    display(feature_importance.style.apply(\n",
    "        centered_gradient, cmap=get_cmap('Greens')))\n",
    "    \n",
    "    print(\"\\n\")\n",
    "    print(\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### Save the filtered dataset and the crop staging mapping files to S3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Columns holding the satellite-derived mean statistics; these are always kept\n",
    "satellite_images = [\n",
    "    feat for feat in df_merged.columns if feat.startswith('mean_')]\n",
    "\n",
    "# Flatten the per-stage selections with itertools (`sum(lists, [])` copies the\n",
    "# accumulated list again for every stage), then add the satellite columns, the\n",
    "# target and the identifier columns. The set removes duplicates before sorting.\n",
    "selected_features = sorted(\n",
    "    set(it.chain.from_iterable(selected_features_corn))\n",
    "    | set(satellite_images)\n",
    "    | {'Y_corn', 'FIPS', 'id_10', 'id_field'}\n",
    ")\n",
    "\n",
    "# save selected features\n",
    "df_merged_filtered = df_merged[selected_features]\n",
    "df_mapping_filtered = df_mapping[df_mapping.variable.isin(selected_features)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    },
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "REGION = CROP_REGION.replace(\"-\",\"_\")\n",
    "\n",
    "df_merged_filtered.to_csv(\n",
    "    f\"s3://{AWS_S3_BUCKET}/data/enhanced/\"\n",
    "    f\"enhanced_dataset_filtered_{YEAR}_{REGION}.csv\",\n",
    "    index=False,\n",
    "    )\n",
    "\n",
    "df_mapping_filtered.to_csv(\n",
    "    f\"s3://{AWS_S3_BUCKET}/data/enhanced/\"\n",
    "    f\"stage_mapping_filtered_{YEAR}_{REGION}.csv\",\n",
    "    index=False,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Next Steps"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can now open [02 Causal Model.ipynb](02%20Causal%20Model.ipynb) and follow the steps inside the notebook."
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.m5.large",
  "kernelspec": {
   "display_name": "Python 3 (PyTorch 1.10 Python 3.8 CPU Optimized)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/pytorch-1.10-cpu-py38"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}