{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "345566e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ee\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "00006cb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append(\"../src\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f32d3cf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from data_utils import get_data_by_zone_year\n",
    "from data_utils import save_regional_data\n",
    "from data_utils import split_dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c1c107f",
   "metadata": {},
   "source": [
    "### Initialize Earth Engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3fb3aab2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the Earth Engine client library before any other `ee` call is made.\n",
    "# NOTE(review): presumably relies on credentials already configured on this machine -- confirm.\n",
    "ee.Initialize()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "864de526",
   "metadata": {},
   "source": [
    "### Select the bucket used to store the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "99dfadda",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bucket name handed to save_regional_data / split_dataset and used in the\n",
    "# s3:// URLs from which train.csv and test.csv are read back further down.\n",
    "s3_bucket = \"sagemaker-gis\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0be2268",
   "metadata": {},
   "source": [
    "### Select satellite data, year and bands"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0e6b4e39",
   "metadata": {},
   "outputs": [],
   "source": [
    "base_sat_data = \"LANDSAT/LC08/C01/T1_SR\"\n",
    "year = 2014\n",
    "bands = \"B[1-7]\"\n",
    "\n",
    "meta_dict = {\"src_dataset\": base_sat_data.replace(\"/\", \"_\"), \"year\": year}\n",
    "date_range = [f\"{year}-01-01\", f\"{year}-12-31\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0eff0593",
   "metadata": {},
   "source": [
    "### Read representative coordinates for each region"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bdd0ea54",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lon</th>\n",
       "      <th>lat</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>region</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Mozambique1</th>\n",
       "      <td>36.2093</td>\n",
       "      <td>-18.7423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mozambique2</th>\n",
       "      <td>34.7455</td>\n",
       "      <td>-20.6128</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Nigeria1</th>\n",
       "      <td>5.6116</td>\n",
       "      <td>5.3431</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Nigeria2</th>\n",
       "      <td>5.9983</td>\n",
       "      <td>4.5678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Guinea-Bissau</th>\n",
       "      <td>-15.9903</td>\n",
       "      <td>12.1660</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   lon      lat\n",
       "region                         \n",
       "Mozambique1    36.2093 -18.7423\n",
       "Mozambique2    34.7455 -20.6128\n",
       "Nigeria1        5.6116   5.3431\n",
       "Nigeria2        5.9983   4.5678\n",
       "Guinea-Bissau -15.9903  12.1660"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the per-region coordinates, indexed by region name, and preview them.\n",
    "df_zones = pd.read_csv(\"../data/zones.csv\", index_col=\"region\")\n",
    "df_zones.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1ae0ece1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Mozambique1', 'Mozambique2', 'Nigeria1', 'Nigeria2', 'Guinea-Bissau',\n",
       "       'Miami1', 'Miami2', 'Southern Mexico', 'West Mexico', 'El Salvador',\n",
       "       'Cuba1', 'Cuba2', 'Colombia', 'Venezuela', 'Amapa Brazil',\n",
       "       'Belem Brazil', 'Sao Luis Brazil', 'Sao Luis Brazil 2', 'Myanmar1',\n",
       "       'Madagascar', 'Myanmar2', 'Myanmar3', 'Vietnam1', 'Vietnam2', 'India'],\n",
       "      dtype='object', name='region')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List every region name available in the zones table.\n",
    "df_zones.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "afa49427",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "lon    88.8\n",
       "lat    21.8\n",
       "Name: India, dtype: float64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look up the coordinates stored for a single region.\n",
    "df_zones.loc['India']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4adb5e00",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rows: 2086, rows_mangrove = 1196,  rows_other = 890\n"
     ]
    }
   ],
   "source": [
    "# Single-region trial run: fetch the samples for one region and store them.\n",
    "# NOTE: the loop in the next section processes every region in df_zones,\n",
    "# including this one, so this cell repeats work when the whole notebook is run.\n",
    "area = 'India'\n",
    "# [lon, lat] pair for the region, taken from the zones table.\n",
    "point_of_int = df_zones.loc[area, [\"lon\", \"lat\"]].tolist()\n",
    "data_dict = get_data_by_zone_year(\n",
    "    point_of_int, date_range, base_sat_data, bands\n",
    ")\n",
    "# Hand over a per-region copy of the metadata instead of writing \"poi\" into the\n",
    "# shared meta_dict, so no stale region name lingers in notebook state.\n",
    "# NOTE(review): assumes save_regional_data only reads the metadata dict -- confirm.\n",
    "area_meta = {**meta_dict, \"poi\": area.replace(\" \", \"_\")}\n",
    "save_regional_data(data_dict, area_meta, s3_bucket)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "93395d7e",
   "metadata": {},
   "source": [
    "### Create dataset for each region"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a9e452bc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing data for Mozambique1...\n",
      "rows: 1227, rows_mangrove = 252,  rows_other = 975\n",
      "processing data for Mozambique2...\n",
      "rows: 1136, rows_mangrove = 157,  rows_other = 979\n",
      "processing data for Nigeria1...\n",
      "rows: 1375, rows_mangrove = 423,  rows_other = 952\n",
      "processing data for Nigeria2...\n",
      "rows: 1822, rows_mangrove = 930,  rows_other = 892\n",
      "processing data for Guinea-Bissau...\n",
      "rows: 1793, rows_mangrove = 896,  rows_other = 897\n",
      "processing data for Miami1...\n",
      "rows: 1191, rows_mangrove = 217,  rows_other = 974\n",
      "processing data for Miami2...\n",
      "rows: 1191, rows_mangrove = 217,  rows_other = 974\n",
      "processing data for Southern Mexico...\n",
      "rows: 1216, rows_mangrove = 238,  rows_other = 978\n",
      "processing data for West Mexico...\n",
      "rows: 1209, rows_mangrove = 227,  rows_other = 982\n",
      "processing data for El Salvador...\n",
      "rows: 1152, rows_mangrove = 169,  rows_other = 983\n",
      "processing data for Cuba1...\n",
      "rows: 1125, rows_mangrove = 144,  rows_other = 981\n",
      "processing data for Cuba2...\n",
      "rows: 1089, rows_mangrove = 102,  rows_other = 987\n",
      "processing data for Colombia...\n",
      "rows: 1118, rows_mangrove = 131,  rows_other = 987\n",
      "processing data for Venezuela...\n",
      "rows: 1587, rows_mangrove = 664,  rows_other = 923\n",
      "processing data for Amapa Brazil...\n",
      "rows: 1275, rows_mangrove = 316,  rows_other = 959\n",
      "processing data for Belem Brazil...\n",
      "rows: 1263, rows_mangrove = 299,  rows_other = 964\n",
      "processing data for Sao Luis Brazil...\n",
      "rows: 1245, rows_mangrove = 272,  rows_other = 973\n",
      "processing data for Sao Luis Brazil 2...\n",
      "rows: 1409, rows_mangrove = 469,  rows_other = 940\n",
      "processing data for Myanmar1...\n",
      "rows: 1114, rows_mangrove = 131,  rows_other = 983\n",
      "processing data for Madagascar...\n",
      "rows: 1193, rows_mangrove = 212,  rows_other = 981\n",
      "processing data for Myanmar2...\n",
      "rows: 1302, rows_mangrove = 328,  rows_other = 974\n",
      "processing data for Myanmar3...\n",
      "rows: 1201, rows_mangrove = 227,  rows_other = 974\n",
      "processing data for Vietnam1...\n",
      "rows: 1127, rows_mangrove = 142,  rows_other = 985\n",
      "processing data for Vietnam2...\n",
      "rows: 1085, rows_mangrove = 98,  rows_other = 987\n",
      "processing data for India...\n",
      "rows: 2086, rows_mangrove = 1196,  rows_other = 890\n"
     ]
    }
   ],
   "source": [
    "# Fetch and store the samples for every region listed in the zones table.\n",
    "for area in df_zones.index:\n",
    "    print(f\"processing data for {area}...\")\n",
    "    # [lon, lat] pair for the region.\n",
    "    point_of_int = df_zones.loc[area, [\"lon\", \"lat\"]].tolist()\n",
    "    data_dict = get_data_by_zone_year(\n",
    "        point_of_int, date_range, base_sat_data, bands\n",
    "    )\n",
    "    # Build a per-region copy of the metadata rather than mutating the shared\n",
    "    # meta_dict, so the config cell's dict is unchanged after the loop.\n",
    "    # NOTE(review): assumes save_regional_data only reads the metadata dict -- confirm.\n",
    "    area_meta = {**meta_dict, \"poi\": area.replace(\" \", \"_\")}\n",
    "    save_regional_data(data_dict, area_meta, s3_bucket)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43b89560",
   "metadata": {},
   "source": [
    "### Split the dataset between training and test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3ea176aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regions passed to split_dataset as the test areas.\n",
    "areas_for_test = [\n",
    "    \"Vietnam2\",\n",
    "    \"Myanmar3\",\n",
    "    \"Cuba2\",\n",
    "    \"India\",\n",
    "]\n",
    "# Folder argument: dataset id followed by \"Year\" + year; the same path is used\n",
    "# below when train.csv and test.csv are read back from the bucket.\n",
    "folder = \"{src_dataset}/Year{year}\".format(**meta_dict)\n",
    "split_dataset(areas_for_test, s3_bucket, folder)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56a035b7",
   "metadata": {},
   "source": [
    "### Check the training and test datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "6e8ff1a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read train.csv and test.csv back from the run's folder in the bucket\n",
    "# (presumably the files written by split_dataset -- confirm).\n",
    "dataset_url = f\"s3://{s3_bucket}/{meta_dict['src_dataset']}/Year{meta_dict['year']}\"\n",
    "df_tr, df_te = (\n",
    "    pd.read_csv(f\"{dataset_url}/{split_name}.csv\")\n",
    "    for split_name in (\"train\", \"test\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f1ed3be2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>B1</th>\n",
       "      <th>B2</th>\n",
       "      <th>B3</th>\n",
       "      <th>B4</th>\n",
       "      <th>B5</th>\n",
       "      <th>B6</th>\n",
       "      <th>B7</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>74</td>\n",
       "      <td>101</td>\n",
       "      <td>354</td>\n",
       "      <td>191</td>\n",
       "      <td>2849</td>\n",
       "      <td>927</td>\n",
       "      <td>336</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>448</td>\n",
       "      <td>619</td>\n",
       "      <td>1205</td>\n",
       "      <td>1054</td>\n",
       "      <td>3828</td>\n",
       "      <td>2180</td>\n",
       "      <td>1447</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>61</td>\n",
       "      <td>114</td>\n",
       "      <td>363</td>\n",
       "      <td>170</td>\n",
       "      <td>3377</td>\n",
       "      <td>1639</td>\n",
       "      <td>644</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>168</td>\n",
       "      <td>229</td>\n",
       "      <td>549</td>\n",
       "      <td>401</td>\n",
       "      <td>2819</td>\n",
       "      <td>1379</td>\n",
       "      <td>625</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>191</td>\n",
       "      <td>288</td>\n",
       "      <td>667</td>\n",
       "      <td>562</td>\n",
       "      <td>3212</td>\n",
       "      <td>1878</td>\n",
       "      <td>920</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    B1   B2    B3    B4    B5    B6    B7  label\n",
       "0   74  101   354   191  2849   927   336      1\n",
       "1  448  619  1205  1054  3828  2180  1447      0\n",
       "2   61  114   363   170  3377  1639   644      0\n",
       "3  168  229   549   401  2819  1379   625      0\n",
       "4  191  288   667   562  3212  1878   920      0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the training frame.\n",
    "df_tr.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "90561eab",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>B1</th>\n",
       "      <th>B2</th>\n",
       "      <th>B3</th>\n",
       "      <th>B4</th>\n",
       "      <th>B5</th>\n",
       "      <th>B6</th>\n",
       "      <th>B7</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>578</td>\n",
       "      <td>458</td>\n",
       "      <td>218</td>\n",
       "      <td>132</td>\n",
       "      <td>88</td>\n",
       "      <td>58</td>\n",
       "      <td>37</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>387</td>\n",
       "      <td>311</td>\n",
       "      <td>150</td>\n",
       "      <td>91</td>\n",
       "      <td>73</td>\n",
       "      <td>62</td>\n",
       "      <td>41</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>534</td>\n",
       "      <td>642</td>\n",
       "      <td>881</td>\n",
       "      <td>1206</td>\n",
       "      <td>2245</td>\n",
       "      <td>3552</td>\n",
       "      <td>2595</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>450</td>\n",
       "      <td>512</td>\n",
       "      <td>410</td>\n",
       "      <td>124</td>\n",
       "      <td>45</td>\n",
       "      <td>27</td>\n",
       "      <td>16</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>470</td>\n",
       "      <td>570</td>\n",
       "      <td>543</td>\n",
       "      <td>169</td>\n",
       "      <td>54</td>\n",
       "      <td>27</td>\n",
       "      <td>19</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    B1   B2   B3    B4    B5    B6    B7  label\n",
       "0  578  458  218   132    88    58    37      0\n",
       "1  387  311  150    91    73    62    41      0\n",
       "2  534  642  881  1206  2245  3552  2595      0\n",
       "3  450  512  410   124    45    27    16      0\n",
       "4  470  570  543   169    54    27    19      0"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the test frame.\n",
    "df_te.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "054a9f41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((27070, 8), (5461, 8))"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shapes (rows, columns) of the training and test frames.\n",
    "df_tr.shape, df_te.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ce5d7277",
   "metadata": {},
   "source": [
    "### Check the class composition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "430dd21e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.747543\n",
       "1    0.252457\n",
       "Name: label, dtype: float64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of rows per label value in the training frame.\n",
    "df_tr[\"label\"].value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9c924650",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.702802\n",
       "1    0.297198\n",
       "Name: label, dtype: float64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of rows per label value in the test frame.\n",
    "df_te[\"label\"].value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf133e15",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gis",
   "language": "python",
   "name": "gis"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}