{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "345566e7", "metadata": {}, "outputs": [], "source": [ "# Third-party dependencies: the Earth Engine client and pandas.\n", "import ee\n", "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": 4, "id": "00006cb5", "metadata": {}, "outputs": [], "source": [ "# Put ../src on the import path before data_utils is imported in the next cell.\n", "import sys\n", "\n", "sys.path.append(\"../src\")" ] },
{ "cell_type": "code", "execution_count": 5, "id": "f32d3cf1", "metadata": {}, "outputs": [], "source": [ "from data_utils import (\n", "    get_data_by_zone_year,\n", "    save_regional_data,\n", "    split_dataset,\n", ")" ] },
{ "cell_type": "markdown", "id": "1c1c107f", "metadata": {}, "source": [ "### initialize earth engine" ] },
{ "cell_type": "code", "execution_count": 6, "id": "3fb3aab2", "metadata": {}, "outputs": [], "source": [ "ee.Initialize()" ] },
{ "cell_type": "markdown", "id": "864de526", "metadata": {}, "source": [ "### select bucket to store dataset" ] },
{ "cell_type": "code", "execution_count": 7, "id": "99dfadda", "metadata": {}, "outputs": [], "source": [ "s3_bucket = \"sagemaker-gis\"" ] },
{ "cell_type": "markdown", "id": "a0be2268", "metadata": {}, "source": [ "### select satellite data, year and bands" ] },
{ "cell_type": "code", "execution_count": 8, "id": "0e6b4e39", "metadata": {}, "outputs": [], "source": [ "# Earth Engine collection ID of the source imagery.\n", "# NOTE(review): this is a Landsat 8 Collection 1 ID; Earth Engine has deprecated\n", "# Collection 1 in favour of Collection 2 -- confirm the ID still resolves.\n", "base_sat_data = \"LANDSAT/LC08/C01/T1_SR\"\n", "year = 2014\n", "# Band selector handed to get_data_by_zone_year; presumably matches B1..B7 -- confirm.\n", "bands = \"B[1-7]\"\n", "\n", "# Metadata passed to save_regional_data and reused later to build the S3 folder name.\n", "meta_dict = {\n", "    \"src_dataset\": \"_\".join(base_sat_data.split(\"/\")),\n", "    \"year\": year,\n", "}\n", "# First and last calendar day of the selected year.\n", "# NOTE(review): if get_data_by_zone_year forwards this to Earth Engine's filterDate,\n", "# the end date is exclusive and Dec 31 is left out -- confirm.\n", "date_range = [f\"{year}-{month_day}\" for month_day in (\"01-01\", \"12-31\")]" ] },
{ "cell_type": "markdown", "id": "0eff0593", "metadata": {}, "source": [ "### read representative coordinates for each region" ] },
{ "cell_type": "code", "execution_count": 9, "id": "bdd0ea54", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lonlat
region
Mozambique136.2093-18.7423
Mozambique234.7455-20.6128
Nigeria15.61165.3431
Nigeria25.99834.5678
Guinea-Bissau-15.990312.1660
\n", "
" ], "text/plain": [ " lon lat\n", "region \n", "Mozambique1 36.2093 -18.7423\n", "Mozambique2 34.7455 -20.6128\n", "Nigeria1 5.6116 5.3431\n", "Nigeria2 5.9983 4.5678\n", "Guinea-Bissau -15.9903 12.1660" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_zones = pd.read_csv(\"../data/zones.csv\").set_index(\"region\")\n", "# Regions are looked up by label further down, so every label must be unique.\n", "assert df_zones.index.is_unique, \"region names in zones.csv must be unique\"\n", "df_zones.head()" ] },
{ "cell_type": "code", "execution_count": 10, "id": "1ae0ece1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Mozambique1', 'Mozambique2', 'Nigeria1', 'Nigeria2', 'Guinea-Bissau',\n", " 'Miami1', 'Miami2', 'Southern Mexico', 'West Mexico', 'El Salvador',\n", " 'Cuba1', 'Cuba2', 'Colombia', 'Venezuela', 'Amapa Brazil',\n", " 'Belem Brazil', 'Sao Luis Brazil', 'Sao Luis Brazil 2', 'Myanmar1',\n", " 'Madagascar', 'Myanmar2', 'Myanmar3', 'Vietnam1', 'Vietnam2', 'India'],\n", " dtype='object', name='region')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_zones.index" ] },
{ "cell_type": "code", "execution_count": 11, "id": "afa49427", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lon 88.8\n", "lat 21.8\n", "Name: India, dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_zones.loc['India']" ] },
{ "cell_type": "code", "execution_count": 12, "id": "4adb5e00", "metadata": {}, "outputs": [], "source": [ "def build_regional_dataset(area):\n", "    \"\"\"Fetch the data for one region of df_zones and save it to s3_bucket.\n", "\n", "    Parameters\n", "    ----------\n", "    area : str\n", "        Region label; must be present in df_zones.index.\n", "    \"\"\"\n", "    point_of_int = df_zones.loc[area, [\"lon\", \"lat\"]].tolist()\n", "    data_dict = get_data_by_zone_year(\n", "        point_of_int, date_range, base_sat_data, bands\n", "    )\n", "    # Pass a per-region copy so the shared meta_dict is never mutated in place.\n", "    region_meta = {**meta_dict, \"poi\": area.replace(\" \", \"_\")}\n", "    save_regional_data(data_dict, region_meta, s3_bucket)" ] },
{ "cell_type": "markdown", "id": "93395d7e", "metadata": {}, "source": [ "### create dataset for each region" ] },
{ "cell_type": "code", "execution_count": 13, "id": "a9e452bc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "processing data for Mozambique1...\n", "rows: 1227, rows_mangrove = 252, rows_other = 975\n", "processing data for Mozambique2...\n", "rows: 1136, rows_mangrove = 157, rows_other = 979\n", "processing data for Nigeria1...\n", "rows: 1375, rows_mangrove = 423, rows_other = 952\n", "processing data for Nigeria2...\n", "rows: 1822, rows_mangrove = 930, rows_other = 892\n", "processing data for Guinea-Bissau...\n", "rows: 1793, rows_mangrove = 896, rows_other = 897\n", "processing data for Miami1...\n", "rows: 1191, rows_mangrove = 217, rows_other = 974\n", "processing data for Miami2...\n", "rows: 1191, rows_mangrove = 217, rows_other = 974\n", "processing data for Southern Mexico...\n", "rows: 1216, rows_mangrove = 238, rows_other = 978\n", "processing data for West Mexico...\n", "rows: 1209, rows_mangrove = 227, rows_other = 982\n", "processing data for El Salvador...\n", "rows: 1152, rows_mangrove = 169, rows_other = 983\n", "processing data for Cuba1...\n", "rows: 1125, rows_mangrove = 144, rows_other = 981\n", "processing data for Cuba2...\n", "rows: 1089, rows_mangrove = 102, rows_other = 987\n", "processing data for Colombia...\n", "rows: 1118, rows_mangrove = 131, rows_other = 987\n", "processing data for Venezuela...\n", "rows: 1587, rows_mangrove = 664, rows_other = 923\n", "processing data for Amapa Brazil...\n", "rows: 1275, rows_mangrove = 316, rows_other = 959\n", "processing data for Belem Brazil...\n", "rows: 1263, rows_mangrove = 299, rows_other = 964\n", "processing data for Sao Luis Brazil...\n", "rows: 1245, rows_mangrove = 272, rows_other = 973\n", "processing data for Sao Luis Brazil 2...\n", "rows: 1409, rows_mangrove = 469, rows_other = 940\n", "processing data for Myanmar1...\n", "rows: 1114, rows_mangrove = 131, rows_other = 983\n", "processing data for Madagascar...\n", "rows: 1193, rows_mangrove = 212, rows_other = 981\n", "processing data for Myanmar2...\n", "rows: 1302, rows_mangrove = 328, rows_other = 974\n", "processing data for Myanmar3...\n", "rows: 1201, rows_mangrove = 227, rows_other = 974\n", "processing data for Vietnam1...\n", "rows: 1127, rows_mangrove = 142, rows_other = 985\n", "processing data for Vietnam2...\n", "rows: 1085, rows_mangrove = 98, rows_other = 987\n", "processing data for India...\n", "rows: 2086, rows_mangrove = 1196, rows_other = 890\n" ] } ], "source": [ "# Every region is fetched and saved exactly once through the helper defined above.\n", "# NOTE(review): the saved log reports identical counts for Miami1 and Miami2 --\n", "# confirm zones.csv does not list the same point twice.\n", "for area in df_zones.index:\n", "    print(f\"processing data for {area}...\")\n", "    build_regional_dataset(area)" ] },
{ "cell_type": "markdown", "id": "43b89560", "metadata": {}, "source": [ "### split the dataset between training and test sets" ] },
{ "cell_type": "code", "execution_count": 14, "id": "3ea176aa", "metadata": {}, "outputs": [], "source": [ "areas_for_test = [\"Vietnam2\", \"Myanmar3\", \"Cuba2\", \"India\"]\n", "folder = f\"{meta_dict['src_dataset']}/Year{meta_dict['year']}\"\n", "split_dataset(areas_for_test, s3_bucket, folder)" ] },
{ "cell_type": "markdown", "id": "56a035b7", "metadata": {}, "source": [ "### Check the training and test datasets" ] },
{ "cell_type": "code", "execution_count": 15, "id": "6e8ff1a4", "metadata": {}, "outputs": [], "source": [ "# Read both splits back from the same S3 prefix that split_dataset was given.\n", "df_tr = pd.read_csv(f\"s3://{s3_bucket}/{folder}/train.csv\")\n", "df_te = pd.read_csv(f\"s3://{s3_bucket}/{folder}/test.csv\")" ] },
{ "cell_type": "code", "execution_count": 16, "id": "f1ed3be2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
B1B2B3B4B5B6B7label
07410135419128499273361
1448619120510543828218014470
261114363170337716396440
3168229549401281913796250
4191288667562321218789200
\n", "
" ], "text/plain": [ " B1 B2 B3 B4 B5 B6 B7 label\n", "0 74 101 354 191 2849 927 336 1\n", "1 448 619 1205 1054 3828 2180 1447 0\n", "2 61 114 363 170 3377 1639 644 0\n", "3 168 229 549 401 2819 1379 625 0\n", "4 191 288 667 562 3212 1878 920 0" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first five rows of the training split.\n", "df_tr.head(n=5)" ] },
{ "cell_type": "code", "execution_count": 17, "id": "90561eab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
B1B2B3B4B5B6B7label
05784582181328858370
1387311150917362410
253464288112062245355225950
34505124101244527160
44705705431695427190
\n", "
" ], "text/plain": [ " B1 B2 B3 B4 B5 B6 B7 label\n", "0 578 458 218 132 88 58 37 0\n", "1 387 311 150 91 73 62 41 0\n", "2 534 642 881 1206 2245 3552 2595 0\n", "3 450 512 410 124 45 27 16 0\n", "4 470 570 543 169 54 27 19 0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first five rows of the test split.\n", "df_te.head(n=5)" ] },
{ "cell_type": "code", "execution_count": 18, "id": "054a9f41", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((27070, 8), (5461, 8))" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# (rows, columns) of the training split, then of the test split.\n", "(df_tr.shape, df_te.shape)" ] },
{ "cell_type": "markdown", "id": "ce5d7277", "metadata": {}, "source": [ "### Check the class composition" ] },
{ "cell_type": "code", "execution_count": 19, "id": "430dd21e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.747543\n", "1 0.252457\n", "Name: label, dtype: float64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Share of each label value in the training split.\n", "df_tr[\"label\"].value_counts(normalize=True)" ] },
{ "cell_type": "code", "execution_count": 20, "id": "9c924650", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.702802\n", "1 0.297198\n", "Name: label, dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Share of each label value in the test split.\n", "df_te[\"label\"].value_counts(normalize=True)" ] },
{ "cell_type": "code", "execution_count": null, "id": "bf133e15", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "gis", "language": "python", "name": "gis" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 }