In [1]:
import ee
import pandas as pd

In [4]:
import sys
# Add ../src to the module search path; the data_utils imports that follow
# presumably resolve from that directory — confirm the repo layout.
sys.path.append("../src")

In [5]:
from data_utils import get_data_by_zone_year
from data_utils import save_regional_data
from data_utils import split_dataset

### Initialize Earth Engine

In [6]:
# Initialize the Earth Engine client library for this session.
# NOTE(review): presumably relies on credentials that were set up beforehand
# (e.g. via `earthengine authenticate`) — confirm for fresh environments.
ee.Initialize()

### Select the S3 bucket to store the dataset

In [7]:
# S3 bucket handed to save_regional_data / split_dataset and read back from
# when the train/test CSVs are loaded.
s3_bucket = "sagemaker-gis"

### Select the satellite dataset, year, and bands

In [8]:
# Earth Engine collection id, imagery year, and band selection pattern.
# NOTE(review): Landsat Collection 1 (C01) assets have been deprecated by
# Earth Engine in favour of Collection 2 — confirm this id is still served
# before re-running the notebook.
base_sat_data = "LANDSAT/LC08/C01/T1_SR"
year = 2014
bands = "B[1-7]"

# Metadata for the generated dataset; "/" in the collection id is replaced by
# "_" so the value can be used as a single path component.
meta_dict = dict(src_dataset=base_sat_data.replace("/", "_"), year=year)
# [start, end] date strings spanning the whole calendar year.
date_range = [f"{year}-{month_day}" for month_day in ("01-01", "12-31")]

### Read the representative coordinates for each region

In [9]:
# Load the zones table, then key it by region name so a region's lon/lat can
# be looked up with .loc.
zones_raw = pd.read_csv("../data/zones.csv")
df_zones = zones_raw.set_index("region")
df_zones.head()

Unnamed: 0_level_0,lon,lat
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Mozambique1,36.2093,-18.7423
Mozambique2,34.7455,-20.6128
Nigeria1,5.6116,5.3431
Nigeria2,5.9983,4.5678
Guinea-Bissau,-15.9903,12.166


In [10]:
# List every region name available in the zones table.
df_zones.index

Index(['Mozambique1', 'Mozambique2', 'Nigeria1', 'Nigeria2', 'Guinea-Bissau',
       'Miami1', 'Miami2', 'Southern Mexico', 'West Mexico', 'El Salvador',
       'Cuba1', 'Cuba2', 'Colombia', 'Venezuela', 'Amapa Brazil',
       'Belem Brazil', 'Sao Luis Brazil', 'Sao Luis Brazil 2', 'Myanmar1',
       'Madagascar', 'Myanmar2', 'Myanmar3', 'Vietnam1', 'Vietnam2', 'India'],
      dtype='object', name='region')

In [11]:
# Inspect the stored lon/lat for a single region.
df_zones.loc['India']

lon    88.8
lat    21.8
Name: India, dtype: float64

In [12]:
# Build and upload the dataset for one region as a quick end-to-end check.
# NOTE(review): 'India' is also part of df_zones.index, so the per-region loop
# further down processes and uploads it a second time.
area = 'India'
# [lon, lat] of the representative point for this region.
point_of_int = df_zones.loc[area, ["lon", "lat"]].tolist()
data_dict = get_data_by_zone_year(
    point_of_int, date_range, base_sat_data, bands
)
# Hand over a per-region copy of the metadata rather than writing "poi" into
# the shared meta_dict: the config dict stays unchanged, so re-running cells
# in any order never picks up a stale region tag.
area_meta = {**meta_dict, "poi": area.replace(" ", "_")}
save_regional_data(data_dict, area_meta, s3_bucket)

rows: 2086, rows_mangrove = 1196,  rows_other = 890


### Create a dataset for each region

In [13]:
# Build one dataset per region in the zones table and upload each to S3.
for area in df_zones.index:
    print(f"processing data for {area}...")
    # [lon, lat] of the representative point for this region.
    point_of_int = df_zones.loc[area, ["lon", "lat"]].tolist()
    data_dict = get_data_by_zone_year(
        point_of_int, date_range, base_sat_data, bands
    )
    # Per-region copy of the metadata: the shared meta_dict is left untouched
    # instead of carrying whichever region happened to be processed last.
    # Spaces in the region name are replaced by underscores for the "poi" tag.
    area_meta = {**meta_dict, "poi": area.replace(" ", "_")}
    save_regional_data(data_dict, area_meta, s3_bucket)

processing data for Mozambique1...
rows: 1227, rows_mangrove = 252,  rows_other = 975
processing data for Mozambique2...
rows: 1136, rows_mangrove = 157,  rows_other = 979
processing data for Nigeria1...
rows: 1375, rows_mangrove = 423,  rows_other = 952
processing data for Nigeria2...
rows: 1822, rows_mangrove = 930,  rows_other = 892
processing data for Guinea-Bissau...
rows: 1793, rows_mangrove = 896,  rows_other = 897
processing data for Miami1...
rows: 1191, rows_mangrove = 217,  rows_other = 974
processing data for Miami2...
rows: 1191, rows_mangrove = 217,  rows_other = 974
processing data for Southern Mexico...
rows: 1216, rows_mangrove = 238,  rows_other = 978
processing data for West Mexico...
rows: 1209, rows_mangrove = 227,  rows_other = 982
processing data for El Salvador...
rows: 1152, rows_mangrove = 169,  rows_other = 983
processing data for Cuba1...
rows: 1125, rows_mangrove = 144,  rows_other = 981
processing data for Cuba2...
rows: 1089, rows_mangrove = 102,  rows_ot

### Split the dataset into training and test sets

In [14]:
# Regions passed to split_dataset as the held-out test areas.
# NOTE(review): presumably every other region ends up in the training set —
# confirm against split_dataset.
areas_for_test = ["Vietnam2", "Myanmar3", "Cuba2", "India"]
# Key prefix of this dataset inside the bucket: <src_dataset>/Year<year>.
folder = "{src_dataset}/Year{year}".format(**meta_dict)
split_dataset(areas_for_test, s3_bucket, folder)

### Check the training and test datasets

In [15]:
# Read the train/test CSVs back from S3 for a sanity check.
# NOTE(review): the loaded frames show an "Unnamed: 0" column, which looks like
# a row index written into the CSV — confirm whether it should be dropped
# (e.g. index_col=0) before the data is used for training.
dataset_prefix = f"s3://{s3_bucket}/{meta_dict['src_dataset']}/Year{meta_dict['year']}"
df_tr = pd.read_csv(f"{dataset_prefix}/train.csv")
df_te = pd.read_csv(f"{dataset_prefix}/test.csv")

In [16]:
# Preview the first rows of the training set.
df_tr.head()

Unnamed: 0,B1,B2,B3,B4,B5,B6,B7,label
0,74,101,354,191,2849,927,336,1
1,448,619,1205,1054,3828,2180,1447,0
2,61,114,363,170,3377,1639,644,0
3,168,229,549,401,2819,1379,625,0
4,191,288,667,562,3212,1878,920,0


In [17]:
# Preview the first rows of the test set.
df_te.head()

Unnamed: 0,B1,B2,B3,B4,B5,B6,B7,label
0,578,458,218,132,88,58,37,0
1,387,311,150,91,73,62,41,0
2,534,642,881,1206,2245,3552,2595,0
3,450,512,410,124,45,27,16,0
4,470,570,543,169,54,27,19,0


In [18]:
# (rows, columns) of the training and test frames.
df_tr.shape, df_te.shape

((27070, 8), (5461, 8))

### Check the class composition

In [19]:
# Fraction of training rows per label value.
df_tr["label"].value_counts(normalize=True)

0    0.747543
1    0.252457
Name: label, dtype: float64

In [20]:
# Fraction of test rows per label value.
df_te["label"].value_counts(normalize=True)

0    0.702802
1    0.297198
Name: label, dtype: float64