# Prepare dataset 

This notebook walks through taking one or more SageMaker Ground Truth output manifest files and performing a stratified split of the dataset into train/val/test sets for model training. 

### INPUTS 
- SageMaker GroundTruth Output manifest file(s) 
- Number of classes

### OUTPUTS 
- train/val/test split manifest files uploaded to S3 

In [None]:
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
import time
from time import gmtime, strftime
import json
import random
!pip install jsonlines
import jsonlines
import os
from itertools import islice
import numpy as np
!pip install scikit-multilearn arff
from skmultilearn.model_selection import iterative_train_test_split

!pip install tqdm
from tqdm import tqdm

from matplotlib import pyplot as plt 
%matplotlib inline

from ground_truth_od import BoundingBox, WorkerBoundingBox, \
 GroundTruthBox, BoxedImage


# AWS handles used by the rest of the notebook.
# Both the low-level S3 client and the higher-level S3 resource are created.
s3_client = boto3.client('s3')
s3 = boto3.resource('s3')

# SageMaker session and the execution role of this notebook.
sess = sagemaker.Session()
role = get_execution_role()

## Train/Validation/Test Split 

The dataset is split 60/20/20 into train/validation/test sets, stratified by the number of instances of each class. 

We will be picking the subset detailed above for training.

In [None]:
# Fetch the SageMaker GroundTruth labeling job output manifests from S3.
# Each manifest is saved locally as "<second path segment of its key>-output.manifest".

bucket_name = 'obj-nonprod-training'
gt_keys = [
    'folder/labeling_job1/manifests/output/output.manifest',
    'folder/labeling_job2/manifests/output/output.manifest',
]

for manifest_key in gt_keys:
    job_folder = manifest_key.split('/')[1]
    local_manifest = f"{job_folder}-output.manifest"
    s3_client.download_file(bucket_name, manifest_key, local_manifest)

In [None]:
# UPDATE THESE VARIABLES!
main_job_name = ''  # The labeling job name you'd like all instances in this dataset to follow
n_class = 5  # Number of classes in your dataset

# Fail fast: an empty job name would silently produce manifests whose label
# attribute is '' / '-metadata' and output files named '-train.manifest', etc.
if not main_job_name:
    raise ValueError(
        "main_job_name is empty - set it to the labeling job name that every "
        "record in the merged dataset should use"
    )

main_job_name_meta = main_job_name + '-metadata'

x_list = []  # one [record] per kept image (wrapped in a list so it becomes an (N, 1) array)
y_list = []  # per-image count of annotations for each class id
label_agg = [0] * n_class  # total annotation count per class id over all manifests

# Iterate over the files in the current directory and process those ending in
# `output.manifest` (the local copies of the SageMaker GroundTruth output
# manifests). Sorting makes the sample order - and therefore any later seeded
# split - independent of the arbitrary order returned by os.listdir.
for f in sorted(os.listdir('.')):
    if not f.endswith('output.manifest'):
        continue
    # Drop the 15-character 'output.manifest' suffix plus the separator before it.
    label_job_name = f[:-16]
    label_job_meta = label_job_name + '-metadata'

    print(f"Processing: {label_job_name}")
    skip_count = 0
    with jsonlines.open(f, 'r') as reader:
        for desc in tqdm(reader):
            # A record with no label attribute, or with an empty/missing
            # annotation list, carries no boxes and is dropped.
            annotations = desc.get(label_job_name, {}).get('annotations', [])
            if len(annotations) == 0:
                skip_count += 1
                continue

            y_vector = [0] * n_class
            for label in annotations:
                class_id = label['class_id']
                # Reject out-of-range ids explicitly; a negative id would
                # otherwise wrap around and be counted against the wrong class.
                if not 0 <= class_id < n_class:
                    raise IndexError(
                        f"class_id {class_id} in {f} is outside the range "
                        f"0..{n_class - 1}; check n_class"
                    )
                y_vector[class_id] += 1
                label_agg[class_id] += 1

            # Rename the job-specific attribute and its metadata so that every
            # record uses the same (main) job name. The annotation list itself
            # is carried over unchanged.
            desc[main_job_name] = desc.pop(label_job_name)
            desc[main_job_name_meta] = desc.pop(label_job_meta)

            x_list.append([desc])
            y_list.append(y_vector)

    # Reported once per manifest file.
    print(f"{skip_count} images had no annotations and were dropped from the training set")

In [None]:
# Summarise the dataset: per-class annotation totals, number of kept samples,
# and the approximate 60/20/20 split sizes.

num_samples = len(x_list)
approx_split_sizes = [int(num_samples * fraction) for fraction in (.6, .2, .2)]

print('label distribution: ', label_agg)
print('num_samples: ', num_samples)
print('train, val, test: ', *approx_split_sizes)

In [None]:
# The stratified split operates on numpy arrays, so convert both lists and
# show their shapes together with the first sample and its label vector.

x_arr, y_arr = np.array(x_list), np.array(y_list)

first_sample, first_label_vector = x_arr[0], y_arr[0]
print(x_arr.shape, y_arr.shape, '\n', first_sample, '\n', first_label_vector)

In [None]:
# 60/20/20 split train/val/test

# Seed NumPy's global RNG so the split is the same on every run.
# NOTE(review): scikit-multilearn's iterative stratification is understood to
# break ties with NumPy's global random state - confirm against the installed version.
split_seed = 42
np.random.seed(split_seed)

# First hold out 40% of the samples, then cut that hold-out set in half to get
# the validation and test sets (20% each of the full dataset).
x_train, y_train, x_hold, y_hold = iterative_train_test_split(x_arr, y_arr, test_size=0.4)
x_val, y_val, x_test, y_test = iterative_train_test_split(x_hold, y_hold, test_size=0.5)

In [None]:
# Compare how often each order-2 label combination occurs in every split.

from skmultilearn.model_selection.measures import get_combination_wise_output_matrix, get_indicator_representation
from collections import Counter
import pandas as pd

labels_by_split = {'train': y_train, 'val': y_val, 'test': y_test}

# For every split, count the string form of each label combination found in its rows.
combination_counts = {
    split_name: Counter(
        str(combination)
        for row in get_combination_wise_output_matrix(split_labels, order=2)
        for combination in row
    )
    for split_name, split_labels in labels_by_split.items()
}

# One row per split, one column per combination; combinations absent from a split show 0.
pd.DataFrame(combination_counts).T.fillna(0.0)

In [None]:
# Manifest creation: write one JSON-lines manifest per split, plus a combined
# manifest containing every record (train first, then val, then test).

records_by_split = {'train': x_train, 'val': x_val, 'test': x_test}

for split_name, split_records in records_by_split.items():
    with open(f'{main_job_name}-{split_name}.manifest', 'w') as manifest_file:
        for record in tqdm(split_records):
            # Each row is a one-element array holding the record dict.
            manifest_file.write(json.dumps(record[0]) + '\n')

with open(f'{main_job_name}-all.manifest', 'w') as f_all:
    for split_records in records_by_split.values():
        for record in tqdm(split_records):
            f_all.write(json.dumps(record[0]) + '\n')

In [None]:
# Upload the manifests to S3.
# Local files are named "<job>-<split>.manifest"; the S3 objects are stored as
# "training/<job>/<job>_<split>.manifest".
for split_name in ('train', 'val', 'test', 'all'):
    local_path = f'{main_job_name}-{split_name}.manifest'
    s3_key = f'training/{main_job_name}/{main_job_name}_{split_name}.manifest'
    s3_client.upload_file(local_path, bucket_name, s3_key)