{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare dataset \n",
    "\n",
    "This notebook walks through taking one or more SageMaker Ground Truth output manifest files and performing a stratified split of the dataset into train/val/test manifests for model training. \n",
    "\n",
    "### INPUTS \n",
    "- SageMaker Ground Truth output manifest file(s) \n",
    "- Number of classes\n",
    "\n",
    "### OUTPUTS \n",
    "- train/val/test split manifest files (plus a combined `all` manifest) uploaded to S3 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports, run-time package installs and AWS handles for the notebook.\n",
    "import boto3\n",
    "import re\n",
    "import sagemaker\n",
    "from sagemaker import get_execution_role\n",
    "import time\n",
    "from time import gmtime, strftime\n",
    "import json\n",
    "import random\n",
    "# jsonlines is installed immediately before it is imported.\n",
    "!pip install jsonlines\n",
    "import jsonlines\n",
    "import os\n",
    "from itertools import islice\n",
    "import numpy as np\n",
    "# Installed right before the skmultilearn import that follows.\n",
    "!pip install scikit-multilearn arff\n",
    "from skmultilearn.model_selection import iterative_train_test_split\n",
    "\n",
    "!pip install tqdm\n",
    "from tqdm import tqdm\n",
    "\n",
    "from matplotlib import pyplot as plt \n",
    "%matplotlib inline\n",
    "\n",
    "# NOTE(review): ground_truth_od is not installed by this cell; presumably a project-local\n",
    "# module that must be importable from the notebook's directory - confirm.\n",
    "from ground_truth_od import BoundingBox, WorkerBoundingBox, \\\n",
    "    GroundTruthBox, BoxedImage\n",
    "\n",
    "\n",
    "# AWS handles. Within this notebook only `s3_client` is used by later cells\n",
    "# (manifest download and upload); `role`, `sess` and `s3` are not referenced again.\n",
    "role = get_execution_role()\n",
    "sess = sagemaker.Session()\n",
    "s3 = boto3.resource('s3')\n",
    "s3_client = boto3.client('s3')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train/Validation/Test Split \n",
    "\n",
    "The dataset is split 60/20/20 into train/validation/test sets, stratified by the number of instances of each class. \n",
    "\n",
    "We will be picking the subset detailed above for training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download each SageMaker GroundTruth output manifest into the working directory\n",
    "\n",
    "bucket_name = 'obj-nonprod-training'\n",
    "gt_keys = [\n",
    "    'folder/labeling_job1/manifests/output/output.manifest',\n",
    "    'folder/labeling_job2/manifests/output/output.manifest',\n",
    "]\n",
    "\n",
    "for manifest_key in gt_keys:\n",
    "    # The second path segment of the key becomes the prefix of the local file name.\n",
    "    key_segment = manifest_key.split('/')[1]\n",
    "    local_manifest = f\"{key_segment}-output.manifest\"\n",
    "    s3_client.download_file(bucket_name, manifest_key, local_manifest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# UPDATE THESE VARIABLES! \n",
    "main_job_name = '<OUTPUT_DATASET_NAME>' # The labeling job name you'd like all instances in this dataset to follow\n",
    "n_class = 5 # Number of classes in your dataset\n",
    "\n",
    "main_job_name_meta = main_job_name + '-metadata'\n",
    "\n",
    "x_list = []  # one [record] row per kept image; the extra list level yields a 2-D array later\n",
    "y_list = []  # per-image annotation counts, indexed by class_id\n",
    "label_agg = [0] * n_class  # annotation counts per class across every manifest read\n",
    "\n",
    "# Read every file in the current directory whose name ends with `output.manifest`.\n",
    "# The listing is sorted so the order of the collected records does not depend on\n",
    "# the order in which the filesystem happens to return the files.\n",
    "for f in sorted(os.listdir('.')):\n",
    "    if not f.endswith('output.manifest'):\n",
    "        continue\n",
    "    # Drop the 16-character '-output.manifest' tail; what remains is the key under\n",
    "    # which this job's annotations are looked up in each record.\n",
    "    label_job_name = f[:-16]\n",
    "    if not label_job_name:\n",
    "        # A file with nothing in front of the tail gives no job name to look up.\n",
    "        continue\n",
    "\n",
    "    print(f\"Processing: {label_job_name}\")\n",
    "    with jsonlines.open(f, 'r') as reader:\n",
    "        skip_count = 0\n",
    "        for desc in tqdm(reader):\n",
    "            annotations = desc.get(label_job_name, {'annotations': []})['annotations']\n",
    "            if len(annotations) == 0:\n",
    "                skip_count += 1\n",
    "                continue\n",
    "\n",
    "            y_vector = [0] * n_class\n",
    "            for label in annotations:\n",
    "                class_id = label['class_id']\n",
    "                # A negative id would silently wrap around to another class, and an id\n",
    "                # >= n_class would fail without saying which file or value was at fault.\n",
    "                if not 0 <= class_id < n_class:\n",
    "                    raise IndexError(\n",
    "                        f\"{f}: class_id {class_id} is outside the range 0..{n_class - 1}; check n_class\"\n",
    "                    )\n",
    "                y_vector[class_id] += 1\n",
    "                label_agg[class_id] += 1\n",
    "\n",
    "            # Re-key the label and metadata entries under the unified job name.\n",
    "            desc[main_job_name] = desc.pop(label_job_name)\n",
    "            desc[main_job_name_meta] = desc.pop(label_job_name + '-metadata')\n",
    "\n",
    "            x_list.append([desc])\n",
    "            y_list.append(y_vector)\n",
    "\n",
    "        print(f\"{skip_count} images had no annotations and were dropped from the training set\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class balance and approximate split sizes before stratification\n",
    "\n",
    "n_samples = len(x_list)\n",
    "split_fractions = (.6, .2, .2)  # train, val, test\n",
    "\n",
    "print('label distribution: ', label_agg)\n",
    "print('num_samples: ', n_samples)\n",
    "print('train, val, test: ', *(int(n_samples * fraction) for fraction in split_fractions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the collected records and label-count vectors into numpy arrays for the\n",
    "# stratified splitter, then show their shapes and first rows.\n",
    "\n",
    "x_arr = np.asarray(x_list)\n",
    "y_arr = np.asarray(y_list)\n",
    "\n",
    "print(\n",
    "    x_arr.shape, y_arr.shape,\n",
    "    '\\n', x_arr[0],\n",
    "    '\\n', y_arr[0],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 60,20,20 split train/val/test\n",
    "\n",
    "# iterative_train_test_split is called without a seed, so NumPy's global RNG is\n",
    "# seeded here to make the split repeatable after a kernel restart.\n",
    "# NOTE(review): assumes scikit-multilearn draws its random tie-breaks from NumPy's\n",
    "# global RNG - confirm against the installed version.\n",
    "SPLIT_SEED = 42\n",
    "np.random.seed(SPLIT_SEED)\n",
    "\n",
    "# Carve off 40% as a hold-out first, then halve the hold-out into validation and test.\n",
    "x_train, y_train, x_hold, y_hold = iterative_train_test_split(x_arr, y_arr, test_size=0.4)\n",
    "x_val, y_val, x_test, y_test = iterative_train_test_split(x_hold, y_hold, test_size=0.5)\n",
    "\n",
    "# Every sample must land in exactly one of the three splits.\n",
    "assert len(x_train) + len(x_val) + len(x_test) == len(x_arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare how label combinations are distributed across the three splits\n",
    "\n",
    "from skmultilearn.model_selection.measures import get_combination_wise_output_matrix, get_indicator_representation\n",
    "from collections import Counter\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def _combination_counts(labels):\n",
    "    \"\"\"Tally the stringified entries of each row returned by get_combination_wise_output_matrix(labels, order=2).\"\"\"\n",
    "    rows = get_combination_wise_output_matrix(labels, order=2)\n",
    "    return Counter(str(combination) for row in rows for combination in row)\n",
    "\n",
    "\n",
    "# One row per split, one column per combination; combinations absent from a split show 0.0.\n",
    "pd.DataFrame({\n",
    "    'train': _combination_counts(y_train),\n",
    "    'val': _combination_counts(y_val),\n",
    "    'test': _combination_counts(y_test),\n",
    "}).T.fillna(0.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the train/val/test manifests plus a combined one to the working directory\n",
    "\n",
    "def _write_manifest(path, *row_groups):\n",
    "    \"\"\"Write the first element of every row, group by group, to `path` as one JSON document per line.\"\"\"\n",
    "    with open(path, 'w') as manifest:\n",
    "        for rows in row_groups:\n",
    "            for row in tqdm(rows):\n",
    "                manifest.write(json.dumps(row[0]) + '\\n')\n",
    "\n",
    "\n",
    "_write_manifest(f'{main_job_name}-train.manifest', x_train)\n",
    "_write_manifest(f'{main_job_name}-val.manifest', x_val)\n",
    "_write_manifest(f'{main_job_name}-test.manifest', x_test)\n",
    "# The combined manifest lists the train rows first, then val, then test.\n",
    "_write_manifest(f'{main_job_name}-all.manifest', x_train, x_val, x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload to S3 \n",
    "# Local files are named '<job>-<split>.manifest'; the S3 keys use '<job>_<split>.manifest'.\n",
    "for split_name in ('train', 'val', 'test', 'all'):\n",
    "    s3_client.upload_file(\n",
    "        f'{main_job_name}-{split_name}.manifest',\n",
    "        bucket_name,\n",
    "        f'training/{main_job_name}/{main_job_name}_{split_name}.manifest',\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}