{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare dataset \n",
    "\n",
    "This notebook is an interactive flow that takes one or more SageMaker GroundTruth output manifest files and performs a stratified split of the dataset into train/val/test sets for model training. \n",
    "\n",
    "### INPUTS \n",
    "- SageMaker GroundTruth Output manifest file(s) \n",
    "- Number of classes\n",
    "\n",
    "### OUTPUTS \n",
    "- train/val/test split manifest files uploaded to S3 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install third-party dependencies first. %pip (unlike !pip) installs into the\n",
    "# environment of the running kernel, so the imports below can see the packages.\n",
    "%pip install -q jsonlines scikit-multilearn arff tqdm\n",
    "\n",
    "# Standard library\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import re\n",
    "import time\n",
    "from itertools import islice\n",
    "from time import gmtime, strftime\n",
    "\n",
    "# Third-party\n",
    "import boto3\n",
    "import jsonlines\n",
    "import numpy as np\n",
    "import sagemaker\n",
    "from matplotlib import pyplot as plt\n",
    "from sagemaker import get_execution_role\n",
    "from skmultilearn.model_selection import iterative_train_test_split\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Local helper module shipped next to this notebook\n",
    "from ground_truth_od import (BoundingBox, WorkerBoundingBox,\n",
    "                             GroundTruthBox, BoxedImage)\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "# AWS handles shared by the cells below\n",
    "role = get_execution_role()\n",
    "sess = sagemaker.Session()\n",
    "s3 = boto3.resource('s3')\n",
    "s3_client = boto3.client('s3')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train/Validation/Test Split \n",
    "\n",
    "The data is split 60/20/20 (train/val/test), stratified across the number of instances for each class. \n",
    "\n",
    "We will be picking the subset detailed above for training."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the SageMaker GroundTruth Labeling job outputs \n",
    "\n",
    "bucket_name = 'obj-nonprod-training'\n",
    "gt_keys = ['folder/labeling_job1/manifests/output/output.manifest',\n",
    "           'folder/labeling_job2/manifests/output/output.manifest']\n",
    "\n",
    "# The second path component of each key is used as the labeling job name, and each\n",
    "# manifest is saved locally as `<job name>-output.manifest` so the next cell can find it.\n",
    "for gt_key in gt_keys:\n",
    "    s3_client.download_file(bucket_name, gt_key, f\"{gt_key.split('/')[1]}-output.manifest\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# UPDATE THESE VARIABLES! \n",
    "main_job_name = ''  # The labeling job name you'd like all instances in this dataset to follow\n",
    "n_class = 5  # Number of classes in your dataset\n",
    "\n",
    "# Fail fast: an empty name would silently produce '' attribute keys and '-train.manifest' files.\n",
    "if not main_job_name:\n",
    "    raise ValueError('Set main_job_name to the labeling job name the merged dataset should use')\n",
    "\n",
    "main_job_name_meta = main_job_name + '-metadata'\n",
    "\n",
    "# Local naming convention produced by the download cell: `<job name>-output.manifest`\n",
    "MANIFEST_SUFFIX = '-output.manifest'\n",
    "\n",
    "x_list = []  # one single-element row [record] per kept image, so np.array() yields an (n, 1) column\n",
    "y_list = []  # per-image annotation counts, indexed by class_id\n",
    "label_agg = [0] * n_class  # annotation count per class over all manifests\n",
    "\n",
    "# Walk every GroundTruth output manifest in the working directory. sorted() makes the\n",
    "# record order (and therefore the split below) independent of the filesystem listing order.\n",
    "for manifest_file in sorted(os.listdir('.')):\n",
    "    if not manifest_file.endswith(MANIFEST_SUFFIX):\n",
    "        continue\n",
    "    label_job_name = manifest_file[:-len(MANIFEST_SUFFIX)]\n",
    "\n",
    "    print(f\"Processing: {label_job_name}\")\n",
    "    skip_count = 0\n",
    "    with jsonlines.open(manifest_file, 'r') as reader:\n",
    "        for desc in tqdm(reader):\n",
    "            annotations = desc.get(label_job_name, {}).get('annotations', [])\n",
    "            if len(annotations) == 0:\n",
    "                skip_count += 1\n",
    "                continue\n",
    "\n",
    "            y_vector = [0] * n_class\n",
    "            for label in annotations:\n",
    "                class_id = label['class_id']\n",
    "                # Reject out-of-range ids explicitly: a negative id would otherwise wrap\n",
    "                # around (Python negative indexing) and be counted under the wrong class.\n",
    "                if not 0 <= class_id < n_class:\n",
    "                    raise ValueError(\n",
    "                        f\"class_id {class_id} in {manifest_file} is outside 0..{n_class - 1}; check n_class\")\n",
    "                y_vector[class_id] += 1\n",
    "                label_agg[class_id] += 1\n",
    "\n",
    "            # Re-key the label attribute and its metadata so every record uses main_job_name\n",
    "            desc[main_job_name] = desc.pop(label_job_name)\n",
    "            desc[main_job_name_meta] = desc.pop(label_job_name + '-metadata')\n",
    "\n",
    "            x_list.append([desc])\n",
    "            y_list.append(y_vector)\n",
    "\n",
    "    print(f\"{skip_count} images had no annotations and were dropped from the training set\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's take a look at the distributions of our data \n",
    "\n",
    "n_samples = len(x_list)\n",
    "print('label distribution: ', label_agg)\n",
    "print('num_samples: ', n_samples)\n",
    "# Nominal 60/20/20 sizes; the stratified split below may differ slightly\n",
    "print('train, val, test: ', int(n_samples*.6), int(n_samples*.2), int(n_samples*.2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert them into numpy arrays for stratification and explore their shapes \n",
    "\n",
    "if not x_list:\n",
    "    raise ValueError('No annotated records were loaded; nothing to split')\n",
    "\n",
    "x_arr = np.array(x_list, dtype=object)  # (n, 1) column of manifest records\n",
    "y_arr = np.array(y_list)  # (n, n_class) per-image class counts\n",
    "\n",
    "print(x_arr.shape, y_arr.shape, '\\n', x_arr[0], '\\n', y_arr[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 60,20,20 split train/val/test\n",
    "\n",
    "# Seed NumPy's global RNG so re-running the notebook reproduces the same split\n",
    "SEED = 42\n",
    "np.random.seed(SEED)\n",
    "\n",
    "# First hold out 40%, then cut the hold-out in half to get validation and test\n",
    "x_train, y_train, x_hold, y_hold = iterative_train_test_split(x_arr, y_arr, test_size = 0.4)\n",
    "x_val, y_val, x_test, y_test = iterative_train_test_split(x_hold, y_hold, test_size = 0.5)\n",
    "\n",
    "assert len(x_train) + len(x_val) + len(x_test) == len(x_arr), 'splits do not cover the dataset'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explore the distributions\n",
    "\n",
    "from skmultilearn.model_selection.measures import get_combination_wise_output_matrix, get_indicator_representation\n",
    "from collections import Counter\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def combination_counts(y):\n",
    "    \"\"\"Count how often each order-2 label combination occurs in the label matrix `y`.\"\"\"\n",
    "    return Counter(str(combination) for row in get_combination_wise_output_matrix(y, order=2) for combination in row)\n",
    "\n",
    "\n",
    "# One row per split, one column per label combination; combinations absent from a split show 0\n",
    "pd.DataFrame({\n",
    "    'train': combination_counts(y_train),\n",
    "    'val' : combination_counts(y_val),\n",
    "    'test' : combination_counts(y_test)\n",
    "}).T.fillna(0.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Manifest creation +
upload to S3 \n",
    "\n",
    "def write_manifest(path, *splits):\n",
    "    \"\"\"Write the records of one or more splits to `path`, one JSON object per line.\n",
    "\n",
    "    Each split is an iterable of single-element rows (as produced by the split cell);\n",
    "    the record itself is element 0 of the row. Splits are written in the order given.\n",
    "    \"\"\"\n",
    "    with open(path, 'w') as manifest:\n",
    "        for split in splits:\n",
    "            for row in tqdm(split):\n",
    "                manifest.write(json.dumps(row[0]) + '\\n')\n",
    "\n",
    "\n",
    "write_manifest(f'{main_job_name}-train.manifest', x_train)\n",
    "write_manifest(f'{main_job_name}-val.manifest', x_val)\n",
    "write_manifest(f'{main_job_name}-test.manifest', x_test)\n",
    "# Combined manifest: train records first, then validation, then test\n",
    "write_manifest(f'{main_job_name}-all.manifest', x_train, x_val, x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload to S3 \n",
    "# Local files are named `<job>-<split>.manifest`; the S3 keys use `<job>_<split>.manifest`.\n",
    "for split_name in ('train', 'val', 'test', 'all'):\n",
    "    s3_client.upload_file(f'{main_job_name}-{split_name}.manifest', bucket_name, f'training/{main_job_name}/{main_job_name}_{split_name}.manifest')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}