{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Defect Detection at the edge using Amazon SageMaker - Data preparation and preprocessing\n",
    "In this notebook, we will download the dataset and preprocess it so that it can be used with the provided training pipelines."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import boto3\n",
    "import time\n",
    "import uuid\n",
    "import json\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from PIL import Image\n",
    "import glob, os\n",
    "from shutil import copyfile\n",
    "import sagemaker\n",
    "\n",
    "sts_client = boto3.client('sts')\n",
    "\n",
    "# Get the account id; it is part of the bucket name built below\n",
    "account_id = sts_client.get_caller_identity()[\"Account\"]\n",
    "\n",
    "# Project Name as defined in your CloudFormation template\n",
    "PROJECT_NAME = '<YOUR PROJECT NAME>'\n",
    "\n",
    "# Fail fast if the placeholder was not replaced: the bucket name would be invalid and the\n",
    "# '<' / '>' characters would be treated as redirections by the shell in the upload cell.\n",
    "if not PROJECT_NAME or '<' in PROJECT_NAME or '>' in PROJECT_NAME:\n",
    "    raise ValueError('Please set PROJECT_NAME to the project name defined in your CloudFormation template.')\n",
    "\n",
    "region = boto3.Session().region_name\n",
    "role = sagemaker.get_execution_role()\n",
    "bucket_name = f'sm-edge-workshop-{PROJECT_NAME}-{account_id}'"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Download the dataset. Safe to re-run: -p tolerates an existing folder and -nc skips the\n",
    "# download when the archive is already present (delete a partial archive to fetch it again).\n",
    "!mkdir -p ./data\n",
    "!wget -nc -P ./data http://go.vicos.si/kolektorsdd2"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Extract it (-o: overwrite without prompting so the cell can be re-run, -q: keep the output short)\n",
    "!unzip -o -q ./data/kolektorsdd2 -d ./data/kolektor"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Define some utilities\n",
    "\n",
    "def img_read(path):\n",
    "    \"\"\"Load the image stored at `path` and return its pixel data as a numpy array.\"\"\"\n",
    "    with Image.open(path) as image:\n",
    "        return np.asarray(image)\n",
    "\n",
    "def img_is_anomalous(img):\n",
    "    \"\"\"Return True when the mask is not entirely black, i.e. its mean pixel value is above zero.\"\"\"\n",
    "    return bool(np.mean(img) > 0)\n",
    "    \n",
    "def sort_img_by_mask(mask_file, dir_normal, dir_anomalous):\n",
    "    \"\"\"Copy the image belonging to a mask into the class directory selected by that mask.\n",
    "\n",
    "    Args:\n",
    "        mask_file: Path of the ground-truth mask, named `<image name>_GT<extension>`.\n",
    "        dir_normal: Target directory for images whose mask is entirely black.\n",
    "        dir_anomalous: Target directory for images whose mask is not entirely black.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the file name of `mask_file` does not end with `_GT`.\n",
    "    \"\"\"\n",
    "    mask_dir, mask_name = os.path.split(mask_file)\n",
    "    stem, extension = os.path.splitext(mask_name)\n",
    "    if not stem.endswith('_GT'):\n",
    "        raise ValueError('Not a mask file (expected a name ending in _GT): %s' % mask_file)\n",
    "\n",
    "    # Only strip the suffix from the file name; a plain str.replace on the whole path would also\n",
    "    # alter directory names containing '_GT' and point to a non-existing image.\n",
    "    img_name = stem[:-len('_GT')] + extension\n",
    "    data_img = os.path.join(mask_dir, img_name)\n",
    "\n",
    "    target_dir = dir_anomalous if img_is_anomalous(img_read(mask_file)) else dir_normal\n",
    "    copyfile(data_img, os.path.join(target_dir, img_name))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Define the base directory where the files are located and get a list of all the mask files\n",
    "directory = './data/kolektor/train/'\n",
    "mask_files = sorted(glob.glob(os.path.join(directory, '*_GT.png')))\n",
    "\n",
    "# Fail fast when the dataset was not downloaded/extracted to the expected location\n",
    "assert mask_files, 'No mask files (*_GT.png) found in %s' % directory"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Create folders for the preprocessed images (-p creates missing parents and tolerates re-runs)\n",
    "!mkdir -p ./data/kolektor-preprocessed/img-classification/normal\n",
    "!mkdir -p ./data/kolektor-preprocessed/img-classification/anomalous\n",
    "\n",
    "!mkdir -p ./data/kolektor-preprocessed/semantic-segmentation/images\n",
    "!mkdir -p ./data/kolektor-preprocessed/semantic-segmentation/masks"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Split the images into the two classification classes based on their mask file.\n",
    "# A mask that is just black is assumed to mean \"no anomaly\", so its image is categorized as \"normal\".\n",
    "\n",
    "dir_normal = './data/kolektor-preprocessed/img-classification/normal'\n",
    "dir_anomalous = './data/kolektor-preprocessed/img-classification/anomalous'\n",
    "\n",
    "for mask_file in mask_files:\n",
    "    sort_img_by_mask(mask_file=mask_file, dir_normal=dir_normal, dir_anomalous=dir_anomalous)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Sort the files into different folders for their masks and base images\n",
    "\n",
    "all_files = glob.glob(os.path.join(directory, '*.png'))\n",
    "dir_images = './data/kolektor-preprocessed/semantic-segmentation/images'\n",
    "dir_masks = './data/kolektor-preprocessed/semantic-segmentation/masks'\n",
    "\n",
    "mask_suffix = '_GT.png'\n",
    "\n",
    "for img_path in all_files:\n",
    "    # Decide on the file name only, so that a '_GT' somewhere in the directory path\n",
    "    # cannot turn every image into a mask.\n",
    "    file_name = os.path.basename(img_path)\n",
    "    if file_name.endswith(mask_suffix):\n",
    "        # image is mask, store it in the mask subdirectory under the name of its base image\n",
    "        image_name = file_name[:-len(mask_suffix)] + '.png'\n",
    "        copyfile(img_path, os.path.join(dir_masks, image_name))\n",
    "    else:\n",
    "        copyfile(img_path, os.path.join(dir_images, file_name))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Upload the preprocessed data to the S3 bucket, under the data/ prefix\n",
    "!aws s3 cp --recursive --quiet ./data/kolektor-preprocessed/ s3://{bucket_name}/data/"
   ],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}