# Defect Detection at the edge using Amazon SageMaker - Data preparation and preprocessing
In this notebook, we will download the dataset and preprocess it so that it can be used with the provided training pipelines.

In [None]:
import boto3
import time
import uuid
import json
import numpy as np
import pandas as pd
from PIL import Image
import glob, os
from shutil import copyfile
import sagemaker

# STS client, used here to look up the identity of the current caller
sts_client = boto3.client('sts')

# Get the id of the AWS account that the notebook credentials belong to
account_id = sts_client.get_caller_identity()["Account"]

# Project Name as defined in your CloudFormation template.
# NOTE: this is empty by default and has to be filled in before running the notebook;
# with an empty value the bucket name built below contains two consecutive dashes.
PROJECT_NAME = ''

# Region of the default boto3 session and the SageMaker execution role
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
# Bucket name pattern: sm-edge-workshop-<project name>-<account id>
# (presumably the bucket is created by the CloudFormation template - TODO confirm)
bucket_name = 'sm-edge-workshop-%s-%s' % (PROJECT_NAME, account_id)

In [None]:
# Download the dataset
!mkdir ./data
!wget -P ./data http://go.vicos.si/kolektorsdd2

In [None]:
# Extract it
!unzip ./data/kolektorsdd2 -d ./data/kolektor 

In [None]:
# Define some utilities

def img_read(path):
    """Open the image file at ``path`` and return its content as a numpy array."""
    with Image.open(path) as handle:
        return np.asarray(handle)

def img_is_anomalous(img):
    """Return True when a mask is not completely black.

    A mask counts as anomalous as soon as the mean of its values is greater
    than zero; otherwise False is returned.
    """
    mean_intensity = np.mean(img)
    return bool(mean_intensity > 0)
 
def sort_img_by_mask(mask_file, dir_normal, dir_anomalous):
    """Copy the image that belongs to a mask into the matching directory.

    The image is looked up next to the mask, under the same file name without
    the ``_GT`` suffix (``123_GT.png`` -> ``123.png``). It is copied to
    ``dir_anomalous`` when the mask is anomalous and to ``dir_normal``
    otherwise, keeping its file name.

    Args:
        mask_file: Path of the ground-truth mask file.
        dir_normal: Directory that receives images with a black mask.
        dir_anomalous: Directory that receives images with a non-black mask.
    """
    mask_dir, mask_name = os.path.split(mask_file)
    stem, ext = os.path.splitext(mask_name)
    # Strip the marker from the file name only: replacing '_GT' in the whole
    # path would also rewrite any parent directory that contains '_GT'.
    if stem.endswith('_GT'):
        stem = stem[:-len('_GT')]
    data_name = stem + ext
    data_img = os.path.join(mask_dir, data_name)

    if img_is_anomalous(img_read(mask_file)):
        target_dir = dir_anomalous
    else:
        target_dir = dir_normal
    copyfile(data_img, os.path.join(target_dir, data_name))

In [None]:
# Define the base directory where the files are located and get a list of all the mask files
# (the ground-truth masks are the PNG files whose name ends in "_GT").
directory = './data/kolektor/train/'
# glob.glob already returns a list, so no extra comprehension is needed
mask_files = glob.glob(os.path.join(directory, '*_GT.png'))

In [None]:
# Create folders for the preprocessed images.
# os.makedirs(..., exist_ok=True) creates missing parent folders as needed and
# does not fail when a folder already exists, so the cell can be re-run safely.
for _subdir in (
    'img-classification/normal',
    'img-classification/anomalous',
    'semantic-segmentation/images',
    'semantic-segmentation/masks',
):
    os.makedirs(os.path.join('./data/kolektor-preprocessed', _subdir), exist_ok=True)

In [None]:
# Build the image-classification dataset: for every mask file, copy the matching image into
# the "normal" or the "anomalous" folder. If the mask file is just black, we assume that
# there is no anomaly and thus categorize the image as "normal".

# Target folders for the two classes (created by the folder-creation cell above)
dir_normal = './data/kolektor-preprocessed/img-classification/normal'
dir_anomalous = './data/kolektor-preprocessed/img-classification/anomalous'

# mask_files is the list of *_GT.png paths collected earlier in the notebook
for mask_file in mask_files:
 sort_img_by_mask(mask_file, dir_normal, dir_anomalous)

In [None]:
# Sort the files into different folders for their masks and base images.
# Masks are stored under the file name of their base image (without "_GT"),
# so that an image and its mask share the same name in both folders.

all_files = glob.glob(os.path.join(directory, '*.png'))
dir_images = './data/kolektor-preprocessed/semantic-segmentation/images'
dir_masks = './data/kolektor-preprocessed/semantic-segmentation/masks'

for img_path in all_files:
    file_name = os.path.basename(img_path)
    stem, ext = os.path.splitext(file_name)
    # Decide on the file name only: testing the whole path would classify
    # every image as a mask if a parent directory happened to contain '_GT'.
    if stem.endswith('_GT'):
        # image is mask, sort into mask subdirectory
        copyfile(img_path, os.path.join(dir_masks, stem[:-len('_GT')] + ext))
    else:
        copyfile(img_path, os.path.join(dir_images, file_name))

In [None]:
# Copy to S3 bucket: recursively upload everything under ./data/kolektor-preprocessed/
# to the data/ prefix of the bucket. $bucket_name is expanded by IPython from the
# notebook variable of the same name; --quiet suppresses the per-file output.
!aws s3 cp --recursive --quiet ./data/kolektor-preprocessed/ s3://$bucket_name/data/