# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import sys
import os
import subprocess

# Install packages before executing the rest of the script. You can also build your own
# custom container with your individual dependencies if needed.
subprocess.check_call([sys.executable, "-m", "pip", "install", "Augmentor", "wget", "mxnet", "opencv-python"])
os.system("apt-get update -y")
os.system("apt-get install ffmpeg libsm6 libxext6 -y")

import argparse
import json
import warnings
import pandas as pd
import numpy as np
from glob import glob
from datetime import datetime
import shutil
import wget
from PIL import Image
import Augmentor
from sklearn.model_selection import train_test_split

# Constants
# The "folders" (key prefixes) in the S3 bucket which define which images are good or bad
PREFIX_NAME_NORMAL = 'normal'
PREFIX_NAME_ANOMALOUS = 'anomalous'

# Download the im2rec.py tool for RecordIO conversion
filename_im2rec_tool = wget.download("https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py")


def augment_data(path, sample_count):
    """Augments the image dataset in the given path by applying random brightness and
    contrast changes (rotation and zoom operations are available but disabled below)."""
    p = Augmentor.Pipeline(path, output_directory=path)
    # Define augmentation operations
    # p.rotate(probability=0.4, max_left_rotation=8, max_right_rotation=8)
    # p.zoom(probability=0.3, min_factor=1.1, max_factor=1.3)
    p.random_brightness(probability=0.3, min_factor=0.4, max_factor=0.9)
    p.random_contrast(probability=0.2, min_factor=0.9, max_factor=1.1)
    p.sample(sample_count)


def split_dataset(path, split=0.1):
    """Splits the image paths and their labels into train, validation and test sets
    and returns them as (train, validation, test) tuples."""
    label_map = {
        'good': 0,
        'bad': 1
    }
    bad = sorted(glob(os.path.join(path, "%s/*" % PREFIX_NAME_ANOMALOUS)))
    good = sorted(glob(os.path.join(path, "%s/*" % PREFIX_NAME_NORMAL)))
    images = bad + good
    labels = ([label_map['bad']] * len(bad)) + ([label_map['good']] * len(good))

    total_size = len(images)
    valid_size = int(split * total_size)
    test_size = int(split * total_size)
    print('Total number of samples (normal and anomalous):', total_size)

    # Split images and labels in a single call each so that they always stay aligned
    train_x, valid_x, train_y, valid_y = train_test_split(images, labels, test_size=valid_size, random_state=42)
    train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, test_size=test_size, random_state=42)
    return (train_x, train_y), (valid_x, valid_y), (test_x, test_y)


def resize_images(path, width, height):
    """Resizes all images in a given path (in-place).
    Please note that this method overwrites the existing images in the path."""
    files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg'))
    for file in files:
        im = Image.open(file)
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the equivalent filter
        im_resized = im.resize((width, height), Image.LANCZOS)
        im_resized.save(file)


def get_square_image(img):
    """Returns a squared image by adding black padding around the shorter dimension."""
    padding_color = (0, 0, 0)
    width, height = img.size
    if width == height:
        return img
    elif width > height:
        result = Image.new(img.mode, (width, width), padding_color)
        result.paste(img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(img.mode, (height, height), padding_color)
        result.paste(img, ((height - width) // 2, 0))
        return result
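
# Illustrative usage of get_square_image, kept as a comment so it does not execute
# inside the processing job (the 200x100 example size is hypothetical):
#
#   im = Image.new('RGB', (200, 100))    # landscape input
#   squared = get_square_image(im)       # -> 200x200, content centered vertically,
#   assert squared.size == (200, 200)    #    black padding added above and below
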
def square_images(path):
    """Squares all images in a given path (in-place).
    Please note that this method overwrites the existing images in the path."""
    files = glob(os.path.join(path, '*.png')) + glob(os.path.join(path, '*.jpg'))
    for file in files:
        im = Image.open(file)
        im_squared = get_square_image(im)
        im_squared.save(file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--augment-count-normal', type=int, default=0)
    parser.add_argument('--augment-count-anomalous', type=int, default=0)
    parser.add_argument('--image-width', type=int, default=224)
    parser.add_argument('--image-height', type=int, default=224)
    parser.add_argument('--split', type=float, default=0.1)
    args, _ = parser.parse_known_args()
    print('Received arguments {}'.format(args))

    # Define the paths
    input_data_base_path = '/opt/ml/processing/input'
    train_output_base_path = '/opt/ml/processing/train'
    test_output_base_path = '/opt/ml/processing/test'
    val_output_base_path = '/opt/ml/processing/val'
    report_output_base_path = '/opt/ml/processing/report'
    temp_data_base_path = '/opt/ml/processing/tmp'
    input_path_normal = os.path.join(input_data_base_path, PREFIX_NAME_NORMAL)
    input_path_anomalous = os.path.join(input_data_base_path, PREFIX_NAME_ANOMALOUS)

    # The image size used
    IMAGE_WIDTH = int(args.image_width)
    IMAGE_HEIGHT = int(args.image_height)

    # Augment images if needed
    # TODO: Only augment training images, not the entire dataset!
    print('Augmenting images...')
    augment_data(input_path_normal, int(args.augment_count_normal))
    augment_data(input_path_anomalous, int(args.augment_count_anomalous))

    # Square all images by adding black padding so that only squared images exist in the training dataset.
    # IMPORTANT: Make sure you apply the same transformation when running inference
    # (see the illustrative sketch at the bottom of this file).
    print('Squaring all images that are not squared already...')
    square_images(input_path_normal)
    square_images(input_path_anomalous)

    # Resize the images in-place in the container
    print('Resizing images...')
    resize_images(input_path_normal, IMAGE_WIDTH, IMAGE_HEIGHT)
    resize_images(input_path_anomalous, IMAGE_WIDTH, IMAGE_HEIGHT)

    # Create the train/validation/test split
    # FIXME: only augment the train dataset, not the test dataset!
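    # The .lst files written below follow the im2rec list format: one image per line
    # with tab-separated "<index>\t<label>\t<path>" fields, for example (the paths
    # are illustrative):
    #
    #   0	1	/opt/ml/processing/input/anomalous/0001.png
    #   1	0	/opt/ml/processing/input/normal/0001.png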
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = split_dataset(input_data_base_path, split=float(args.split))

    # Create list files for the RecordIO conversion
    base_dir_recordio = './'
    with open(base_dir_recordio + 'train.lst', 'w+') as f:
        for indx, s in enumerate(train_x):
            f.write(f'{indx}\t{train_y[indx]}\t{s}\n')
    with open(base_dir_recordio + 'val.lst', 'w+') as f:
        for indx, s in enumerate(valid_x):
            f.write(f'{indx}\t{valid_y[indx]}\t{s}\n')
    with open(base_dir_recordio + 'test.lst', 'w+') as f:
        for indx, s in enumerate(test_x):
            f.write(f'{indx}\t{test_y[indx]}\t{s}\n')

    # Run the im2rec.py tool to convert the images to RecordIO
    print('Running im2rec.py tool for RecordIO conversion')
    os.system('python3 ./im2rec.py train.lst ./')
    os.system('python3 ./im2rec.py val.lst ./')
    os.system('python3 ./im2rec.py test.lst ./')

    # Copy the RecordIO files to the output paths
    shutil.copy('train.rec', os.path.join(train_output_base_path, 'train.rec'))
    shutil.copy('val.rec', os.path.join(val_output_base_path, 'val.rec'))
    shutil.copy('test.rec', os.path.join(test_output_base_path, 'test.rec'))

    # Save the preprocessing report to make this information available to downstream steps
    preprocessing_report = {
        'preprocessing': {
            'dataset': {
                'num_training_samples': len(train_x),
                'num_test_samples': len(test_x),
                'num_val_samples': len(valid_x)
            }
        }
    }
    print('Preprocessing report:', preprocessing_report)

    report_output_path = os.path.join(report_output_base_path, 'preprocessing_report.json')
    with open(report_output_path, "w") as f:
        f.write(json.dumps(preprocessing_report))
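
    # Illustrative sketch of the matching inference-time preprocessing referenced
    # above: the same square-then-resize transform must be applied to every image
    # sent to the trained model. 'payload.jpg' is a hypothetical input file.
    #
    #   im = Image.open('payload.jpg')
    #   im = get_square_image(im)
    #   im = im.resize((IMAGE_WIDTH, IMAGE_HEIGHT), Image.LANCZOS)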