# This notebook crops the center part of each annotation and puts the resulting pixels in a pandas DataFrame to be used as training data for an XGBoost algorithm

## Section 1

In [2]:
import xml.etree.ElementTree as ET
import csv
import os
from PIL import Image
import numpy as np
import argparse
import random
import shutil
import pandas as pd
import glob
import boto3
import re
from sagemaker import get_execution_role
import PIL.Image
import matplotlib.pyplot as plt
from sagemaker.predictor import csv_serializer
import io

# Execution role returned by the SageMaker SDK for the current notebook session.
role = get_execution_role()
# Region name of the default boto3 session; used further down to build the bucket URL.
region = boto3.Session().region_name

In [3]:
##############################################################
## Walk a folder (including sub-folders) and return a list with the
#  names, without extension, of the .JPG / .png files it contains
##############################################################
def get_filename_list(path, extensions=('.JPG', '.png')):
    """Return the extension-less names of the image files found under ``path``.

    The folder is walked recursively. A file is kept only when its real
    extension (the part after the last dot) is one of ``extensions``; the
    comparison is case-sensitive, matching the '.JPG' / '.png' spelling used
    in the rest of this notebook.

    Args:
        path: Root folder to scan. A missing folder yields an empty list.
        extensions: Extensions (with leading dot) that count as images.

    Returns:
        List of file names with the extension removed, in ``os.walk`` order.
        Dots inside the name are preserved ('img.v2.JPG' -> 'img.v2').
    """
    names = []
    for _folder, _subdirs, files in os.walk(path):
        for filename in files:
            # splitext looks at the last dot only, so 'notes.JPG.txt' is not
            # mistaken for an image and 'img.v2.JPG' keeps its full stem.
            stem, ext = os.path.splitext(filename)
            if ext in extensions:
                names.append(stem)
    return names
##############################################################
## Get a folder with some files, create a list containing all 
#  paths to all files
##############################################################
def get_file_path_list(path):
    """Return the path of every file found under ``path``, sub-folders included.

    Each entry is the containing folder joined with the file name, listed in
    the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(path)
        for entry in entries
    ]

## Section 2 - Crop the annotations. Cropped images are saved in a folder named cropped_images_small

In [None]:
def crop_images(xml_path, images_path, classes_to_use, output_dir='cropped_images_small'):
    """Crop the central third of every annotated object and save it as a PNG.

    Every ``.xml`` file found under ``xml_path`` is parsed for ``<object>``
    elements (each with a ``<name>`` and a ``<bndbox>`` holding
    xmin/ymin/xmax/ymax). For each object whose name is in ``classes_to_use``
    the bounding box is shrunk by one third of its width/height on every side
    and the remaining center region is written to
    ``<output_dir>/<name>-<running index>.png``.

    Args:
        xml_path: Folder containing the annotation XML files (searched
            recursively).
        images_path: Folder containing the images. The image for
            ``foo.xml`` is expected at ``<images_path>/foo.JPG``.
        classes_to_use: Collection of object names to crop; other objects
            are ignored.
        output_dir: Folder receiving the crops. It is deleted and recreated
            on every call so crops from earlier runs never mix in.

    Raises:
        FileNotFoundError: If the image matching an annotation file is missing.
    """
    # Always start from an empty output folder.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    print("Storing cropped images in {} folder".format(output_dir))

    count = 0
    for annotation_file in get_file_path_list(xml_path):
        # os.path handles both '/' and '\\', so this works on Linux and Windows.
        stem, ext = os.path.splitext(os.path.basename(annotation_file))
        if ext.lower() != '.xml':
            # Skips .DS_Store and any other stray non-annotation file.
            continue

        root = ET.parse(annotation_file).getroot()
        image_file = os.path.join(images_path, stem + '.JPG')
        # The context manager releases the file handle once all crops are saved.
        with Image.open(image_file) as image:
            for obj in root.findall('object'):
                name = obj.find('name').text
                if name not in classes_to_use:
                    continue

                bndbox = obj.find('bndbox')
                xmin, ymin, xmax, ymax = (
                    int(bndbox.find(tag).text)
                    for tag in ('xmin', 'ymin', 'xmax', 'ymax')
                )
                # Keep only the middle third of the box in each direction.
                a = (xmax - xmin) / 3.0
                b = (ymax - ymin) / 3.0
                box = [int(xmin + a), int(ymin + b), int(xmax - a), int(ymax - b)]
                if box[2] <= box[0] or box[3] <= box[1]:
                    # Box too small to leave any pixel after shrinking.
                    continue

                image1 = image.crop(box)
                image1.save(os.path.join(output_dir, name + "-" + str(count) + ".png"),
                            "PNG", quality=80, optimize=True, progressive=True)
                count += 1

In [None]:
# Placeholders: replace with the folder holding the .JPG images and the
# folder holding the XML annotation files before running this cell.
img_path = 'path to your images'
xml_path = 'path to your XML files'
# Only annotated objects whose <name> is one of these classes are cropped.
classes = ['Corroded','Clean']
crop_images(xml_path, img_path,classes)

In [None]:
# List the crops written by crop_images. Building the list here keeps this cell
# runnable on a fresh kernel (Restart & Run All) instead of relying on a
# `files` variable that is only created in a later cell.
cropped_files = glob.glob(os.path.join('cropped_images_small', '*.png'))

# Crops are named '<class>-<index>.png', so the class is the file-name prefix
# before the first '-'. Matching on the base name avoids false hits from folder names.
Corroded = [p for p in cropped_files if os.path.basename(p).split('-')[0] == "Corroded"]
Clean = [p for p in cropped_files if os.path.basename(p).split('-')[0] == "Clean"]

print("Number of Corroded data points: ",len(Corroded))
print("Number of Clean data points: ",len(Clean))

## Section 3- Put the cropped images in a single data frame

In [None]:
# Build one table with a row per pixel: the class label (0 = Clean, 1 = Corroded)
# followed by the pixel's R, G and B values, then save it as data.csv.
crop_path = 'Path to your cropped images'
files = get_file_path_list(crop_path)

cols = ['class','R','G','B']

classes_to_use = ['Corroded','Clean']
dict1 = {'Clean': 0, 'Corroded': 1}

# Collect one frame per image and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all earlier rows every time.
frames = []
for file in files:
    # Crops are named '<class>-<index>.png'. os.path.basename copes with both
    # '/' and '\\' separators, so this also works outside Windows.
    name = os.path.basename(file).split('.')[0].split('-')[0]
    if name not in dict1:
        # Not one of the expected crops (e.g. .DS_Store); skip it.
        continue
    classname = dict1[name]
    with Image.open(file) as lbls:
        # Force three channels so the reshape below is valid even when a PNG
        # carries an alpha channel or is grayscale.
        imagenp = np.asarray(lbls.convert('RGB'))
    imagenp = imagenp.reshape(imagenp.shape[1]*imagenp.shape[0],3)
    dftemp = pd.DataFrame(imagenp, columns=['R','G','B'])
    dftemp.insert(0, 'class', classname)
    frames.append(dftemp)

# With no usable crops, still produce an empty table with the right header.
df = pd.concat(frames, axis=0) if frames else pd.DataFrame(columns=cols)
df.to_csv('data.csv', index=False)

## Section 4- Divide data into train and validation and upload to S3

In [4]:
# Name of the S3 bucket that receives the train/validation CSV files.
bucket = 'demo-corrosion' # custom bucket name.
# Key prefix; upload_to_s3 builds object keys as prefix/channel/filename.
prefix = 'csv'
# HTTPS URL of the bucket, built from the session region and the bucket name.
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)

In [5]:
def data_split(FILE_DATA, FILE_TRAIN, FILE_VALIDATION, PERCENT_VALIDATION, TARGET_VAR, random_state=None):
    """Shuffle a CSV and split it into header-less train and validation CSVs.

    The target column is moved to the first position before writing. Every
    input row ends up in exactly one of the two output files.

    Args:
        FILE_DATA: Path of the input CSV (with a header row).
        FILE_TRAIN: Path of the training CSV to write.
        FILE_VALIDATION: Path of the validation CSV to write.
        PERCENT_VALIDATION: Share of rows (0-100) that goes to validation.
            The training row count is truncated to an integer; the
            validation file receives all remaining rows.
        TARGET_VAR: Name of the target column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible. ``None`` keeps the shuffle random.

    Raises:
        ValueError: If ``PERCENT_VALIDATION`` is outside [0, 100].
        KeyError: If ``TARGET_VAR`` is not a column of the input file.
    """
    if not 0 <= PERCENT_VALIDATION <= 100:
        raise ValueError("PERCENT_VALIDATION must be between 0 and 100, got {}".format(PERCENT_VALIDATION))

    data = pd.read_csv(FILE_DATA)
    n = data.shape[0]

    # Make the first column the target feature
    target_pos = data.columns.get_loc(TARGET_VAR)
    cols = data.columns.tolist()
    cols.insert(0, cols.pop(target_pos))
    data = data.loc[:, cols]

    num_train = int(((100 - PERCENT_VALIDATION) / 100.0) * n)

    # Shuffle the data
    data = data.sample(frac=1, replace=False, random_state=random_state)

    # Split data: validation starts right where training stops, so no row is
    # dropped between the two slices.
    train_data = data.iloc[:num_train, :]
    valid_data = data.iloc[num_train:, :]

    train_data.to_csv(FILE_TRAIN, index=False, header=False)
    valid_data.to_csv(FILE_VALIDATION, index=False, header=False)
    
def write_to_s3(fobj, bucket, key):
    """Upload the open file object ``fobj`` to ``key`` in the S3 bucket ``bucket``.

    Returns whatever the object's ``upload_fileobj`` call returns.
    """
    s3 = boto3.Session().resource('s3')
    target = s3.Bucket(bucket).Object(key)
    return target.upload_fileobj(fobj)

def upload_to_s3(bucket, channel, filename):
    """Upload a local file to S3 under ``<prefix>/<channel>/<filename>``.

    Args:
        bucket: Name of the destination S3 bucket.
        channel: Sub-folder under the module-level ``prefix`` (this notebook
            uses 'train' and 'validation').
        filename: Path of the local file; it is also used as the last part
            of the object key.

    Returns:
        The ``s3://bucket/key`` URL of the uploaded object.
    """
    key = prefix+'/'+channel+'/'+filename
    url = 's3://{}/{}'.format(bucket, key)
    # The context manager closes the file even if the upload raises.
    with open(filename, 'rb') as fobj:
        print('Writing to {}'.format(url))
        write_to_s3(fobj, bucket, key)
    return(url)

In [6]:
# Input is the pixel table written by Section 3 ('data.csv'); pointing at any
# other file makes a fresh Restart & Run All fail with FileNotFoundError.
FILE_DATA = 'data.csv'
TARGET_VAR = 'class'
FILE_TRAIN = 'train.csv'
FILE_VALIDATION = 'validation.csv'
# Percentage of rows held out for validation.
PERCENT_VALIDATION = 20

data_split(FILE_DATA, FILE_TRAIN, FILE_VALIDATION, PERCENT_VALIDATION, TARGET_VAR)

In [7]:
# Upload the train and validation CSV files to the S3 bucket, each under its
# own channel folder; every call returns the s3:// URL of the uploaded object.
s3_train_loc = upload_to_s3(bucket = bucket, channel = 'train', filename = FILE_TRAIN)
s3_valid_loc = upload_to_s3(bucket = bucket, channel = 'validation', filename = FILE_VALIDATION)

Writing to s3://demo-corrosion/csv/train/train.csv
Writing to s3://demo-corrosion/csv/validation/validation.csv
