In [None]:
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

import os
import shutil
from pathlib import Path
from dotenv import load_dotenv
# Load the notebook settings from the env file one directory above the notebook.
env_path = Path('..') / 'set.env'
load_dotenv(dotenv_path=env_path)


def _require_env(name):
    """Return the value of environment variable ``name``.

    Args:
        name: name of the environment variable to read.

    Returns:
        The variable's value as a string.

    Raises:
        RuntimeError: if the variable is not set. Failing here gives a clear
            message instead of a ``TypeError`` when a later cell concatenates
            ``None`` into a path.
    """
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(
            "Environment variable '" + name + "' is not set; "
            "define it in " + str(env_path) + " or in the process environment."
        )
    return value


# S3 bucket and key prefix the raw archive is downloaded from.
skin_cancer_bucket = _require_env('SKIN_CANCER_BUCKET')
skin_cancer_bucket_path = _require_env('SKIN_CANCER_BUCKET_PATH')
# Name of the extraction directory and file name of the downloaded archive.
skin_cancer_files = _require_env('SKIN_CANCER_FILES')
skin_cancer_files_ext = _require_env('SKIN_CANCER_FILES_EXT')
# Working directory; later cells concatenate it directly with file names.
base_dir = _require_env('BASE_DIR')

In [None]:
import boto3

# Clean up artifacts of previous runs.
# The notebook builds these two paths by plain concatenation (base_dir + name)
# when it downloads and extracts, so the cleanup has to test and remove exactly
# those paths. os.path.join(base_dir, name) only yields the same path when
# base_dir already ends with a separator.
extract_dir = base_dir + skin_cancer_files
archive_path = base_dir + skin_cancer_files_ext

if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)

if os.path.exists(archive_path):
    os.remove(archive_path)

# Output directory of the transformed data set and its packaged archive.
data_dir = os.path.join(base_dir, 'HAM10000')
transformed_archive = os.path.join(base_dir, 'HAM10000.tar.gz')

if os.path.exists(transformed_archive):
    os.remove(transformed_archive)

if os.path.exists(data_dir):
    shutil.rmtree(data_dir)

# Fetch the raw archive from S3 into the working directory.
s3 = boto3.client('s3')
s3.download_file(skin_cancer_bucket, skin_cancer_bucket_path + '/' + skin_cancer_files_ext, archive_path)

print('Download training data set from '+skin_cancer_bucket)

In [None]:
import torchtext
from numpy.random import seed
seed(101)
import pandas as pd
import numpy as np
import os

# Directory that receives the extracted archive, plus one sub-directory for
# each of the two image zip files it contains.
extract_dir = base_dir + skin_cancer_files
image_part_dirs = [extract_dir + '/HAM_images_part_' + str(part) for part in (1, 2)]

os.mkdir(extract_dir)
for part_dir in image_part_dirs:
    os.mkdir(part_dir)

print('Uncompress data set for transformation')

# Unpack the downloaded archive first, then the two image zips found inside it.
torchtext.utils.extract_archive(base_dir + skin_cancer_files_ext, extract_dir)
for part, part_dir in zip((1, 2), image_part_dirs):
    # Called inside a loop on purpose: the cell no longer ends on a bare
    # extract_archive(...) expression, so its return value is not echoed into
    # the notebook output.
    torchtext.utils.extract_archive(extract_dir + '/HAM10000_images_part_' + str(part) + '.zip', part_dir)

In [None]:
# Build the directory tree that the organised images are copied into:
#
#   data_dir/
#       train_dir/<class>/
#       val_dir/<class>/
#
# with one <class> folder per value of the 'dx' label used below.
lesion_classes = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

os.mkdir(data_dir)

# train_dir
train_dir = os.path.join(data_dir, 'train_dir')
os.mkdir(train_dir)

# val_dir
val_dir = os.path.join(data_dir, 'val_dir')
os.mkdir(val_dir)

print('Create training and validation dir under HAM10000')

# Inside each split we create a separate folder for each class.
# No per-class path variables are kept: later cells rebuild these paths from
# train_dir / val_dir, and binding `df` to a path string would reuse a name
# that later cells use for a DataFrame.
for split_dir in (train_dir, val_dir):
    for lesion_class in lesion_classes:
        os.mkdir(os.path.join(split_dir, lesion_class))

In [None]:
# Read the metadata table from the extracted archive directory.
metadata_path = base_dir + skin_cancer_files + '/HAM10000_metadata'
df_data = pd.read_csv(metadata_path)

# Preview the first rows.
df_data.head()

In [None]:
# Count the rows recorded for every lesion_id ...
images_per_lesion = df_data.groupby('lesion_id').count()

# ... keep the lesion_ids whose image_id count is exactly one, and turn
# lesion_id back into an ordinary column.
has_single_image = images_per_lesion['image_id'] == 1
df = images_per_lesion[has_single_image].reset_index()

df.head()

In [None]:
# Here we identify lesion_ids that have duplicate images and those that have
# only one image. At this point df holds the lesion_ids with exactly one image.

# Build the lookup once, as a set, so every membership test is O(1) instead of
# rebuilding and scanning a list for each row passed through .apply.
unique_lesion_ids = set(df['lesion_id'])


def identify_duplicates(x):
    """Classify a lesion_id by whether it has more than one image.

    Args:
        x: a lesion_id value.

    Returns:
        'no_duplicates' if x is one of unique_lesion_ids, otherwise
        'has_duplicates'.
    """
    if x in unique_lesion_ids:
        return 'no_duplicates'
    return 'has_duplicates'


# Create a new column by mapping every lesion_id through the function.
df_data['duplicates'] = df_data['lesion_id'].apply(identify_duplicates)

print('Remove duplicates')

# Last expression of the cell, so the preview is actually displayed.
df_data.head()


In [None]:
# Number of rows flagged 'no_duplicates' versus 'has_duplicates'.
duplicate_flags = df_data['duplicates']
duplicate_flags.value_counts()

In [None]:
# Keep only the rows flagged 'no_duplicates', i.e. images whose lesion has no
# other image.
is_unduplicated = df_data['duplicates'] == 'no_duplicates'
df = df_data[is_unduplicated]

df.shape

In [None]:
# Draw the validation set from df only: every row in df belongs to a lesion
# with a single image, so none of these images can have a duplicate in the
# train set.
from sklearn.model_selection import train_test_split

# Labels used to stratify the split.
y = df['dx']

# Hold out 17% of df, stratified by label; only the held-out part is kept.
split_parts = train_test_split(df, test_size=0.17, random_state=101, stratify=y)
_, df_val = split_parts

df_val.shape

In [None]:
# Per-class row counts of the validation split.
val_labels = df_val['dx']
val_labels.value_counts()

In [None]:
# Mark every row of df_data as belonging to the train or the val set.

# image_ids of the validation rows, built once as a set so every membership
# test is O(1) instead of rebuilding and scanning a list for each row.
val_image_ids = set(df_val['image_id'])


def identify_val_rows(x):
    """Tell whether an image belongs to the validation or the training set.

    Args:
        x: an image_id value; it is compared as ``str(x)``.

    Returns:
        'val' if str(x) is one of val_image_ids, otherwise 'train'.
    """
    if str(x) in val_image_ids:
        return 'val'
    return 'train'


# Create a new column by mapping every image_id through the function.
df_data['train_or_val'] = df_data['image_id'].apply(identify_val_rows)

# Keep the train rows.
df_train = df_data[df_data['train_or_val'] == 'train']

In [None]:
# Per-class row counts of the training split.
train_labels = df_train['dx']
train_labels.value_counts()

In [None]:
# Per-class row counts of the validation split, shown next to the train counts.
val_class_counts = df_val['dx'].value_counts()
val_class_counts

In [None]:
# Set image_id as the index of df_data so that a row can be looked up with
# df_data.loc[image_id, ...].
# Guarded so that re-running this cell does not raise a KeyError once the
# image_id column has already been moved into the index.
if df_data.index.name != 'image_id':
    df_data = df_data.set_index('image_id')

In [None]:
# Get a list of images in each of the two folders
import shutil

part_1_dir = base_dir+skin_cancer_files+'/HAM_images_part_1'
part_2_dir = base_dir+skin_cancer_files+'/HAM_images_part_2'
folder_1 = os.listdir(part_1_dir)
folder_2 = os.listdir(part_2_dir)

# Get a list of train and val images
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])

# (directory, file names in it) pairs, in the order they are searched.
# Sets make the per-image membership test O(1) instead of a list scan.
image_sources = [(part_1_dir, set(folder_1)), (part_2_dir, set(folder_2))]


def _copy_images_by_class(image_ids, split_dir, sources):
    """Copy images into ``split_dir/<label>/``, the label being the 'dx' value.

    Args:
        image_ids: iterable of image_id strings (file name without '.jpg').
        split_dir: destination root that holds one folder per label.
        sources: sequence of (directory, set of file names) pairs, checked in
            order. An image present in several sources is copied from each in
            turn, so the last one wins. An image found in none is skipped.
    """
    for image in image_ids:
        fname = image + '.jpg'
        # df_data is indexed by image_id at this point.
        label = df_data.loc[image, 'dx']

        for src_dir, names in sources:
            if fname in names:
                src = os.path.join(src_dir, fname)
                dst = os.path.join(split_dir, label, fname)
                shutil.copyfile(src, dst)


print('Organize Images by Skin Cancer Class')

# Transfer the train images, then the val images
_copy_images_by_class(train_list, train_dir, image_sources)
_copy_images_by_class(val_list, val_dir, image_sources)

# Check how many train images we now have in each folder.
print('Images by Class')
for class_label in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
    print(class_label + ': ' + str(len(os.listdir(train_dir + '/' + class_label))))

In [None]:
# note that we are not augmenting class 'nv'
class_list = ['mel','bkl','bcc','akiec','vasc','df']

# Number of images we want to have in each class. Copies are made in whole
# passes over a class, so the final count can end up above this number.
num_aug_images_wanted = 5000

print('Augment Images By Class')

# A leftover data_dir/aug_dir staging directory (e.g. from an interrupted
# earlier run of this notebook) must not end up in the packaged data set.
stale_aug_dir = data_dir + '/aug_dir'
if os.path.exists(stale_aug_dir):
    shutil.rmtree(stale_aug_dir)

for img_class in class_list:

    class_dir = train_dir + '/' + img_class

    # Snapshot the original images of this class. Files produced by an earlier
    # run of this cell (prefix 'AUG_') are ignored, so re-running the cell
    # overwrites the same copies instead of multiplying them. Sorting makes the
    # AUG_<n>_ numbering independent of directory listing order.
    originals = sorted(f for f in os.listdir(class_dir) if not f.startswith('AUG_'))

    num_files = len(originals)
    if num_files == 0:
        # Nothing to copy, and the division below would fail.
        print('No images found for class ' + img_class + ', skipping')
        continue

    num_batches = int(np.ceil(num_aug_images_wanted / num_files))

    # The originals count as the first batch, hence range(1, num_batches).
    # The copies are byte-identical (shutil.copyfile); no image transformation
    # is applied here.
    j = 0
    for _batch in range(1, num_batches):
        for fname in originals:
            # source path to image
            src = os.path.join(class_dir, fname)
            # destination path to image; j keeps every copy name unique
            dst = os.path.join(class_dir, 'AUG_' + str(j) + '_' + fname)
            shutil.copyfile(src, dst)
            j = j + 1

In [None]:
# Report the size of every class folder in the training set again.
# The counts now cover the original images plus the AUG_ copies.
print('Images by Class After Augmentation')
for class_label in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df'):
    class_path = train_dir + '/' + class_label
    print(class_label + ': ' + str(len(os.listdir(class_path))))

In [None]:
from PIL import Image

# Class names are the sub-directories of train_dir; sorting keeps the integer
# label assigned to each class stable.
class_names = sorted([x for x in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, x))])
num_class = len(class_names)

# image_files[i] holds the path of every file in the folder of class i.
image_files = [[os.path.join(train_dir, class_name, x)
                for x in os.listdir(os.path.join(train_dir, class_name))]
               for class_name in class_names]

# Flat, parallel lists: image_file_list[n] is a path and image_label_list[n]
# is the index of its class in class_names.
image_file_list = []
image_label_list = []

for i, class_name in enumerate(class_names):
    image_file_list.extend(image_files[i])
    image_label_list.extend([i] * len(image_files[i]))
num_total = len(image_label_list)

# Read the dimensions of the first image. The context manager closes the file
# handle that Image.open would otherwise leave open.
with Image.open(image_file_list[0]) as first_image:
    image_width, image_height = first_image.size

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

print('')
print('Sample of Training images')

# 3 x 3 grid of images drawn at random from the flat training file list.
fig, axes = plt.subplots(3, 3, figsize=(8, 8))
sample_indices = np.random.randint(num_total, size=9)
for ax, k in zip(axes.flat, sample_indices):
    # Load the pixels inside a context manager so every file handle is closed
    # as soon as the array has been built.
    with Image.open(image_file_list[k]) as im:
        arr = np.array(im)
    # The class name of the sampled image is shown under its panel.
    ax.set_xlabel(class_names[image_label_list[k]])
    ax.imshow(arr, vmin=0, vmax=255)
fig.tight_layout()
plt.show()

# Summary of the training set built by the previous cells.
print('')
print('Total image count:', num_total)
print('Image dimensions:', image_width, "x", image_height)
print('Label names:', class_names)
print('Label counts:', [len(image_files[i]) for i in range(num_class)])
print('')

In [None]:
print('Compressing transformed HAM10000 data set.')

# Package base_dir/HAM10000 as base_dir/HAM10000.tar.gz, with archive members
# stored under 'HAM10000/'. The paths are derived from base_dir, like the
# cleanup and data_dir definitions earlier in the notebook, instead of
# hard-coding '..' in a shell command.
shutil.make_archive(
    os.path.join(base_dir, 'HAM10000'),   # archive path without the extension
    'gztar',
    root_dir=os.path.abspath(base_dir),   # directory the member names are relative to
    base_dir='HAM10000',                  # sub-directory of root_dir to archive
)

print('Training dataset transformation complete.')