{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.\n", "# SPDX-License-Identifier: MIT-0\n", "\n", "import os\n", "import shutil\n", "from pathlib import Path\n", "from dotenv import load_dotenv\n", "env_path = Path('..') / 'set.env'\n", "load_dotenv(dotenv_path=env_path)\n", "\n", "skin_cancer_bucket=os.environ.get('SKIN_CANCER_BUCKET')\n", "skin_cancer_bucket_path=os.environ.get('SKIN_CANCER_BUCKET_PATH')\n", "skin_cancer_files=os.environ.get('SKIN_CANCER_FILES')\n", "skin_cancer_files_ext=os.environ.get('SKIN_CANCER_FILES_EXT')\n", "base_dir = os.environ.get('BASE_DIR')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import boto3\n", "\n", "#cleanup previous runs\n", "if os.path.exists(os.path.join(base_dir,skin_cancer_files)):\n", " shutil.rmtree(base_dir+skin_cancer_files)\n", " \n", "if os.path.exists(os.path.join(base_dir,skin_cancer_files_ext)):\n", " os.remove(os.path.join(base_dir,skin_cancer_files_ext)) \n", "\n", "data_dir = os.path.join(base_dir,'HAM10000')\n", "\n", "if os.path.exists(os.path.join(base_dir,'HAM10000.tar.gz')):\n", " os.remove(os.path.join(base_dir,'HAM10000.tar.gz'))\n", "\n", "if os.path.exists(data_dir):\n", " shutil.rmtree(data_dir)\n", " \n", "s3 = boto3.client('s3')\n", "s3.download_file(skin_cancer_bucket, skin_cancer_bucket_path+'/'+skin_cancer_files_ext,base_dir+skin_cancer_files_ext)\n", "\n", "print('Download training data set from '+skin_cancer_bucket)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torchtext\n", "from numpy.random import seed\n", "seed(101)\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "\n", "os.mkdir(base_dir+skin_cancer_files)\n", "os.mkdir(base_dir+skin_cancer_files+'/HAM_images_part_1')\n", "os.mkdir(base_dir+skin_cancer_files+'/HAM_images_part_2')\n", "\n", "print('Uncompress data set for transformation')\n", "\n", "torchtext.utils.extract_archive(base_dir+skin_cancer_files_ext, base_dir+skin_cancer_files)\n", "torchtext.utils.extract_archive(base_dir+skin_cancer_files+'/HAM10000_images_part_1.zip', base_dir+skin_cancer_files+'/HAM_images_part_1')\n", "torchtext.utils.extract_archive(base_dir+skin_cancer_files+'/HAM10000_images_part_2.zip', base_dir+skin_cancer_files+'/HAM_images_part_2')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# now we create 7 folders inside 'base_dir':\n", "os.mkdir(data_dir)\n", "\n", "# train_dir\n", " # nv\n", " # mel\n", " # bkl\n", " # bcc\n", " # akiec\n", " # vasc\n", " # df\n", " \n", "# val_dir\n", " # nv\n", " # mel\n", " # bkl\n", " # bcc\n", " # akiec\n", " # vasc\n", " # df\n", "\n", "# create a path to 'base_dir' to which we will join the names of the new folders\n", "# train_dir\n", "train_dir = os.path.join(data_dir, 'train_dir')\n", "os.mkdir(train_dir)\n", "\n", "# val_dir\n", "val_dir = os.path.join(data_dir, 'val_dir')\n", "os.mkdir(val_dir)\n", "\n", "print('Create training and validation dir under HAM10000')\n", "\n", "# Inside each folder we create seperate folders for each class\n", "\n", "# create new folders inside train_dir\n", "nv = os.path.join(train_dir, 'nv')\n", "os.mkdir(nv)\n", "mel = os.path.join(train_dir, 'mel')\n", "os.mkdir(mel)\n", "bkl = os.path.join(train_dir, 'bkl')\n", "os.mkdir(bkl)\n", "bcc = os.path.join(train_dir, 'bcc')\n", "os.mkdir(bcc)\n", "akiec = os.path.join(train_dir, 'akiec')\n", "os.mkdir(akiec)\n", "vasc = os.path.join(train_dir, 'vasc')\n", "os.mkdir(vasc)\n", "df = os.path.join(train_dir, 'df')\n", "os.mkdir(df)\n", "\n", "# create new folders inside val_dir\n", "nv = os.path.join(val_dir, 'nv')\n", "os.mkdir(nv)\n", "mel = os.path.join(val_dir, 'mel')\n", "os.mkdir(mel)\n", "bkl = os.path.join(val_dir, 'bkl')\n", "os.mkdir(bkl)\n", "bcc = os.path.join(val_dir, 'bcc')\n", "os.mkdir(bcc)\n", "akiec = os.path.join(val_dir, 'akiec')\n", "os.mkdir(akiec)\n", "vasc = os.path.join(val_dir, 'vasc')\n", "os.mkdir(vasc)\n", "df = os.path.join(val_dir, 'df')\n", "os.mkdir(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_data = pd.read_csv(base_dir+skin_cancer_files+'/HAM10000_metadata')\n", "\n", "df_data.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# this will tell us how many images are associated with each lesion_id\n", "df = df_data.groupby('lesion_id').count()\n", "\n", "# now we filter out lesion_id's that have only one image associated with it\n", "df = df[df['image_id'] == 1]\n", "\n", "df.reset_index(inplace=True)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# here we identify lesion_id's that have duplicate images and those that have only\n", "# one image.\n", "\n", "def identify_duplicates(x):\n", " \n", " unique_list = list(df['lesion_id'])\n", " \n", " if x in unique_list:\n", " return 'no_duplicates'\n", " else:\n", " return 'has_duplicates'\n", " \n", "# create a new colum that is a copy of the lesion_id column\n", "df_data['duplicates'] = df_data['lesion_id']\n", "# apply the function to this new column\n", "df_data['duplicates'] = df_data['duplicates'].apply(identify_duplicates)\n", "\n", "df_data.head()\n", "\n", "print('Remove duplicates')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_data['duplicates'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# now we filter out images that don't have duplicates\n", "df = df_data[df_data['duplicates'] == 'no_duplicates']\n", "\n", "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#now we create a val set using df because we are sure that none of these images\n", "# have augmented duplicates in the train set\n", "from sklearn.model_selection import train_test_split\n", "\n", "y = df['dx']\n", "\n", "_, df_val = train_test_split(df, test_size=0.17, random_state=101, stratify=y)\n", "\n", "df_val.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_val['dx'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This function identifies if an image is part of the train\n", "# or val set.\n", "def identify_val_rows(x):\n", " # create a list of all the lesion_id's in the val set\n", " val_list = list(df_val['image_id'])\n", " \n", " if str(x) in val_list:\n", " return 'val'\n", " else:\n", " return 'train'\n", "\n", "# identify train and val rows\n", "\n", "# create a new colum that is a copy of the image_id column\n", "df_data['train_or_val'] = df_data['image_id']\n", "# apply the function to this new column\n", "df_data['train_or_val'] = df_data['train_or_val'].apply(identify_val_rows)\n", " \n", "# filter out train rows\n", "df_train = df_data[df_data['train_or_val'] == 'train']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_train['dx'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_val['dx'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Set the image_id as the index in df_data\n", "df_data.set_index('image_id', inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Get a list of images in each of the two folders\n", "import shutil\n", "\n", "folder_1 = os.listdir(base_dir+skin_cancer_files+'/HAM_images_part_1')\n", "folder_2 = os.listdir(base_dir+skin_cancer_files+'/HAM_images_part_2')\n", "\n", "# Get a list of train and val images\n", "train_list = list(df_train['image_id'])\n", "val_list = list(df_val['image_id'])\n", "\n", "\n", "\n", "# Transfer the train images\n", "print('Organize Images by Skin Cancer Class')\n", "\n", "for image in train_list:\n", " \n", " fname = image + '.jpg'\n", " label = df_data.loc[image,'dx']\n", " \n", " if fname in folder_1:\n", " # source path to image\n", " src = os.path.join(base_dir+skin_cancer_files+'/HAM_images_part_1', fname)\n", " # destination path to image\n", " dst = os.path.join(train_dir, label, fname)\n", " # copy the image from the source to the destination\n", " shutil.copyfile(src, dst)\n", "\n", " if fname in folder_2:\n", " # source path to image\n", " src = os.path.join(base_dir+skin_cancer_files+'/HAM_images_part_2', fname)\n", " # destination path to image\n", " dst = os.path.join(train_dir, label, fname)\n", " # copy the image from the source to the destination\n", " shutil.copyfile(src, dst)\n", " \n", "# Transfer the val images\n", "\n", "for image in val_list:\n", " \n", " fname = image + '.jpg'\n", " label = df_data.loc[image,'dx']\n", " \n", " if fname in folder_1:\n", " # source path to image\n", " src = os.path.join(base_dir+skin_cancer_files+'/HAM_images_part_1', fname)\n", " # destination path to image\n", " dst = os.path.join(val_dir, label, fname)\n", " # copy the image from the source to the destination\n", " shutil.copyfile(src, dst)\n", "\n", " if fname in folder_2:\n", " # source path to image\n", " src = os.path.join(base_dir+skin_cancer_files+'/HAM_images_part_2', fname)\n", " # destination path to image\n", " dst = os.path.join(val_dir, label, fname)\n", " # copy the image from the source to the destination\n", " shutil.copyfile(src, dst)\n", " \n", "# Check how many train images we now have in each folder.\n", "print('Images by Class')\n", "print('nv: '+str(len(os.listdir(train_dir +'/nv'))))\n", "print('mel: '+str(len(os.listdir(train_dir +'/mel'))))\n", "print('bkl: '+str(len(os.listdir(train_dir +'/bkl'))))\n", "print('bcc: '+str(len(os.listdir(train_dir +'/bcc'))))\n", "print('akiec: '+str(len(os.listdir(train_dir +'/akiec'))))\n", "print('vasc: '+str(len(os.listdir(train_dir +'/vasc'))))\n", "print('df: '+str(len(os.listdir(train_dir +'/df'))))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# note that we are not augmenting class 'nv'\n", "class_list = ['mel','bkl','bcc','akiec','vasc','df']\n", "\n", "print('Augment Images By Class')\n", "\n", "for item in class_list:\n", " \n", " # We are creating temporary directories here because we delete these directories later\n", " # create a base dir\n", " aug_dir = data_dir + '/aug_dir'\n", " os.mkdir(aug_dir)\n", " # create a dir within the base dir to store images of the same class\n", " img_dir = os.path.join(aug_dir, 'img_dir')\n", " os.mkdir(img_dir)\n", "\n", " # Choose a class\n", " img_class = item\n", "\n", " # list all images in that directory\n", " img_list = os.listdir(train_dir + '/'+ img_class)\n", "\n", " # Copy images from the class train dir to the img_dir e.g. class 'mel'\n", " for fname in img_list:\n", " # source path to image\n", " src = os.path.join(train_dir + '/' + img_class, fname)\n", " # destination path to image\n", " dst = os.path.join(img_dir,fname)\n", " # copy the image from the source to the destination\n", " shutil.copyfile(src, dst)\n", " \n", " # list all images in that directory\n", " aug_list = os.listdir(img_dir)\n", " \n", " num_aug_images_wanted = 5000 # total number of images we want to have in each class\n", " num_files = len(os.listdir(img_dir))\n", " num_batches = int(np.ceil((num_aug_images_wanted/num_files)))\n", " \n", " j = 0\n", " for i in range(1,num_batches):\n", " for fname in aug_list:\n", " # source path to image\n", " src = os.path.join(img_dir, fname)\n", " # destination path to image\n", " dst = os.path.join(train_dir + '/' + img_class, 'AUG_' + str(j) + '_'+ fname)\n", " # copy the image from the source to the destination\n", " shutil.copyfile(src, dst)\n", " j = j + 1\n", " \n", " shutil.rmtree(aug_dir)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check how many train images we now have in each folder.\n", "# This is the original images plus the augmented images.\n", "print('Images by Class After Augmentation')\n", "print('nv: '+str(len(os.listdir(train_dir +'/nv'))))\n", "print('mel: '+str(len(os.listdir(train_dir +'/mel'))))\n", "print('bkl: '+str(len(os.listdir(train_dir +'/bkl'))))\n", "print('bcc: '+str(len(os.listdir(train_dir +'/bcc'))))\n", "print('akiec: '+str(len(os.listdir(train_dir +'/akiec'))))\n", "print('vasc: '+str(len(os.listdir(train_dir +'/vasc'))))\n", "print('df: '+str(len(os.listdir(train_dir +'/df'))))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from PIL import Image\n", "\n", "class_names = sorted([x for x in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, x))])\n", "num_class = len(class_names)\n", "image_files = [[os.path.join(train_dir, class_name, x) \n", " for x in os.listdir(os.path.join(train_dir, class_name))] \n", " for class_name in class_names]\n", "image_file_list = []\n", "image_label_list = []\n", "\n", "for i, class_name in enumerate(class_names):\n", " image_file_list.extend(image_files[i])\n", " image_label_list.extend([i] * len(image_files[i]))\n", "num_total = len(image_label_list)\n", "image_width, image_height = Image.open(image_file_list[0]).size" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "print('')\n", "print('Sample of Training images')\n", "plt.subplots(3, 3, figsize=(8, 8))\n", "for i,k in enumerate(np.random.randint(num_total, size=9)):\n", " im = Image.open(image_file_list[k])\n", " arr = np.array(im)\n", " #print(arr.shape)\n", " plt.subplot(3, 3, i + 1)\n", " plt.xlabel(class_names[image_label_list[k]])\n", " plt.imshow(arr, vmin=0, vmax=255)\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "print('')\n", "print('Total image count:', num_total)\n", "print('Image dimensions:', image_width, \"x\", image_height)\n", "print('Label names:', class_names)\n", "print('Label counts:', [len(image_files[i]) for i in range(num_class)])\n", "print('')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('Compressing transformed HAM10000 data set.')\n", "\n", "!tar -czf ../HAM10000.tar.gz ../HAM10000\n", "\n", "print('Training dataset transformation complete.')" ] } ], "metadata": { "kernelspec": { "display_name": "conda_pytorch_latest_p36", "language": "python", "name": "conda_pytorch_latest_p36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 4 }