{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.\n",
    "# SPDX-License-Identifier: MIT-0\n",
    "\n",
    "import os\n",
    "import shutil\n",
    "from pathlib import Path\n",
    "from dotenv import load_dotenv\n",
    "env_path = Path('..') / 'set.env'\n",
    "load_dotenv(dotenv_path=env_path)\n",
    "\n",
    "skin_cancer_bucket=os.environ.get('SKIN_CANCER_BUCKET')\n",
    "skin_cancer_bucket_path=os.environ.get('SKIN_CANCER_BUCKET_PATH')\n",
    "skin_cancer_files=os.environ.get('SKIN_CANCER_FILES')\n",
    "skin_cancer_files_ext=os.environ.get('SKIN_CANCER_FILES_EXT')\n",
    "base_dir = os.environ.get('BASE_DIR')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "\n",
    "#cleanup previous runs\n",
    "if os.path.exists(os.path.join(base_dir,skin_cancer_files)):\n",
    "    shutil.rmtree(base_dir+skin_cancer_files)\n",
    "    \n",
    "if os.path.exists(os.path.join(base_dir,skin_cancer_files_ext)):\n",
    "    os.remove(os.path.join(base_dir,skin_cancer_files_ext))    \n",
    "\n",
    "data_dir = os.path.join(base_dir,'HAM10000')\n",
    "\n",
    "if os.path.exists(os.path.join(base_dir,'HAM10000.tar.gz')):\n",
    "    os.remove(os.path.join(base_dir,'HAM10000.tar.gz'))\n",
    "\n",
    "if os.path.exists(data_dir):\n",
    "    shutil.rmtree(data_dir)\n",
    "    \n",
    "s3 = boto3.client('s3')\n",
    "s3.download_file(skin_cancer_bucket, skin_cancer_bucket_path+'/'+skin_cancer_files_ext,base_dir+skin_cancer_files_ext)\n",
    "\n",
    "print('Download training data set from '+skin_cancer_bucket)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torchtext\n",
    "from numpy.random import seed\n",
    "seed(101)\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "os.mkdir(base_dir+skin_cancer_files)\n",
    "os.mkdir(base_dir+skin_cancer_files+'/HAM_images_part_1')\n",
    "os.mkdir(base_dir+skin_cancer_files+'/HAM_images_part_2')\n",
    "\n",
    "print('Uncompress data set for transformation')\n",
    "\n",
    "torchtext.utils.extract_archive(base_dir+skin_cancer_files_ext, base_dir+skin_cancer_files)\n",
    "torchtext.utils.extract_archive(base_dir+skin_cancer_files+'/HAM10000_images_part_1.zip', base_dir+skin_cancer_files+'/HAM_images_part_1')\n",
    "torchtext.utils.extract_archive(base_dir+skin_cancer_files+'/HAM10000_images_part_2.zip', base_dir+skin_cancer_files+'/HAM_images_part_2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now we create 7 folders inside 'base_dir':\n",
    "os.mkdir(data_dir)\n",
    "\n",
    "# train_dir\n",
    "    # nv\n",
    "    # mel\n",
    "    # bkl\n",
    "    # bcc\n",
    "    # akiec\n",
    "    # vasc\n",
    "    # df\n",
    " \n",
    "# val_dir\n",
    "    # nv\n",
    "    # mel\n",
    "    # bkl\n",
    "    # bcc\n",
    "    # akiec\n",
    "    # vasc\n",
    "    # df\n",
    "\n",
    "# create a path to 'base_dir' to which we will join the names of the new folders\n",
    "# train_dir\n",
    "train_dir = os.path.join(data_dir, 'train_dir')\n",
    "os.mkdir(train_dir)\n",
    "\n",
    "# val_dir\n",
    "val_dir = os.path.join(data_dir, 'val_dir')\n",
    "os.mkdir(val_dir)\n",
    "\n",
    "print('Create training and validation dir under HAM10000')\n",
    "\n",
    "# Inside each folder we create seperate folders for each class\n",
    "\n",
    "# create new folders inside train_dir\n",
    "nv = os.path.join(train_dir, 'nv')\n",
    "os.mkdir(nv)\n",
    "mel = os.path.join(train_dir, 'mel')\n",
    "os.mkdir(mel)\n",
    "bkl = os.path.join(train_dir, 'bkl')\n",
    "os.mkdir(bkl)\n",
    "bcc = os.path.join(train_dir, 'bcc')\n",
    "os.mkdir(bcc)\n",
    "akiec = os.path.join(train_dir, 'akiec')\n",
    "os.mkdir(akiec)\n",
    "vasc = os.path.join(train_dir, 'vasc')\n",
    "os.mkdir(vasc)\n",
    "df = os.path.join(train_dir, 'df')\n",
    "os.mkdir(df)\n",
    "\n",
    "# create new folders inside val_dir\n",
    "nv = os.path.join(val_dir, 'nv')\n",
    "os.mkdir(nv)\n",
    "mel = os.path.join(val_dir, 'mel')\n",
    "os.mkdir(mel)\n",
    "bkl = os.path.join(val_dir, 'bkl')\n",
    "os.mkdir(bkl)\n",
    "bcc = os.path.join(val_dir, 'bcc')\n",
    "os.mkdir(bcc)\n",
    "akiec = os.path.join(val_dir, 'akiec')\n",
    "os.mkdir(akiec)\n",
    "vasc = os.path.join(val_dir, 'vasc')\n",
    "os.mkdir(vasc)\n",
    "df = os.path.join(val_dir, 'df')\n",
    "os.mkdir(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_data = pd.read_csv(base_dir+skin_cancer_files+'/HAM10000_metadata')\n",
    "\n",
    "df_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# this will tell us how many images are associated with each lesion_id\n",
    "df = df_data.groupby('lesion_id').count()\n",
    "\n",
    "# now we filter out lesion_id's that have only one image associated with it\n",
    "df = df[df['image_id'] == 1]\n",
    "\n",
    "df.reset_index(inplace=True)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# here we identify lesion_id's that have duplicate images and those that have only\n",
    "# one image.\n",
    "\n",
    "def identify_duplicates(x):\n",
    "    \n",
    "    unique_list = list(df['lesion_id'])\n",
    "    \n",
    "    if x in unique_list:\n",
    "        return 'no_duplicates'\n",
    "    else:\n",
    "        return 'has_duplicates'\n",
    "    \n",
    "# create a new colum that is a copy of the lesion_id column\n",
    "df_data['duplicates'] = df_data['lesion_id']\n",
    "# apply the function to this new column\n",
    "df_data['duplicates'] = df_data['duplicates'].apply(identify_duplicates)\n",
    "\n",
    "df_data.head()\n",
    "\n",
    "print('Remove duplicates')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_data['duplicates'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now we filter out images that don't have duplicates\n",
    "df = df_data[df_data['duplicates'] == 'no_duplicates']\n",
    "\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#now we create a val set using df because we are sure that none of these images\n",
    "# have augmented duplicates in the train set\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "y = df['dx']\n",
    "\n",
    "_, df_val = train_test_split(df, test_size=0.17, random_state=101, stratify=y)\n",
    "\n",
    "df_val.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_val['dx'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This function identifies if an image is part of the train\n",
    "# or val set.\n",
    "def identify_val_rows(x):\n",
    "    # create a list of all the lesion_id's in the val set\n",
    "    val_list = list(df_val['image_id'])\n",
    "    \n",
    "    if str(x) in val_list:\n",
    "        return 'val'\n",
    "    else:\n",
    "        return 'train'\n",
    "\n",
    "# identify train and val rows\n",
    "\n",
    "# create a new colum that is a copy of the image_id column\n",
    "df_data['train_or_val'] = df_data['image_id']\n",
    "# apply the function to this new column\n",
    "df_data['train_or_val'] = df_data['train_or_val'].apply(identify_val_rows)\n",
    "   \n",
    "# filter out train rows\n",
    "df_train = df_data[df_data['train_or_val'] == 'train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train['dx'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_val['dx'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Set the image_id as the index in df_data\n",
    "df_data.set_index('image_id', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Get a list of images in each of the two folders\n",
    "import shutil\n",
    "\n",
    "folder_1 = os.listdir(base_dir+skin_cancer_files+'/HAM_images_part_1')\n",
    "folder_2 = os.listdir(base_dir+skin_cancer_files+'/HAM_images_part_2')\n",
    "\n",
    "# Get a list of train and val images\n",
    "train_list = list(df_train['image_id'])\n",
    "val_list = list(df_val['image_id'])\n",
    "\n",
    "\n",
    "\n",
    "# Transfer the train images\n",
    "print('Organize Images by Skin Cancer Class')\n",
    "\n",
    "for image in train_list:\n",
    "    \n",
    "    fname = image + '.jpg'\n",
    "    label = df_data.loc[image,'dx']\n",
    "    \n",
    "    if fname in folder_1:\n",
    "        # source path to image\n",
    "        src = os.path.join(base_dir+skin_cancer_files+'/HAM_images_part_1', fname)\n",
    "        # destination path to image\n",
    "        dst = os.path.join(train_dir, label, fname)\n",
    "        # copy the image from the source to the destination\n",
    "        shutil.copyfile(src, dst)\n",
    "\n",
    "    if fname in folder_2:\n",
    "        # source path to image\n",
    "        src = os.path.join(base_dir+skin_cancer_files+'/HAM_images_part_2', fname)\n",
    "        # destination path to image\n",
    "        dst = os.path.join(train_dir, label, fname)\n",
    "        # copy the image from the source to the destination\n",
    "        shutil.copyfile(src, dst)\n",
    "        \n",
    "# Transfer the val images\n",
    "\n",
    "for image in val_list:\n",
    "    \n",
    "    fname = image + '.jpg'\n",
    "    label = df_data.loc[image,'dx']\n",
    "    \n",
    "    if fname in folder_1:\n",
    "        # source path to image\n",
    "        src = os.path.join(base_dir+skin_cancer_files+'/HAM_images_part_1', fname)\n",
    "        # destination path to image\n",
    "        dst = os.path.join(val_dir, label, fname)\n",
    "        # copy the image from the source to the destination\n",
    "        shutil.copyfile(src, dst)\n",
    "\n",
    "    if fname in folder_2:\n",
    "        # source path to image\n",
    "        src = os.path.join(base_dir+skin_cancer_files+'/HAM_images_part_2', fname)\n",
    "        # destination path to image\n",
    "        dst = os.path.join(val_dir, label, fname)\n",
    "        # copy the image from the source to the destination\n",
    "        shutil.copyfile(src, dst)\n",
    "        \n",
    "# Check how many train images we now have in each folder.\n",
    "print('Images by Class')\n",
    "print('nv: '+str(len(os.listdir(train_dir +'/nv'))))\n",
    "print('mel: '+str(len(os.listdir(train_dir +'/mel'))))\n",
    "print('bkl: '+str(len(os.listdir(train_dir +'/bkl'))))\n",
    "print('bcc: '+str(len(os.listdir(train_dir +'/bcc'))))\n",
    "print('akiec: '+str(len(os.listdir(train_dir +'/akiec'))))\n",
    "print('vasc: '+str(len(os.listdir(train_dir +'/vasc'))))\n",
    "print('df: '+str(len(os.listdir(train_dir +'/df'))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# note that we are not augmenting class 'nv'\n",
    "class_list = ['mel','bkl','bcc','akiec','vasc','df']\n",
    "\n",
    "print('Augment Images By Class')\n",
    "\n",
    "for item in class_list:\n",
    "    \n",
    "    # We are creating temporary directories here because we delete these directories later\n",
    "    # create a base dir\n",
    "    aug_dir = data_dir + '/aug_dir'\n",
    "    os.mkdir(aug_dir)\n",
    "    # create a dir within the base dir to store images of the same class\n",
    "    img_dir = os.path.join(aug_dir, 'img_dir')\n",
    "    os.mkdir(img_dir)\n",
    "\n",
    "    # Choose a class\n",
    "    img_class = item\n",
    "\n",
    "    # list all images in that directory\n",
    "    img_list = os.listdir(train_dir + '/'+ img_class)\n",
    "\n",
    "    # Copy images from the class train dir to the img_dir e.g. class 'mel'\n",
    "    for fname in img_list:\n",
    "        # source path to image\n",
    "        src = os.path.join(train_dir + '/' + img_class, fname)\n",
    "        # destination path to image\n",
    "        dst = os.path.join(img_dir,fname)\n",
    "        # copy the image from the source to the destination\n",
    "        shutil.copyfile(src, dst)\n",
    "        \n",
    "    # list all images in that directory\n",
    "    aug_list = os.listdir(img_dir)\n",
    "    \n",
    "    num_aug_images_wanted = 5000 # total number of images we want to have in each class\n",
    "    num_files = len(os.listdir(img_dir))\n",
    "    num_batches = int(np.ceil((num_aug_images_wanted/num_files)))\n",
    "    \n",
    "    j = 0\n",
    "    for i in range(1,num_batches):\n",
    "        for fname in aug_list:\n",
    "            # source path to image\n",
    "            src = os.path.join(img_dir, fname)\n",
    "            # destination path to image\n",
    "            dst = os.path.join(train_dir + '/' + img_class, 'AUG_' + str(j) + '_'+ fname)\n",
    "            # copy the image from the source to the destination\n",
    "            shutil.copyfile(src, dst)\n",
    "        j = j + 1\n",
    "            \n",
    "    shutil.rmtree(aug_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check how many train images we now have in each folder.\n",
    "# This is the original images plus the augmented images.\n",
    "print('Images by Class After Augmentation')\n",
    "print('nv: '+str(len(os.listdir(train_dir +'/nv'))))\n",
    "print('mel: '+str(len(os.listdir(train_dir +'/mel'))))\n",
    "print('bkl: '+str(len(os.listdir(train_dir +'/bkl'))))\n",
    "print('bcc: '+str(len(os.listdir(train_dir +'/bcc'))))\n",
    "print('akiec: '+str(len(os.listdir(train_dir +'/akiec'))))\n",
    "print('vasc: '+str(len(os.listdir(train_dir +'/vasc'))))\n",
    "print('df: '+str(len(os.listdir(train_dir +'/df'))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "\n",
    "class_names = sorted([x for x in os.listdir(train_dir) if os.path.isdir(os.path.join(train_dir, x))])\n",
    "num_class = len(class_names)\n",
    "image_files = [[os.path.join(train_dir, class_name, x) \n",
    "                for x in os.listdir(os.path.join(train_dir, class_name))] \n",
    "               for class_name in class_names]\n",
    "image_file_list = []\n",
    "image_label_list = []\n",
    "\n",
    "for i, class_name in enumerate(class_names):\n",
    "    image_file_list.extend(image_files[i])\n",
    "    image_label_list.extend([i] * len(image_files[i]))\n",
    "num_total = len(image_label_list)\n",
    "image_width, image_height = Image.open(image_file_list[0]).size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "print('')\n",
    "print('Sample of Training images')\n",
    "plt.subplots(3, 3, figsize=(8, 8))\n",
    "for i,k in enumerate(np.random.randint(num_total, size=9)):\n",
    "    im = Image.open(image_file_list[k])\n",
    "    arr = np.array(im)\n",
    "    #print(arr.shape)\n",
    "    plt.subplot(3, 3, i + 1)\n",
    "    plt.xlabel(class_names[image_label_list[k]])\n",
    "    plt.imshow(arr, vmin=0, vmax=255)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print('')\n",
    "print('Total image count:', num_total)\n",
    "print('Image dimensions:', image_width, \"x\", image_height)\n",
    "print('Label names:', class_names)\n",
    "print('Label counts:', [len(image_files[i]) for i in range(num_class)])\n",
    "print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Compressing transformed HAM10000 data set.')\n",
    "\n",
    "!tar -czf ../HAM10000.tar.gz ../HAM10000\n",
    "\n",
    "print('Training dataset transformation complete.')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_pytorch_latest_p36",
   "language": "python",
   "name": "conda_pytorch_latest_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}