{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Labeling and Modeling DICOM images using Amazon SageMaker\n",
    "\n",
    "This notebook walks through the output of from SageMaker GroundTruth job of labeling and annotating DICOM images. The output manifest file of DICOM labeling job contains reference to labels and annotation performed on DICOM images and saved in S3 bucket. This notebook demonstrates using output manifest file from SageMaker GroundTruth job and building a model. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get the output manifest file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import os\n",
    "\n",
    "JOBNAME = '<LABELING JOB NAME>' #Replace it with the labeling job name\n",
    "REGION = '<REGION>' #Replace it with the job region\n",
    "client = boto3.client('sagemaker',region_name=REGION)\n",
    "response = client.describe_labeling_job(LabelingJobName= JOBNAME)\n",
    "file = response['LabelingJobOutput']['OutputDatasetS3Uri']\n",
    "output_manifest = os.path.basename(file)\n",
    "!aws s3 cp $file ./"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read manifest file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "\n",
    "extracted_labels = []\n",
    "\n",
    "output_manifest = pd.read_json(\"output-manifest.manifest\", lines=True)\n",
    "def extract_label(x):\n",
    "    labels = x['dicom-label-job']['annotationsFromAllWorkers'][0]['annotationData']['content']['labels']['label']\n",
    "    extracted_labels.append(labels) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_manifest.apply(extract_label, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_manifest['label'] = extracted_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = output_manifest.drop(['labels', 'dicom-label-job', 'dicom-label-job-metadata'], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load DICOM images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import pydicom\n",
    "from pydicom.filebase import DicomBytesIO\n",
    "from pydicom.filereader import dcmread\n",
    "from PIL import Image\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "images = []\n",
    "\n",
    "def get_dicom_image(x):\n",
    "    bucket = x['source-ref'].split(\"/\")[2]\n",
    "    prefixes = x['source-ref'].split(\"/\")\n",
    "    for i in range(3):\n",
    "        prefixes.pop(0)\n",
    "    key = \"/\".join(prefixes)\n",
    "    fileobj = s3.get_object(Bucket=bucket, Key=key)\n",
    "    dicom_data = fileobj['Body'].read()\n",
    "    dicom_bytes = DicomBytesIO(dicom_data)\n",
    "    ds = dcmread(dicom_bytes)\n",
    "    img = Image.fromarray(ds.pixel_array)\n",
    "    images.append(img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.apply(get_dicom_image, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resized_images = []\n",
    "def imresize(arr, size, resample):\n",
    "    resized_images.append(np.array(Image.fromarray(arr).resize(size, resample)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for img in images:\n",
    "    imresize(np.array(img), (224, 224), resample  = Image.NEAREST)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "resized_images = np.array(resized_images).reshape((-1, 224, 224, 1))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train model using GroundTruth Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = np.array(resized_images)\n",
    "y = df['label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.utils import class_weight\n",
    "label_encoder = LabelEncoder()\n",
    "y = label_encoder.fit_transform(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "cls_weight = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.preprocessing.image import ImageDataGenerator\n",
    "\n",
    "datagen = ImageDataGenerator(\n",
    "        featurewise_center=False, \n",
    "        samplewise_center=False,  \n",
    "        featurewise_std_normalization=False,\n",
    "        samplewise_std_normalization=False, \n",
    "        zca_whitening=False,  \n",
    "        rotation_range=10,  \n",
    "        zoom_range = 0.1, \n",
    "        width_shift_range=0.1, \n",
    "        height_shift_range=0.1,\n",
    "        horizontal_flip=False, \n",
    "        vertical_flip=False) \n",
    "\n",
    "datagen.fit(x_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.utils.np_utils import to_categorical \n",
    "y_train = to_categorical(y_train, num_classes=14)\n",
    "y_test = to_categorical(y_test, num_classes=14)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import keras\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D\n",
    "from keras import backend as K\n",
    "\n",
    "model = tf.keras.models.Sequential()\n",
    "model.add(tf.keras.layers.Conv2D(32, kernel_size=(5, 5), activation='relu', input_shape=(224,224,1)))\n",
    "model.add(tf.keras.layers.BatchNormalization())\n",
    "model.add(tf.keras.layers.Dropout(0.40))\n",
    "model.add(tf.keras.layers.Conv2D(64, (5, 5), activation='relu'))\n",
    "model.add(tf.keras.layers.BatchNormalization())\n",
    "model.add(tf.keras.layers.MaxPool2D(pool_size=(2, 2)))\n",
    "model.add(tf.keras.layers.Flatten())\n",
    "model.add(tf.keras.layers.Dense(128, activation='relu'))\n",
    "model.add(tf.keras.layers.Dropout(0.40))\n",
    "model.add(tf.keras.layers.Dense(14, activation='softmax'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "optimizer = tf.keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)\n",
    "model.compile(optimizer = optimizer , loss = \"categorical_crossentropy\", metrics=[\"accuracy\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.callbacks import ReduceLROnPlateau\n",
    "\n",
    "learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy', \n",
    "                                            patience=10, \n",
    "                                            verbose=1, \n",
    "                                            factor=0.5, \n",
    "                                            min_lr=0.001)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer='adam',\n",
    "              loss='categorical_crossentropy',\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "epochs = 50\n",
    "batch_size = 10\n",
    "history = model.fit_generator(datagen.flow(x_train,y_train, batch_size=10),\n",
    "                              epochs = epochs, validation_data = (x_test, y_test),\n",
    "                              verbose = 1, class_weight = cls_weight,steps_per_epoch=x_train.shape[0]/batch_size\n",
    "                              ,callbacks=[learning_rate_reduction])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_tensorflow2_p36",
   "language": "python",
   "name": "conda_tensorflow2_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}