In [None]:
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# REQUIRED: Set the pathnames which contain the `BBBC005_v1` dataset used for training the model.
# IMG_PATH is listed for the cell images and MASK_PATH for the matching masks; the next
# cell only checks that these two directories exist, it never creates them.
IMG_PATH = "/root/BBBC005_v1_images/"
MASK_PATH = "/root/BBBC005_v1_ground_truth/"
# Directory the trained model is saved to; the next cell creates it when it is missing.
MODEL_PATH = "/root/model/"

# Scale images to the specified height x width with a single channel by default.
# Keep height and width divisible by 16: the U-Net defined later halves the resolution
# four times and concatenates the upsampled tensors with the matching encoder tensors.
IMG_HEIGHT = 512
IMG_WIDTH = 512
IMG_CHANNELS = 1

# Specify the number of images reserved for testing the model
# (this many mask rows are sampled out of the training dataframe).
NUM_TEST_IMAGES = 100

# Number of images to display for inference testing
# (one figure row per image, so it must not exceed NUM_TEST_IMAGES).
DISPLAY_TEST_IMAGES = 4

In [None]:
import os

# Check whether the configured paths exist.
# The dataset directories are only reported when missing (they must be downloaded in the
# prerequisites step); the model directory is created on demand.
for dataset_path in (IMG_PATH, MASK_PATH):
    if not os.path.exists(dataset_path):
        print(dataset_path, "does not exist, please confirm datasets are correctly downloaded from the prerequisites step")

if not os.path.exists(MODEL_PATH):
    print(MODEL_PATH, "directory does not exist, creating directory ...")
    # makedirs (unlike mkdir) also creates missing parent directories, and exist_ok
    # keeps the call harmless if the directory appears between the check and the call.
    os.makedirs(MODEL_PATH, exist_ok=True)

In [None]:
# Install missing deps from the `TensorFlow 2.11.0 Python 3.9` instance
!pip install --upgrade pip
!pip install scikit-image

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from skimage.io import imread, imshow
from skimage.transform import resize

# Don't Show Warning Messages
import warnings
warnings.filterwarnings('ignore')

from keras.models import Model, load_model
from keras.layers import Input
from keras.layers.core import Dropout, Lambda
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.pooling import MaxPooling2D
from tensorflow.keras.layers import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K

import tensorflow as tf

In [None]:
# Confirm if a CPU, or GPU compatible device is available, then report the TensorFlow version.
# NOTE(review): this cell cites TensorFlow 2.12.0 as the tested version while the install
# cell names a `TensorFlow 2.11.0` image - confirm which version is intended.
visible_devices = tf.config.list_physical_devices()
print(visible_devices)
print(tf.__version__)  # This notebook runs on TensorFlow 2.12.0 as tested for the workshop

In [None]:
# Step 1 - Load the images and masks into a Pandas dataframe
# List both dataset directories; the combined array holds every image name first,
# followed by every mask name.
imgs = os.listdir(IMG_PATH)
masks = os.listdir(MASK_PATH)
# NOTE: `all` shadows the Python builtin of the same name; the name is kept so that
# existing references to it keep working.
all = np.append(arr=imgs, values=masks)

In [None]:
# Empty frame with one column per metadata field that is parsed from each filename.
metadata_columns = ["filename", "type", "well", "cells", "blur", "sample", "stain"]
df_imgs = pd.DataFrame(columns=metadata_columns)

In [None]:
# The image nomenclature was chosen to permit metadata to be organized in a plate layout. Each image follows the form
# SIMCEPImages_well_Ccells_Fblur_ssamples_wstain.TIF
# where the variables are as follows:
#
# well
#   The standard 384-well plate format is used where the rows are named A-P and the columns 1-24.
# cells
#   The number of cells simulated in the image (1-100).
# blur
#   The amount of focus blur applied (1-48). The focus blur was simulated by using MATLAB's imfilter
#   function with a rotationally symmetric Gaussian lowpass filter whose sigma is 0.25 × its diameter.
#   NOTE(review): the original comment was truncated after "0.25 ×"; presumably the diameter equals the
#   blur value - confirm against the BBBC005 dataset description.
# sample
#   Number of samples (1-25) for a given combination. Can be used to mimic the "site" number for each well.
# stain
#   1 = cell body stain, 2 = nuclei stain.

# Collect one record per valid filename and build the dataframe in a single step.
# Rebuilding `df_imgs` here (instead of appending to the existing frame row by row)
# keeps the cell idempotent on re-run and avoids quadratic dataframe growth.
records = []
for img_type, filenames in (("image", imgs), ("mask", masks)):
    for img in filenames:
        # Drop the extension, whatever its case, before splitting the name into fields.
        metadata = os.path.splitext(img)[0].split("_")

        # Skip any filenames which do not match our nomenclature
        if len(metadata) != 6:
            continue

        try:
            # Create a new row, format fields from filename
            new_row = {
                'filename': img,
                'type': img_type,
                'well': metadata[1],
                'cells': int(metadata[2].replace("C", "")),
                'blur': int(metadata[3].replace("F", "")),
                'sample': int(metadata[4].replace("s", "")),
                'stain': int(metadata[5].replace("w", "")),
            }
        except ValueError:
            # A numeric field could not be parsed, so the name does not follow the nomenclature.
            continue

        records.append(new_row)

df_imgs = pd.DataFrame(records, columns=["filename", "type", "well", "cells", "blur", "sample", "stain"])


In [None]:
# Preview the first 20 parsed rows to sanity-check the filename metadata.
df_imgs.head(n=20)

In [None]:
# Step 2 - Create a new dataframe for our masks and images
# NOTE(review): the filename nomenclature notes list stain 1 as the cell body stain and
# stain 2 as the nuclei stain, while the names and messages here say "nuclei" - confirm
# which stain is intended.
is_stain_one = df_imgs["stain"] == 1
df_nuclei_masks = df_imgs[(df_imgs["type"] == "mask") & is_stain_one]
df_nuclei_images = df_imgs[(df_imgs["type"] == "image") & is_stain_one]

# Confirm the dataframe shape for the number images and masks.
mask_shape = df_nuclei_masks.shape
image_shape = df_nuclei_images.shape
print("Image mask shape (nuclei stain) =>\t", mask_shape, "\nimage shape (nuclei stain) =>\t\t", image_shape)

In [None]:
# Step 3 - Split the data into training and validation
# Hold out NUM_TEST_IMAGES randomly chosen mask rows for testing, then preview them.
df_nuclei_masks_tests = df_nuclei_masks.sample(n=NUM_TEST_IMAGES)
df_nuclei_masks_tests.head(n=5)

In [None]:
# Remove our test images used for validation from the training mask dataframe
# (rows are matched by index label, so this must run before the test frame is re-indexed).
held_out_labels = df_nuclei_masks_tests.index
df_nuclei_masks = df_nuclei_masks.drop(index=held_out_labels)

In [None]:
# Confirm the shape for our test, and training dataset
test_shape, train_shape = df_nuclei_masks_tests.shape, df_nuclei_masks.shape
print(test_shape, train_shape)

In [None]:
# Display a random sample mask
# Draw a single row from the training masks and show its metadata.
row = df_nuclei_masks.sample(n=1)
row.head(n=5)

In [None]:
# Read the mask using skimage
# `row` holds exactly one sampled record, so take its filename value directly;
# `Series.to_string()` is a display formatter and its output may carry padding.
mask_filename = row["filename"].iloc[0]
mask = imread(os.path.join(MASK_PATH, mask_filename))
plt.imshow(mask, cmap='gray')

In [None]:
# Load the original data to confirm our training mask matches
# The image shares its filename with the sampled mask. Take the value directly rather
# than via `Series.to_string()`, a display formatter whose output may carry padding.
img_filename = row["filename"].iloc[0]
img = imread(os.path.join(IMG_PATH, img_filename))
plt.imshow(img)

In [None]:
# Step 4 - Resize and format images for model training
def format_img(dir, imgs, channels, dtype):
    """Load the named files from a directory into one resized array.

    Args:
        dir: Directory that contains the files.
        imgs: Sized iterable of filenames (the notebook passes a dataframe column).
        channels: Size of the last axis of the returned array.
        dtype: NumPy dtype of the returned array.

    Returns:
        Array of shape (len(imgs), IMG_HEIGHT, IMG_WIDTH, channels). Entries whose
        file is missing are reported on stdout and left as zeros.
    """
    batch = np.zeros((len(imgs), IMG_HEIGHT, IMG_WIDTH, channels), dtype=dtype)
    target_size = (IMG_HEIGHT, IMG_WIDTH)

    for position, name in enumerate(imgs):
        path = "{}/{}".format(dir, name)
        if not os.path.exists(path):
            print(path, " does not exist")
            continue

        # preserve_range keeps the original intensity scale instead of rescaling to [0, 1].
        scaled = resize(imread(path), target_size, mode='constant', preserve_range=True)
        # Add a trailing channel axis so the image fits the 4-D batch slot.
        batch[position] = np.expand_dims(scaled, axis=-1)

    return batch

In [None]:
# Process dataset for training (x)
# Every training mask row has an image with the same filename under IMG_PATH.
train_filenames = df_nuclei_masks["filename"]
df_nuclei_imgs_x = format_img(IMG_PATH, train_filenames, IMG_CHANNELS, np.uint8)
df_nuclei_imgs_x.shape

In [None]:
# Process masks for training (y)
# Masks are stored as booleans with a single channel. `np.bool_` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and raises AttributeError in NumPy 1.24+.
df_nuclei_masks_y = format_img(MASK_PATH, df_nuclei_masks["filename"], 1, np.bool_)
df_nuclei_masks_y.shape

In [None]:
# Test images
# Load the held-out *images* (same filenames as the held-out masks) from IMG_PATH;
# the previous version read them from MASK_PATH, which returned the masks instead.
df_nuclei_tests = format_img(IMG_PATH, df_nuclei_masks_tests["filename"], IMG_CHANNELS, np.uint8)
df_nuclei_tests.shape

In [None]:
# Step 5 - Configure Unet model
# Unet model source: https://www.kaggle.com/keegil/keras-u-net-starter-lb-0-277


def _double_conv(tensor, filters, dropout_rate):
    """Apply Conv2D -> Dropout -> Conv2D, each convolution using `filters` 3x3 kernels."""
    conv_options = dict(activation='elu', kernel_initializer='he_normal', padding='same')
    tensor = Conv2D(filters, (3, 3), **conv_options)(tensor)
    tensor = Dropout(dropout_rate)(tensor)
    return Conv2D(filters, (3, 3), **conv_options)(tensor)


def _upsample_and_merge(tensor, skip, filters):
    """Upsample by 2 with a transposed convolution, then concatenate `skip` on the last axis."""
    upsampled = Conv2DTranspose(filters, (2, 2), strides=(2, 2), padding='same')(tensor)
    return concatenate([upsampled, skip])


inputs = Input((IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS))

# Divide the pixel values by 255 inside the model itself.
s = Lambda(lambda x: x / 255)(inputs)

# Contracting path: each level doubles the filter count and halves the resolution.
c1 = _double_conv(s, 16, 0.1)
p1 = MaxPooling2D((2, 2))(c1)

c2 = _double_conv(p1, 32, 0.1)
p2 = MaxPooling2D((2, 2))(c2)

c3 = _double_conv(p2, 64, 0.2)
p3 = MaxPooling2D((2, 2))(c3)

c4 = _double_conv(p3, 128, 0.2)
p4 = MaxPooling2D((2, 2))(c4)

# Bottleneck.
c5 = _double_conv(p4, 256, 0.3)

# Expanding path: upsample, merge with the encoder output of the same resolution, convolve.
u6 = _upsample_and_merge(c5, c4, 128)
c6 = _double_conv(u6, 128, 0.2)

u7 = _upsample_and_merge(c6, c3, 64)
c7 = _double_conv(u7, 64, 0.2)

u8 = _upsample_and_merge(c7, c2, 32)
c8 = _double_conv(u8, 32, 0.1)

u9 = _upsample_and_merge(c8, c1, 16)
c9 = _double_conv(u9, 16, 0.1)

# A 1x1 convolution with a sigmoid produces one value in [0, 1] per pixel.
outputs = Conv2D(1, (1, 1), activation='sigmoid')(c9)

model = Model(inputs=[inputs], outputs=[outputs])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

In [None]:
# Step 6 - Kickstart the training
# Train the model using the specified pathname
# Will take approx ~5min to complete 4 epochs (ml-g4dn-xlarge w/ NVIDIA T4 GPU)
# NOTE: the file name hard-codes "512x512" regardless of IMG_HEIGHT / IMG_WIDTH.
filepath = MODEL_PATH + "unet-cell-segmentation-v2-512x512.model"

# Stop early when two consecutive epochs bring no improvement.
earlystopper = EarlyStopping(patience=2, verbose=1)

# Write the model to `filepath` only when the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

callbacks_list = [earlystopper, checkpoint]

# 20% of the training arrays are set aside by Keras as the validation set.
history = model.fit(
    df_nuclei_imgs_x,
    df_nuclei_masks_y,
    batch_size=32,
    epochs=4,
    validation_split=0.20,
    callbacks=callbacks_list,
)

In [None]:
# Report where the checkpoint callback was configured to write the model.
saved_message = "Congratulations! Model saved to "
print(saved_message, filepath)

In [None]:
# Step 7 - Test inference against our stored model
# Shuffle the positions 0..NUM_TEST_IMAGES-1 and keep the first DISPLAY_TEST_IMAGES of them.
shuffled_positions = np.random.permutation(NUM_TEST_IMAGES)
random_samples = shuffled_positions[:DISPLAY_TEST_IMAGES]
random_samples

In [None]:
# Reset our dataframe index to match our random sample index
# (labels become 0..N-1, so the sampled positions can be used with `.loc`).
reindexed_tests = df_nuclei_masks_tests.reset_index(drop=True)
df_nuclei_masks_tests = reindexed_tests

In [None]:
# Load the images (not the masks) that belong to the randomly chosen test rows.
inference_filenames = df_nuclei_masks_tests.loc[random_samples, "filename"]
inference = format_img(IMG_PATH, inference_filenames, IMG_CHANNELS, np.uint8)

In [None]:
# Display the shape of the inference batch built in the previous cell.
inference.shape

In [None]:
# Reload the model from disk so inference runs against the stored checkpoint.
saved_model_path = MODEL_PATH + 'unet-cell-segmentation-v2-512x512.model'
model = load_model(saved_model_path)

In [None]:
# Run the loaded model on the inference batch.
predictions = model.predict(x=inference)

In [None]:
# Threshold the predictions
# Values of 0.5 and above become 1, everything else becomes 0.
is_foreground = predictions >= 0.5
predictions_threshold = is_foreground.astype(np.uint8)

In [None]:
# One figure row per displayed test sample; columns: input image, original mask, prediction.
# squeeze=False keeps `axarr` two-dimensional even when DISPLAY_TEST_IMAGES == 1, so the
# [row, column] indexing below always works.
f, axarr = plt.subplots(DISPLAY_TEST_IMAGES, 3, figsize=(32, 32), squeeze=False)
plt.tight_layout()

axarr[0,0].set_title('Cell Image', fontsize=24)
axarr[0,1].set_title('Original Mask', fontsize=24)
axarr[0,2].set_title('Model Prediction', fontsize=24)

for x in range(DISPLAY_TEST_IMAGES):
    # Column 0: the image that was fed to the model.
    axarr[x,0].set_aspect('equal')
    axarr[x,0].imshow(resize(inference[x, :, :, 0], (256, 256), mode='constant', preserve_range=True) )
    axarr[x,0].axis('on')

    # Column 1: the mask file that shares its filename with the image.
    original_mask_path = "{}/{}".format(MASK_PATH, df_nuclei_masks_tests.loc[random_samples[x]]["filename"])
    original_mask = imread(original_mask_path)
    axarr[x,1].set_aspect('equal')
    axarr[x,1].imshow(resize(original_mask, (256, 256), mode='constant', preserve_range=True) )
    axarr[x,1].axis('on')

    # Column 2: the thresholded model output for the same image.
    axarr[x,2].set_aspect('equal')
    axarr[x,2].imshow(resize(predictions_threshold[x, :, :, 0], (256, 256), mode='constant', preserve_range=True), cmap='gray')
    axarr[x,2].axis('on')

plt.subplots_adjust(wspace=0, hspace=0.2)
plt.show()