In [56]:
import os
import csv
import pandas as pd
import hashlib
from io import BytesIO
import pickle, gzip
import random as rand

from PIL import Image
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import sagemaker

# ---------------------------------------------------------------------------
# Notebook configuration. Every path is derived from the directory the kernel
# was started in.
# ---------------------------------------------------------------------------
WORKING_DIR = os.getcwd()
DATA_DIR = WORKING_DIR+'/ut-zap50k-images-square'
# generate_zappos50k_index() reads ROOT_DATA_DIR and concatenates category
# names straight onto it, so the value must end with a path separator.
# NOTE(review): this name was never assigned anywhere in the notebook (it only
# existed as leftover kernel state); DATA_DIR + '/' is inferred from how the
# index paths are later joined back onto DATA_DIR -- please confirm.
ROOT_DATA_DIR = DATA_DIR+'/'

# S3 location the image archive is downloaded from.
DOWNLOAD_S3URI = "s3://reinvent2018-sagemaker-pytorch"

# Pair labels: an image paired with itself gets WEIGHT_SAME_IMG; any other pair
# starts at WEIGHT_DIFF_IMG and is reduced by the weightings below when the two
# images share a category and/or a sub-category.
WEIGHT_SAME_IMG = 0.0
WEIGHT_DIFF_IMG = 1.0
PARAM_SAME_CATEGORY_WEIGHTING = 0.05
PARAM_SAME_SUBCATEGORY_WEIGHTING = 0.01

# Index files (rows of: image path, category id, sub-category id).
# ZAPPOS50K_INDEX used to be assigned twice with the same value; it is now
# defined exactly once.
ZAPPOS50K_INDEX = WORKING_DIR+'/zappos50k-index.csv'
ZAPPOS50K_INDEX_TRAIN = WORKING_DIR+'/zappos50k-index-train.csv'
ZAPPOS50K_INDEX_TEST = WORKING_DIR+'/zappos50k-index-test.csv'

# Tuple index files (rows of: image 1, image 2, label).
ZAPPOS50K_TUPLES_INDEX_TRAIN = WORKING_DIR+'/zappos50k-tuples-index-train.csv'
ZAPPOS50K_TUPLES_INDEX_TEST = WORKING_DIR+'/zappos50k-tuples-index-test.csv'

## Download Data

In [None]:
%%bash -s "$DOWNLOAD_S3URI"
# $1 is the value of the notebook variable DOWNLOAD_S3URI, forwarded to the
# script by the `-s` argument on the magic line above.
# Copy the image archive from S3 into the current directory (no progress output).
aws s3 cp $1/ut-zap50k-images-square.zip . --quiet
# -n: never overwrite files that were already extracted; -q: quiet.
unzip -nq ut-zap50k-images-square.zip

## Generate Sample Indices

In [37]:
# Image directories (relative to DATA_DIR, laid out as
# category/sub-category/brand) that make up the small hand-picked training sample.
TRAIN_IMG_PATHS = ["Boots/Knee High/Anne Klein",
 "Boots/Knee High/Ariat",
 "Boots/Mid-Calf/UGG",
 "Sandals/Athletic/Keen Kids",
 "Sandals/Heel/Annie",
 "Sandals/Heel/Fly Flot",
 "Sandals/Heel/Onex",
 "Shoes/Oxfords/Calvin Klein",
 "Shoes/Oxfords/Rockport"]

# Image directories for the small test sample. Some entries carry a trailing
# '/' and some do not; get_categories() only reads the first two path
# components, so both spellings resolve to the same category/sub-category.
TEST_IMG_PATHS = ['Boots/Knee High/Tommy Hilfiger Kids/',
 'Boots/Over the Knee/Calvin Klein Collection/',
 'Shoes/Oxfords/Bass']

def getImageTensor(img_path, transform):
    """Open the image at ``img_path`` and return ``transform`` applied to it.

    img_path:  path handed to ``Image.open``.
    transform: callable that receives the opened image; whatever it returns is
               returned to the caller unchanged.
    """
    return transform(Image.open(img_path))

def get_categories(img_loc):
    """Return the category and sub-category encoded in a relative image path.

    The last path component of ``img_loc`` is discarded; the first two
    components of what remains are taken as category and sub-category.

    Returns a dict with the keys ``'category'`` and ``'sub'``.
    Raises IndexError when fewer than two components remain.
    """
    parts = os.path.dirname(img_loc).split(os.sep)
    return dict(category=parts[0], sub=parts[1])
 
def generate_sample_index(idxFile, img_paths):
    """Write a CSV index of every file found in the given image directories.

    Each output row is ``relative_path, category_id, subcategory_id``. The two
    ids are the SHA-256 digest of the category / sub-category name reduced
    modulo 10**9, so they are stable across runs.

    idxFile:   path of the CSV file to (over)write.
    img_paths: iterable of directories relative to ``DATA_DIR``; category and
               sub-category are derived from each via ``get_categories``.

    ``csv.Error`` is reported with ``print`` and not re-raised (unchanged
    best-effort behaviour). Rows follow ``os.listdir`` order, which is not
    guaranteed to be sorted.
    """
    # newline='' is the mode the csv module documents for writer targets; it
    # stops '\r\n' row terminators being translated a second time.
    with open(idxFile, 'w', newline='') as csvfile:
        try:
            csvwriter = csv.writer(csvfile)
            for rel_dir in img_paths:
                names = get_categories(rel_dir)
                cid = int(hashlib.sha256(names['category'].encode('utf-8')).hexdigest(), 16) % 10**9
                scid = int(hashlib.sha256(names['sub'].encode('utf-8')).hexdigest(), 16) % 10**9

                for fname in os.listdir(os.path.join(DATA_DIR, rel_dir)):
                    csvwriter.writerow([os.path.join(rel_dir, fname), cid, scid])
        except csv.Error as e:
            print(e)
    # The `with` block closes the file; no explicit close() is needed.

In [52]:
# Index files for the small hand-picked sample (TRAIN_IMG_PATHS / TEST_IMG_PATHS).
ZAPPOS50K_PARTIAL_INDEX = WORKING_DIR+'/zappos50k-partial-index.csv'
ZAPPOS50K_PARTIAL_INDEX_TRAIN = WORKING_DIR+'/zappos50k-partial-index-train.csv'
ZAPPOS50K_PARTIAL_INDEX_TEST = WORKING_DIR+'/zappos50k-partial-index-test.csv'

# Write one index per split, plus a combined index covering both splits.
generate_sample_index(ZAPPOS50K_PARTIAL_INDEX_TRAIN, TRAIN_IMG_PATHS)
generate_sample_index(ZAPPOS50K_PARTIAL_INDEX_TEST, TEST_IMG_PATHS)
generate_sample_index(ZAPPOS50K_PARTIAL_INDEX, TRAIN_IMG_PATHS+TEST_IMG_PATHS)

In [53]:
def generate_tuples_sample_index(idxFile, tuplesIdxFile):
    """Expand an image index into labelled image pairs and write them as CSV.

    Every image in ``idxFile`` is paired with itself and with every image that
    follows it in the index, so n images produce n * (n + 1) / 2 rows.

    Labels:
      * an image paired with itself            -> WEIGHT_SAME_IMG
      * any other pair                         -> WEIGHT_DIFF_IMG
          minus PARAM_SAME_CATEGORY_WEIGHTING     if the categories match
          minus PARAM_SAME_SUBCATEGORY_WEIGHTING  if the sub-categories match

    idxFile:       CSV without header, columns: image path, category id,
                   sub-category id.
    tuplesIdxFile: CSV to (over)write, without header, columns: img1, img2, label.

    Returns the pairs as a DataFrame with columns ``img1``, ``img2``, ``label``
    and a fresh 0..n-1 index.
    """
    indexDF = pd.read_csv(idxFile, header=None, names=['img1', 'cat', 'sub_cat'])

    # Collect one frame per anchor image and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, and appending inside the loop
    # re-copied the accumulated frame on every iteration.
    frames = []
    for pos in range(len(indexDF)):
        anchor = indexDF.iloc[pos]
        # The anchor itself plus every image after it, re-indexed from 0.
        rest = indexDF.iloc[pos:].reset_index(drop=True)

        label = WEIGHT_DIFF_IMG - (((anchor['cat'] == rest['cat']) * PARAM_SAME_CATEGORY_WEIGHTING) +
                                   ((anchor['sub_cat'] == rest['sub_cat']) * PARAM_SAME_SUBCATEGORY_WEIGHTING))
        # Row 0 of `rest` is the anchor paired with itself.
        label.iloc[0] = WEIGHT_SAME_IMG

        frames.append(pd.DataFrame({
            'img1': rest['img1'],
            'img2': anchor['img1'],   # scalar, broadcast to every row
            'label': label,
        }))

    if frames:
        tuplesDF = pd.concat(frames, ignore_index=True)
    else:
        # An index with no rows used to fail on `None.to_csv`; emit an empty result.
        tuplesDF = pd.DataFrame(columns=['img1', 'img2', 'label'])

    tuplesDF.to_csv(tuplesIdxFile, sep=',', index=False, header=False)

    return tuplesDF

In [54]:
# Pair ("tuple") index files for the small hand-picked sample.
ZAPPOS50K_TUPLES_PARTIAL_INDEX_TRAIN = WORKING_DIR+'/zappos50k-partial-tuples-index-train.csv'
ZAPPOS50K_TUPLES_PARTIAL_INDEX_TEST = WORKING_DIR+'/zappos50k-partial-tuples-index-test.csv'

# Expand each partial index into labelled image pairs. The second call is the
# cell's last expression, so the test-split frame is what gets displayed below.
generate_tuples_sample_index(ZAPPOS50K_PARTIAL_INDEX_TRAIN, ZAPPOS50K_TUPLES_PARTIAL_INDEX_TRAIN)
generate_tuples_sample_index(ZAPPOS50K_PARTIAL_INDEX_TEST, ZAPPOS50K_TUPLES_PARTIAL_INDEX_TEST)

Unnamed: 0,img1,img2,label
0,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,0.00
1,Boots/Knee High/Tommy Hilfiger Kids/8047638.3.jpg,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,0.94
2,Boots/Over the Knee/Calvin Klein Collection/80...,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,0.95
3,Shoes/Oxfords/Bass/7563706.226012.jpg,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,1.00
4,Shoes/Oxfords/Bass/7563706.371938.jpg,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,1.00
5,Shoes/Oxfords/Bass/7616146.278640.jpg,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,1.00
6,Shoes/Oxfords/Bass/7616146.372724.jpg,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,1.00
7,Shoes/Oxfords/Bass/7616146.372725.jpg,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,1.00
8,Shoes/Oxfords/Bass/8028830.372729.jpg,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,1.00
9,Shoes/Oxfords/Bass/7616146.244.jpg,Boots/Knee High/Tommy Hilfiger Kids/8027756.40...,1.00


## Generate Index for Full Zappos50k Data Set

In [55]:
# Category -> sub-category names, in the order they appear in CATEGORY_IDX.
_ZAPPOS50K_TAXONOMY = (
    ("Shoes", (
        "Sneakers and Athletic Shoes",
        "Loafers",
        "Crib Shoes",
        "Prewalker",
        "Flats",
        "Clogs and Mules",
        "Oxfords",
        "Firstwalker",
        "Heels",
        "Boat Shoes",
    )),
    ("Boots", (
        "Prewalker Boots",
        "Ankle",
        "Over the Knee",
        "Knee High",
        "Mid-Calf",
    )),
    ("Slippers", (
        "Boot",
        "Slipper Heels",
        "Slipper Flats",
    )),
    ("Sandals", (
        "Athletic",
        "Heel",
        "Flat",
    )),
)

# Lookup table filled in by generate_zappos50k_index(). Every category and every
# sub-category nested inside it carries:
#   "i": the hashed id written to the index file        (-1 until populated)
#   "r": [first_row, last_row] of its rows in the index ([-1, -1] until populated)
# Each entry gets its own fresh "r" list, so updating one never affects another.
CATEGORY_IDX = {
    category: dict(
        [("i", -1), ("r", [-1, -1])]
        + [(subcategory, {"i": -1, "r": [-1, -1]}) for subcategory in subcategories]
    )
    for category, subcategories in _ZAPPOS50K_TAXONOMY
}

# Image paths (relative to the data root) that generate_zappos50k_index() leaves
# out of the index file.
# NOTE(review): the reason these files are excluded is not recorded in the
# notebook -- presumably they cannot be processed by the image pipeline; confirm.
IMG_BLACK_LIST = ['Boots/Mid-Calf/Primigi Kids/8022041.89.jpg',
 'Boots/Mid-Calf/Roper Kids/7675771.248592.jpg',
 'Shoes/Sneakers and Athletic Shoes/Puma Kids/7587775.215216.jpg',
 'Shoes/Sneakers and Athletic Shoes/Puma Kids/7649123.238814.jpg',
 'Shoes/Heels/Aravon/8003190.2783.jpg',
 'Shoes/Sneakers and Athletic Shoes/Puma Kids/7649125.238816.jpg']

def generate_zappos50k_index(root_data_dir=None):
    """Index the full image tree into ZAPPOS50K_INDEX and populate CATEGORY_IDX.

    Walks ``<root>/<category>/<sub-category>/...`` and writes one CSV row
    ``relative_path, category_id, subcategory_id`` per file, skipping paths
    listed in ``IMG_BLACK_LIST``. The ids are the SHA-256 digest of the name
    reduced modulo 10**9.

    Side effects on ``CATEGORY_IDX``: for every category and sub-category found
    on disk, ``"i"`` is set to its id and ``"r"`` to ``[first_row, last_row]``,
    the inclusive range of 0-based row numbers it occupies in the index file.

    root_data_dir: directory holding the category folders. When omitted, the
        module-level ``ROOT_DATA_DIR`` is used if it exists, otherwise
        ``DATA_DIR + '/'``.
        NOTE(review): ``ROOT_DATA_DIR`` was read here but never assigned in the
        notebook; the ``DATA_DIR`` fallback is inferred from how the index
        paths are later joined onto ``DATA_DIR`` -- confirm.

    Raises KeyError if a directory name on disk has no entry in CATEGORY_IDX.
    ``csv.Error`` is reported with ``print`` and not re-raised (unchanged).
    """
    if root_data_dir is None:
        try:
            root_data_dir = ROOT_DATA_DIR
        except NameError:
            root_data_dir = DATA_DIR + '/'

    # newline='' is the mode the csv module documents for writer targets.
    with open(ZAPPOS50K_INDEX, 'w', newline='') as idxfile:
        try:
            csvwriter = csv.writer(idxfile)
            i = 0  # number of rows written so far == row number of the next row
            for category in os.listdir(root_data_dir):

                print(category+": "+str(i))
                cid = int(hashlib.sha256(category.encode('utf-8')).hexdigest(), 16) % 10**9

                CATEGORY_IDX[category]["i"] = cid
                CATEGORY_IDX[category]["r"][0] = i

                category_dir = os.path.join(root_data_dir, category)
                for subcat in os.listdir(category_dir):
                    print(" "+subcat+": "+str(i))
                    scid = int(hashlib.sha256(subcat.encode('utf-8')).hexdigest(), 16) % 10**9

                    CATEGORY_IDX[category][subcat]["i"] = scid
                    CATEGORY_IDX[category][subcat]["r"][0] = i

                    for (root, _dirs, files) in os.walk(os.path.join(category_dir, subcat)):
                        # Path of this directory relative to the data root;
                        # unlike str.replace this does not depend on the root
                        # ending with a separator.
                        rel_root = os.path.relpath(root, root_data_dir)
                        for f in files:
                            img_path = os.path.join(rel_root, f)
                            if img_path not in IMG_BLACK_LIST:
                                csvwriter.writerow([img_path, cid, scid])
                                # Only written rows are counted, so the ranges
                                # line up with row numbers in the index file.
                                i = i+1

                    CATEGORY_IDX[category][subcat]["r"][1] = i-1
                CATEGORY_IDX[category]["r"][1] = i-1

        except csv.Error as e:
            print(e)
    # The `with` block closes the file; no explicit close() is needed.

In [57]:
import json

# Build the full index file; this also fills CATEGORY_IDX with the hashed ids
# and index row ranges, which are then pretty-printed for inspection.
generate_zappos50k_index()
print(json.dumps(CATEGORY_IDX, indent=4))

Boots: 0
 Over the Knee: 0
 Prewalker Boots: 49
 Mid-Calf: 51
 Ankle: 4775
 Knee High: 10630
Shoes: 12832
 Flats: 12832
 Clogs and Mules: 16826
 Sneakers and Athletic Shoes: 18253
 Crib Shoes: 31109
 Loafers: 31132
 Heels: 34007
 Oxfords: 39745
 Firstwalker: 41789
 Boat Shoes: 42158
 Prewalker: 42787
Slippers: 43036
 Slipper Flats: 43036
 Boot: 44295
 Slipper Heels: 44309
Sandals: 44319
 Flat: 44319
 Athletic: 49920
 Heel: 49934
{
 "Shoes": {
 "i": 746698023,
 "r": [
 12832,
 43035
 ],
 "Sneakers and Athletic Shoes": {
 "i": 844163951,
 "r": [
 18253,
 31108
 ]
 },
 "Loafers": {
 "i": 125610153,
 "r": [
 31132,
 34006
 ]
 },
 "Crib Shoes": {
 "i": 305734478,
 "r": [
 31109,
 31131
 ]
 },
 "Prewalker": {
 "i": 651163850,
 "r": [
 42787,
 43035
 ]
 },
 "Flats": {
 "i": 495965819,
 "r": [
 12832,
 16825
 ]
 },
 "Clogs and Mules": {
 "i": 55293501,
 "r": [
 16826,
 18252
 ]
 },
 "Oxfords": {
 "i": 904836252,
 "r": [
 39745,
 41788
 ]
 },
 "Firstwalker": {
 "i": 542200525,
 "r": [
 41789,
 

In [58]:
def get_random_img(img_idx, df_idx, df_row, cat, sub_cat):
    """Pair an anchor image with a random image from one sub-category.

    img_idx: DataFrame of the full index; column 0 holds the image path and
             rows are addressed by position.
    df_idx:  row number of the anchor image in ``img_idx``; never picked as
             the partner.
    df_row:  the anchor's row (needs ``'img'``, ``'cat'`` and ``'sub_cat'``).
    cat, sub_cat: names selecting the pool via ``CATEGORY_IDX[cat][sub_cat]``,
             whose ``'r'`` entry is the inclusive row range to draw from.

    Returns ``[anchor_path, partner_path, label]`` where label is
    WEIGHT_DIFF_IMG, reduced by PARAM_SAME_CATEGORY_WEIGHTING when the anchor
    belongs to ``cat`` and by PARAM_SAME_SUBCATEGORY_WEIGHTING when it belongs
    to ``sub_cat``.

    Raises ValueError when the pool consists solely of the anchor itself; the
    retry loop below could otherwise never terminate.
    """
    lo, hi = CATEGORY_IDX[cat][sub_cat]['r']

    if lo == hi == df_idx:
        raise ValueError(
            "sub-category %r/%r contains only the anchor image (row %r); "
            "no other image can be sampled" % (cat, sub_cat, df_idx))

    # Redraw until the pick differs from the anchor.
    rd = df_idx
    while rd == df_idx:
        rd = rand.randint(lo, hi)

    return [df_row['img'],
            img_idx.iloc[rd, 0],
            WEIGHT_DIFF_IMG
            - ((df_row['cat'] == CATEGORY_IDX[cat]['i']) * PARAM_SAME_CATEGORY_WEIGHTING)
            - ((df_row['sub_cat'] == CATEGORY_IDX[cat][sub_cat]['i']) * PARAM_SAME_SUBCATEGORY_WEIGHTING)]
 
def generate_zappos50k_tuples_index(train_test_split=80):
    """Write the train and test pair ("tuple") indexes for the full data set.

    For every image in ZAPPOS50K_INDEX a group of rows is built: the image
    paired with itself (label WEIGHT_SAME_IMG) followed by one random partner
    per sampling pool listed below (via ``get_random_img``). The whole group is
    written to either the train or the test file, so an anchor image never
    straddles the split.

    train_test_split: integer percentage. A group goes to the test file when
        ``rand.randint(1, 100) > train_test_split``, i.e. roughly that
        percentage of images ends up in the train file.

    Output files: ZAPPOS50K_TUPLES_INDEX_TRAIN and ZAPPOS50K_TUPLES_INDEX_TEST,
    CSV without header, columns: image 1, image 2, label.
    ``csv.Error`` is reported with ``print`` and not re-raised (unchanged).
    """
    # (category, sub-category) pools, one random partner each, kept in the
    # original call order so a seeded run draws the same random numbers.
    sample_pools = [
        ('Shoes', 'Sneakers and Athletic Shoes'),
        ('Shoes', 'Loafers'),
        ('Shoes', 'Crib Shoes'),
        ('Shoes', 'Prewalker'),
        ('Shoes', 'Flats'),
        ('Shoes', 'Clogs and Mules'),
        ('Shoes', 'Oxfords'),
        ('Shoes', 'Firstwalker'),
        ('Shoes', 'Heels'),
        ('Shoes', 'Boat Shoes'),
        # NOTE(review): 'Oxfords' is sampled a second time, exactly as the
        # hand-written call list did. It looks like a copy/paste leftover, but
        # it is kept so the generated data set does not change -- confirm.
        ('Shoes', 'Oxfords'),
        ('Boots', 'Prewalker Boots'),
        ('Boots', 'Ankle'),
        ('Boots', 'Over the Knee'),
        ('Boots', 'Knee High'),
        ('Boots', 'Mid-Calf'),
        ('Slippers', 'Boot'),
        ('Slippers', 'Slipper Heels'),
        ('Slippers', 'Slipper Flats'),
        ('Sandals', 'Athletic'),
        ('Sandals', 'Heel'),
        ('Sandals', 'Flat'),
    ]

    # newline='' is the mode the csv module documents for writer targets.
    with open(ZAPPOS50K_TUPLES_INDEX_TRAIN, 'w', newline='') as train_ds, \
         open(ZAPPOS50K_TUPLES_INDEX_TEST, 'w', newline='') as test_ds:
        try:
            train_writer = csv.writer(train_ds)
            test_writer = csv.writer(test_ds)

            img_idx = pd.read_csv(ZAPPOS50K_INDEX, header=None, names=['img', 'cat', 'sub_cat'])

            for (df_idx, df_row) in img_idx.iterrows():
                rows = [[df_row['img'], df_row['img'], WEIGHT_SAME_IMG]]
                for (cat, sub_cat) in sample_pools:
                    rows.append(get_random_img(img_idx, df_idx, df_row, cat, sub_cat))

                # The split is drawn once per anchor image, after its partners.
                if rand.randint(1, 100) > train_test_split:
                    test_writer.writerows(rows)
                else:
                    train_writer.writerows(rows)

        except csv.Error as e:
            print(e)
    # The `with` block closes both files; no explicit close() is needed.

In [59]:
# Split threshold of 90: an image's pairs go to the test file only when the
# per-image draw from 1..100 exceeds 90, otherwise to the train file.
generate_zappos50k_tuples_index(90)

## Generate NPY Tensors for Batch Inference Input

The script below generates input files in NPY format that are required by the batch inference implementation. 

Each file contains images that have been converted to numpy arrays, serialized with Pickle, and compressed with gzip.

Each file contains an array consisting of 4 dimensions: 
 1. Batch size
 2. Channels. The tensors have 3 representing RGB
 3. The last two dimensions are 224x224 representing the pixel values for each image and channel.
 
The first array represents the image that will be compared against the other images. For instance, a file that contains
a tensor with the dimensions [53,3,224,224] represents 53 vectorized images. The first index into the first dimension represents an image of the shape [1,3,224,224] that will be compared against the remaining slices, which represent 52 images
of the same shape.

In [34]:
# SageMaker session and the S3 locations used by this notebook.
sagemaker_session = sagemaker.Session()
BUCKET = sagemaker_session.default_bucket()
PREFIX = 'sagemaker/DEMO-pytorch-siamese-network'
DATA_S3URI = "s3://"+BUCKET+'/'+PREFIX+'/data'

# S3 key prefixes for batch-inference input and output.
BATCH_INPUT_PREFIX = PREFIX+'/batch/in'
BATCH_OUTPUT_PREFIX = PREFIX+'/batch/out'
# Local scratch directory for tensor files, and the file-name stem (leading
# '/' included) that is appended to the per-image directory.
IMG_TENSOR_ROOT = WORKING_DIR+'/tensors'
BATCH_INPUT_FILENAME = '/tensors'

# Pre-processing applied to every image: resize to 224, convert to a tensor,
# then normalise each of the three channels with the given mean / std.
TRANSFORMATIONS = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def getImageTensor(img_path, transform):
    """Open the image at ``img_path`` and return ``transform`` applied to it.

    NOTE: a function with this name and signature is also defined earlier in
    the notebook; this later definition is the one in effect once this cell
    has run. Keep the two in sync, or drop one of them.

    img_path:  path handed to ``Image.open``.
    transform: callable that receives the opened image; whatever it returns is
               returned to the caller unchanged.
    """
    opened = Image.open(img_path)
    return transform(opened)

def batch_image_to_tensor_tuples(img_loc, dataloader, file_prefix, s3_prefix_out):
    """Build and upload the batch-inference input files for one reference image.

    For every batch yielded by ``dataloader`` the reference image is stacked in
    front of the batch's images (so row 0 of each array is always the reference
    image), the pair ``(image names, stacked array)`` is pickled (protocol 2)
    into a gzip file, the file is uploaded to S3 and the local copy is removed.

    img_loc:       path of the reference image.
    dataloader:    iterable of dicts with the keys ``'name'`` and ``'tensor'``.
    file_prefix:   sub-directory (with leading '/') under IMG_TENSOR_ROOT used
                   for the temporary local files.
    s3_prefix_out: S3 key prefix the files are uploaded under (bucket BUCKET).

    Files are named ``<BATCH_INPUT_FILENAME><n>.npy.gz`` with n counting from 1.
    """
    img1 = getImageTensor(img_loc, TRANSFORMATIONS)
    img1.unsqueeze_(0)  # add a leading batch axis so it can be stacked
    img1 = img1.numpy()

    out_dir = IMG_TENSOR_ROOT+file_prefix
    npy_prefix = out_dir+BATCH_INPUT_FILENAME

    os.makedirs(out_dir, exist_ok=True)

    for nbatch, data in enumerate(dataloader, start=1):

        img_name = data.get('name')
        img2 = data.get('tensor').numpy()
        batch = np.vstack((img1, img2))

        npy_f = npy_prefix+str(nbatch)+'.npy.gz'

        try:
            # Leaving the `with` block closes the gzip stream, so the file is
            # complete on disk before it is uploaded.
            with gzip.open(npy_f, 'wb') as npy:
                pickle.dump((img_name, batch), npy, 2)
            # Reached only when the dump succeeded: a partially written file is
            # never uploaded.
            sagemaker_session.upload_data(path=npy_f, bucket=BUCKET, key_prefix=s3_prefix_out)
        finally:
            # clean up local files, whether or not dump/upload succeeded
            if os.path.exists(npy_f):
                os.remove(npy_f)

        print("completed: "+npy_f)
 
class Zappos50kDataset(Dataset):
    """Dataset over the images listed in an index CSV.

    csv_file:  index file without header; only its first two columns are read
               and only the first one (image path relative to ``root_dir``) is
               used.
    root_dir:  directory the relative image paths are joined onto.
    transform: optional callable applied to each opened image. When omitted,
               the opened image is returned as-is.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        self.index = pd.read_csv(csv_file, header=None, usecols=[0, 1])
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Number of rows in the index."""
        return self.index.shape[0]

    def __getitem__(self, idx):
        """Return ``{'name': relative path, 'tensor': (transformed) image}``."""
        rel_path = self.index.iloc[idx, 0]
        image = Image.open(os.path.join(self.root_dir, rel_path))

        # `transform` defaults to None; calling it unconditionally used to raise
        # "TypeError: 'NoneType' object is not callable" for that default.
        if self.transform is not None:
            image = self.transform(image)

        return {'name': rel_path, 'tensor': image}

def convert_images_to_tensors(df, batch_size=32):
    """Create and upload batch-inference input files for each image in ``df``.

    Each image listed in ``df['img']`` (path relative to the image directory)
    is used in turn as the reference image and handed to
    ``batch_image_to_tensor_tuples`` together with a DataLoader over the whole
    ZAPPOS50K_INDEX, so the full data set is iterated once per reference image.

    df:         DataFrame with an ``'img'`` column of relative image paths.
    batch_size: number of index images per generated file (default 32, the
                value that used to be hard-coded).
    """
    DATA_DIR = WORKING_DIR+'/ut-zap50k-images-square'
    BATCH_INPUT_PREFIX = 'sagemaker/DEMO-pytorch-siamese-network/batch/in'

    zapposDS = Zappos50kDataset(ZAPPOS50K_INDEX, DATA_DIR, TRANSFORMATIONS)
    zapposDL = torch.utils.data.DataLoader(dataset=zapposDS, batch_size=batch_size, shuffle=False)

    for rel_img in df['img']:

        img_loc = DATA_DIR + '/' + rel_img
        path, file = os.path.split(img_loc)
        # '/<relative dir>/<file name without extension>': used both as the
        # local sub-directory and as the suffix of the S3 key prefix.
        file_prefix = path.replace(DATA_DIR, '')+'/'+os.path.splitext(file)[0]
        batch_image_to_tensor_tuples(img_loc, zapposDL, file_prefix, BATCH_INPUT_PREFIX+file_prefix)

In [None]:
# Full run: use every image in the complete index as a reference image.
#zapposDF = pd.read_csv(ZAPPOS50K_INDEX, header=None, usecols=[0],names=['img']) 
# NOTE(review): 'zappos50k-index-cont1.csv' is not produced by any cell in this
# notebook -- presumably a hand-made subset of the index used to resume an
# interrupted run; confirm, or switch back to the line above.
zapposDF = pd.read_csv(WORKING_DIR+'/zappos50k-index-cont1.csv', header=None, usecols=[0],names=['img']) 
convert_images_to_tensors(zapposDF)

completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors1.npy.gz
completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors2.npy.gz
completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors3.npy.gz
completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors4.npy.gz
completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors5.npy.gz
completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors6.npy.gz
completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors7.npy.gz
completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors8.npy.gz
completed: /home/ec2-user/SageMaker/tensors/Boots/Over the Knee/Stuart Weitzman/7793422.106/tensors9.npy.gz
completed: /home/ec2-user/Sa