# GPU performance I/O: optimise code and data format


GPUs can significantly speed up deep learning training, and have the potential to reduce training time from weeks to just hours. However, in order to fully benefit from the use of GPUs, there are many aspects to consider, such as a) code optimizations to ensure that the underlying hardware is fully utilized, b) using the latest high-performance libraries and GPU drivers, c) optimizing input/output and network operations to ensure that the data is fed to the GPU at a rate that matches its computations, and d) optimizing communication between GPUs during multi-GPU or distributed training.
 
Here, we will be specifically focusing on optimizations for improving I/O for GPU performance tuning, regardless of the underlying infrastructure or deep learning framework, as shown in Figure 1. This is one area where customers stand to benefit the most, typically obtaining 10X improvements in overall GPU training performance just by optimizing I/O processing routines.


We will be using the Caltech 256 dataset to demonstrate the results.

In [None]:
import sagemaker
import boto3
import os
import sys
import pandas as pd
import tempfile
import shutil
import numpy as np
sys.path.append('./src')

# SageMaker session; used further down to look up the default S3 bucket
sagemaker_session = sagemaker.Session()

# Account id of the caller (via STS) and the region of the default boto3 session
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity.get('Account')
region = boto3.session.Session().region_name

# IAM role handed to the estimators: the execution role reported by SageMaker
role = sagemaker.get_execution_role()
# Optionally, use a role of your own choice instead, e.g.:
#role="arn:aws:iam::{}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449".format(account_id)


## Set up data locations in S3

In [None]:
# Default bucket of the SageMaker session; every S3 location below is built from it
bucket = sagemaker_session.default_bucket()

In [None]:

# All S3 URLs in this notebook share the same "s3://<bucket>/<key>" layout
s3_url_template = "s3://{}/{}"

# Raw Caltech 256 files: S3 destination and local working directory
prefix_s3_train = "caltech_256/train/"
s3_train = s3_url_template.format(bucket, prefix_s3_train)
tmp_data_dir = "temp/caltech"

# Preprocessed files: S3 destination and local working directory
prefix_s3_train_processed = "caltech_256/train_processed/"
s3_train_processed = s3_url_template.format(bucket, prefix_s3_train_processed)
tmp_processed_data_dir = "temp/caltech_processed"

# Output location passed to the estimators
s3_model_path = s3_url_template.format(bucket, "models")



## Download Caltech 256 dataset

Set `prepare_dataset` to `False` to avoid recreating the data on repeated runs.

In [None]:
# True: download, preprocess and upload the dataset in the cells below; False: skip those steps
prepare_dataset = True

#### Download dataset

In [None]:
%%time

%%bash -s "$prepare_dataset" "$s3_train" "$tmp_data_dir"
# Abort on the first failing command and echo every command as it runs
set -e
set -x

# Positional arguments supplied on the %%bash line
prepare_dataset="$1"
s3_data_url="$2"      # passed in by the notebook but not used by this script
local_data_dir="$3"

# All expansions are quoted so that paths containing spaces or glob
# characters are handled as a single argument.
if [ "$prepare_dataset" == "True" ]
then
    # Start from a clean local directory
    rm -rf "$local_data_dir"
    mkdir -p "$local_data_dir"

    # Download CALTECH 256 dataset
    wget http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar -O "$local_data_dir/data.tar"

    # Extract into a temporary directory, then move the extracted content up
    data_tmp_dir="$local_data_dir/downloaded_data"
    mkdir -p "$data_tmp_dir"
    tar -xf "$local_data_dir/data.tar" -C "$data_tmp_dir"
    mv "$data_tmp_dir"/* "$local_data_dir"

    # Delete the tar archive and the (now empty) temporary directory
    rm "$local_data_dir/data.tar"
    rm -d "$data_tmp_dir"

fi
echo "$prepare_dataset"

### Upload raw files to S3

In [None]:
%%time

from s3_util import S3Util

# Upload the local raw dataset directory to S3 only when the dataset is being prepared
if prepare_dataset:
    raw_uploader = S3Util()
    raw_uploader.upload_files(tmp_data_dir, s3_train)
 

In [None]:
!aws s3 ls $s3_train


In [None]:
!aws s3 ls --recursive $s3_train | wc -l



## Create training job

#### Source directory

In [None]:
# Training code handed to the estimators: the source folder, the entry-point
# script, and the datasets folder listed as an extra dependency.
source_dir = "./src"
entry_point_file = "main.py"
dependencies = [source_dir + "/datasets"]

In [None]:
# Instance type requested by every estimator in this notebook
train_instance_type = "ml.p3.2xlarge" 

In [None]:
# Metric definitions passed to the estimators. Each regex captures the number
# that follows a "## <name> ##:" marker. Raw strings are used so that "\d"
# reaches the regex engine verbatim instead of being an invalid string escape.
metric_def = [
    {
        "Name": "loss",
        "Regex": r"## loss ##: (\d*[.]?\d*)",
    },
    {
        "Name": "secs_time_per_epoch",
        "Regex": r"## secs_time_per_epoch ##: (\d*[.]?\d*)",
    },
]

In [None]:
# Single "train" input channel pointing at the raw dataset location in S3
inputs = dict(train=s3_train)



In [None]:
# Training settings referenced by the hyperparameter dictionaries below
epochs, batch_size = 20, 32
learning_rate = 1e-5
log_level = "INFO"  # alternative: "DEBUG"

In [None]:
# Framework version passed to every PyTorch estimator below
pytorch_version= "1.2.0"

### This is the most naive implementation 

This uses a single worker for the dataloader and an unoptimised dataset. This will have the slowest performance.

Let's have a look at the custom dataset.

In [None]:
!pygmentize ./src/datasets/custom_caltech_dataset.py

In [None]:
# Hyperparameters for the baseline job: unoptimised dataset, one dataloader worker.
# The literal used to list 'epochs' twice ('epochs': 1 first, then "epochs": epochs);
# the later entry always won, so only the effective one is kept.
hp_naivedataset_single_worker = {
    "epochs": epochs,
    "batch-size": batch_size,
    "numworkers": 1,
    "dataset_type": "CustomCaltechDataset",
    "lr": learning_rate,
    "log-level": log_level,
}

In [None]:
from time import gmtime, strftime

from sagemaker.pytorch import PyTorch

# Estimator for the baseline run (unoptimised dataset, single worker)
estimator = PyTorch(
    # Training code
    source_dir=source_dir,
    entry_point=entry_point_file,
    dependencies=dependencies,
    # Runtime
    framework_version=pytorch_version,
    py_version="py3",
    role=role,
    train_instance_type=train_instance_type,
    train_instance_count=1,
    # Job configuration
    hyperparameters=hp_naivedataset_single_worker,
    metric_definitions=metric_def,
    output_path=s3_model_path,
)



In [None]:
# Suffix the job name with the current UTC time, down to the second
launch_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
job_name = "gpu-performance-naive-one-worker" + launch_time

# wait=False: launch the training job without blocking this cell until it completes
estimator.fit(inputs, job_name=job_name, wait=False)

### Improve the performance by increasing the number of workers
This uses multiple workers for the dataloader.

This example uses a p3.2xlarge instance, which has 8 vCPUs [https://aws.amazon.com/ec2/instance-types/p3/]. So we will use 8 - 1 = 7 workers.

In [None]:

# 8 - 1: one fewer worker than the 8 CPUs noted for the p3 2xlarge instance
num_workers = 7

In [None]:
# Hyperparameters for the unoptimised dataset with `num_workers` dataloader workers.
# The literal used to list 'epochs' twice ('epochs': 1 first, then "epochs": epochs);
# the later entry always won, so only the effective one is kept.
hp_naivedataset_multiple_worker = {
    "epochs": epochs,
    "batch-size": batch_size,
    "numworkers": num_workers,
    "dataset_type": "CustomCaltechDataset",
    "lr": learning_rate,
    "log-level": log_level,
}

In [None]:
from time import gmtime, strftime

from sagemaker.pytorch import PyTorch

# Estimator for the multi-worker run on the unoptimised dataset
estimator = PyTorch(
    # Training code
    source_dir=source_dir,
    entry_point=entry_point_file,
    # Runtime
    framework_version=pytorch_version,
    py_version="py3",
    role=role,
    train_instance_type=train_instance_type,
    train_instance_count=1,
    # Job configuration
    hyperparameters=hp_naivedataset_multiple_worker,
    metric_definitions=metric_def,
    output_path=s3_model_path,
)



In [None]:
# Suffix the job name with the current UTC time, down to the second
launch_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
job_name = "gpu-performance-naive-multi-worker" + launch_time

# wait=False: launch the training job without blocking this cell until it completes
estimator.fit(inputs, job_name=job_name, wait=False)

### Improve the performance by optimising the dataset code
This uses a single worker for the dataloader, but uses preprocessed data, so the `__getitem__` function is quite lean. This allows you to obtain similar performance to having multiple workers.

Sometimes, in order to obtain the best performance, you should both use multiple workers and optimise the code.

#### Preprocessing logic

In [None]:
!pygmentize ./src/datasets/caltech_image_preprocessor.py

### Preprocess dataset

In [None]:
%%time

import os
import shutil

sys.path.append('./src')
from datasets.caltech_image_preprocessor import CaltechImagePreprocessor

if prepare_dataset:
    # Recreate the output directory from scratch
    if os.path.exists(tmp_processed_data_dir):
        shutil.rmtree(tmp_processed_data_dir)
    os.makedirs(tmp_processed_data_dir, exist_ok=False)

    # The input path used to be os.path.join(os.path.dirname("."), tmp_data_dir);
    # os.path.dirname(".") is the empty string, so that is simply tmp_data_dir.
    # NOTE(review): `parts=4` presumably controls how many output parts are
    # written - confirm in caltech_image_preprocessor.py.
    CaltechImagePreprocessor().dump(tmp_data_dir, tmp_processed_data_dir, parts=4)





### Upload to s3

In [None]:
%%time


from s3_util import S3Util

# Upload the local preprocessed directory to S3 only when the dataset is being prepared
if prepare_dataset:
    processed_uploader = S3Util()
    processed_uploader.upload_files(tmp_processed_data_dir, s3_train_processed)
 

In [None]:
!aws s3 ls $s3_train_processed/

In [None]:
!pygmentize ./src/datasets/custom_caltech_optimised_dataset.py

#### Here we use a single worker to achieve the same performance as multiple workers.
In this example, if you increase the number of workers you would barely see any performance gain, as the `__getitem__` operation is optimised as much as possible. In other cases, tuning your dataset code and then increasing the number of workers will provide the optimal performance.

In [None]:

# Single dataloader worker for the job that trains on the preprocessed dataset
num_workers = 1

In [None]:

# Single "train" input channel pointing at the preprocessed dataset location in S3.
# (The variable name is misspelled, but the fit() call further down refers to it as is.)
inputs_procesesd = dict(train=s3_train_processed)



In [None]:
# Hyperparameters for the job that uses the optimised (preprocessed) dataset.
# Despite the variable name, "numworkers" is whatever `num_workers` currently
# holds (set to 1 in the cell above).
# The literal used to list 'epochs' twice ('epochs': 1 first, then "epochs": epochs);
# the later entry always won, so only the effective one is kept.
hp_optimiseddataset_multiple_worker = {
    "epochs": epochs,
    "batch-size": batch_size,
    "numworkers": num_workers,
    "dataset_type": "CustomCaltechOptimisedDataset",
    "lr": learning_rate,
}

In [None]:
from time import gmtime, strftime

from sagemaker.pytorch import PyTorch

# Estimator for the run on the optimised (preprocessed) dataset
estimator = PyTorch(
    # Training code
    source_dir=source_dir,
    entry_point=entry_point_file,
    # Runtime
    framework_version=pytorch_version,
    py_version="py3",
    role=role,
    train_instance_type=train_instance_type,
    train_instance_count=1,
    # Job configuration
    hyperparameters=hp_optimiseddataset_multiple_worker,
    metric_definitions=metric_def,
    output_path=s3_model_path,
)



In [None]:
# Suffix the job name with the current UTC time, down to the second
launch_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
job_name = "gpu-performance-tuned-single-worker" + launch_time

# wait=False: launch the training job without blocking this cell until it completes
estimator.fit(inputs_procesesd, job_name=job_name, wait=False)