{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fabulous-number",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pprint\n",
    "\n",
    "from util_debugger import get_sys_metric\n",
    "from util_train import aug_exp_train"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "shared-wheat",
   "metadata": {},
   "source": [
    "### Setting up required parameters for the experiment to compare training time for augmentation on CPU and GPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "selective-count",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Please update your SageMaker Execution Role\n",
    "# (typically an IAM role ARN such as 'arn:aws:iam::<account-id>:role/<role-name>').\n",
    "curr_sm_role = None\n",
    "\n",
    "# 'pytorch-cpu': JPEG decoding and augmentation on CPUs using PyTorch Dataloader\n",
    "# 'dali-cpu': JPEG decoding and augmentation on CPUs using NVIDIA DALI\n",
    "# 'dali-gpu': JPEG decoding and augmentation on GPUs using NVIDIA DALI\n",
    "AUGMENTATION_APPROACHES = ['pytorch-cpu', 'dali-gpu']\n",
    "\n",
    "instance_type = 'ml.p3.2xlarge'\n",
    "# Required for plotting system utilization\n",
    "num_cpu = 8\n",
    "num_gpu = 1\n",
    "\n",
    "# Training Script supports: 'RESNET50', 'RESNET18', and 'RESNET152'\n",
    "model_arch = 'RESNET18'\n",
    "\n",
    "batch_size = 32\n",
    "\n",
    "# Factor by which to repeat augmentation operations for increasing data pre-processing load\n",
    "aug_load_factor = 12\n",
    "\n",
    "# You can change other parameters such as training data, S3 bucket, Epoch and training hyperparameters in the util_train module\n",
    "\n",
    "# Fail fast with a clear message instead of launching training jobs without a role.\n",
    "if not curr_sm_role:\n",
    "    raise ValueError('curr_sm_role is not set: update it with your SageMaker Execution Role '\n",
    "                     'before launching the training jobs.')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "intensive-party",
   "metadata": {},
   "source": [
    "### Launching training jobs and fetching system utilization for data pre-processing on CPUs vs on GPUs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "reserved-president",
   "metadata": {},
   "outputs": [],
   "source": [
    "exp_data = {}\n",
    "pp = pprint.PrettyPrinter()\n",
    "\n",
    "# One training job per augmentation approach; trials are numbered starting at 1.\n",
    "for trial, aug_operator in enumerate(AUGMENTATION_APPROACHES, start=1):\n",
    "\n",
    "    # Launch Amazon SageMaker PyTorch training jobs with your custom training script.\n",
    "    train_job_id, train_estimator = aug_exp_train(model_arch,\n",
    "                                                  batch_size,\n",
    "                                                  aug_operator,\n",
    "                                                  aug_load_factor,\n",
    "                                                  instance_type,\n",
    "                                                  curr_sm_role)\n",
    "\n",
    "    # Extract system utilization metrics with SageMaker Debugger\n",
    "    heatmap, metric_hist, sys_util_df = get_sys_metric(train_estimator,\n",
    "                                                       num_cpu,\n",
    "                                                       num_gpu)\n",
    "\n",
    "    # Print parameter and result summary for the current training job run\n",
    "    trial_data = {\n",
    "        'train_job_id': train_job_id,\n",
    "        'model_arch': model_arch,\n",
    "        'instance_type': instance_type,\n",
    "        'batch_size': batch_size,\n",
    "        'aug_load_factor': aug_load_factor,\n",
    "        'aug_operator': aug_operator,\n",
    "        'sys_util_df': sys_util_df,\n",
    "    }\n",
    "\n",
    "    pp.pprint(trial_data)\n",
    "    exp_data[f'trial-{trial}'] = trial_data\n",
    "\n",
    "# Summary of every trial, keyed 'trial-1', 'trial-2', ...\n",
    "pp.pprint(exp_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv_dev",
   "language": "python",
   "name": ".venv_dev"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}