{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Distributed PyTorch in SageMaker with Magic " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\u001b[0;31mDocstring:\u001b[0m\n", "::\n", "\n", " %pytorch [--estimator_name ESTIMATOR_NAME] [--entry_point ENTRY_POINT]\n", " [--source_dir SOURCE_DIR] [--role ROLE]\n", " [--framework_version FRAMEWORK_VERSION]\n", " [--py_version PY_VERSION] [--instance_type INSTANCE_TYPE]\n", " [--instance_count INSTANCE_COUNT] [--output_path OUTPUT_PATH]\n", " [--hyperparameters FOO:1,BAR:0.555,BAZ:ABC | 'FOO : 1, BAR : 0.555, BAZ : ABC']\n", " [--channel_training CHANNEL_TRAINING]\n", " [--channel_testing CHANNEL_TESTING]\n", " [--use_spot_instances [USE_SPOT_INSTANCES]]\n", " [--max_wait MAX_WAIT]\n", " [--enable_sagemaker_metrics [ENABLE_SAGEMAKER_METRICS]]\n", " [--metric_definitions ['Name: loss, Regex: Loss = .*?);' ['Name: loss, Regex: Loss = (.*?;' ...]]]\n", " [--name_contains NAME_CONTAINS] [--max_result MAX_RESULT]\n", " {submit,list,status,logs,delete}\n", "\n", "Pytorch magic command.\n", "\n", "methods:\n", " {submit,list,status,logs,delete}\n", "\n", "submit:\n", " --estimator_name ESTIMATOR_NAME\n", " estimator shell variable name\n", " --entry_point ENTRY_POINT\n", " notebook local code file\n", " --source_dir SOURCE_DIR\n", " notebook local code src, may contain requirements.txt\n", " --role ROLE An AWS IAM role (either name or full ARN). The Amazon\n", " SageMaker training jobs and APIs that create Amazon\n", " SageMaker endpoints use this role to access training\n", " data and model artifacts. After the endpoint is\n", " created, the inference code might use the IAM role, if\n", " it needs to access an AWS resource.\n", " --framework_version FRAMEWORK_VERSION\n", " PyTorch version\n", " --py_version PY_VERSION\n", " Python version\n", " --instance_type INSTANCE_TYPE\n", " Type of EC2 instance to use for training, for example,\n", " ‘ml.c4.xlarge’.\n", " --instance_count INSTANCE_COUNT\n", " Number of Amazon EC2 instances to use for training.\n", " --output_path OUTPUT_PATH\n", " S3 location for saving the training result (model\n", " artifacts and output files). If not specified, results\n", " are stored to a default bucket. If the bucket with the\n", " specific name does not exist, the estimator creates\n", " the bucket during the fit() method execution.\n", " --hyperparameters \n", " Hyperparameters are passed to your script as arguments\n", " and can be retrieved with an argparse.\n", " --channel_training CHANNEL_TRAINING\n", " A string that represents the path to the directory\n", " that contains the input data for the training channel.\n", " --channel_testing CHANNEL_TESTING\n", " A string that represents the path to the directory\n", " that contains the input data for the testing channel.\n", "\n", "submit-spot:\n", " --use_spot_instances <[USE_SPOT_INSTANCES]>\n", " Specifies whether to use SageMaker Managed Spot\n", " instances for training. If enabled then the max_wait\n", " arg should also be set. More information:\n", " https://docs.aws.amazon.com/sagemaker/latest/dg/model-\n", " managed-spot-training.html\n", " --max_wait MAX_WAIT Timeout in seconds waiting for spot training instances\n", " (default: None). 
After this amount of time Amazon\n", " SageMaker will stop waiting for Spot instances to\n", " become available (default: None).\n", "\n", "submit-metrics:\n", " --enable_sagemaker_metrics <[ENABLE_SAGEMAKER_METRICS]>\n", " Enables SageMaker Metrics Time Series. For more\n", " information see: https://docs.aws.amazon.com/sagemaker\n", " /latest/dg/API_AlgorithmSpecification.html# SageMaker-\n", " Type-AlgorithmSpecification-\n", " EnableSageMakerMetricsTimeSeries\n", " --metric_definitions <['Name: loss, Regex: Loss = (.*?);' ['Name: loss, Regex: Loss = (.*?);' ...]]>\n", " A list of dictionaries that defines the metric(s) used\n", " to evaluate the training jobs. Each dictionary\n", " contains two keys: ‘Name’ for the name of the metric,\n", " and ‘Regex’ for the regular expression used to extract\n", " the metric from the logs. This should be defined only\n", " for jobs that don’t use an Amazon algorithm.\n", "\n", "list:\n", " --name_contains NAME_CONTAINS\n", " --max_result MAX_RESULT\n", "\u001b[0;31mFile:\u001b[0m /opt/conda/lib/python3.8/site-packages/sage_maker_kernel/kernelmagics.py\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%pytorch?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set up S3 bucket locations\n", "\n", "First, set up some locations in the default SageMaker bucket to store the raw input datasets and the PyTorch job output." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sagemaker\n", "\n", "sess = sagemaker.Session()\n", "\n", "output_path='s3://' + sess.default_bucket() + '/pytorch/mnist'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "MNIST is a widely used dataset for handwritten digit classification. It consists of 70,000 labeled 28x28 pixel grayscale images of handwritten digits. The dataset is split into 60,000 training images and 10,000 test images."
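 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The next cells download the MNIST files from the public `sagemaker-sample-files` bucket and upload them to the session's default bucket. As an optional sanity check (a minimal sketch, assuming the notebook role can read that public bucket), you can list the objects that the download helper below expects before fetching them:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import boto3\n", "\n", "# Optional check: list the MNIST objects in the public bucket that\n", "# download_from_s3() in the next cell pulls from.\n", "s3 = boto3.client('s3')\n", "response = s3.list_objects_v2(\n", "    Bucket='sagemaker-sample-files',\n", "    Prefix='datasets/image/MNIST/'\n", ")\n", "for obj in response.get('Contents', []):\n", "    print(obj['Key'], obj['Size'])"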
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "import logging\n", "import boto3\n", "from botocore.exceptions import ClientError\n", "\n", "\n", "# Download training and testing data from a public S3 bucket\n", "\n", "\n", "def download_from_s3(data_dir='/tmp/data', train=True):\n", "    \"\"\"Download the MNIST dataset from a public S3 bucket\n", "\n", "    Args:\n", "        data_dir (str): directory to save the data\n", "        train (bool): download the training set (otherwise the test set)\n", "\n", "    Returns:\n", "        None\n", "    \"\"\"\n", "\n", "    # Get global config\n", "#     with open('code/config.json', 'r') as f:\n", "#         CONFIG=json.load(f)\n", "\n", "    CONFIG = {}\n", "    CONFIG['public_bucket'] = \"sagemaker-sample-files\"\n", "\n", "    if not os.path.exists(data_dir):\n", "        os.makedirs(data_dir)\n", "\n", "    if train:\n", "        images_file = \"train-images-idx3-ubyte.gz\"\n", "        labels_file = \"train-labels-idx1-ubyte.gz\"\n", "    else:\n", "        images_file = \"t10k-images-idx3-ubyte.gz\"\n", "        labels_file = \"t10k-labels-idx1-ubyte.gz\"\n", "\n", "    # download objects\n", "    s3 = boto3.client('s3')\n", "    bucket = CONFIG['public_bucket']\n", "    for obj in [images_file, labels_file]:\n", "        key = os.path.join(\"datasets/image/MNIST\", obj)\n", "        dest = os.path.join(data_dir, obj)\n", "        if not os.path.exists(dest):\n", "            s3.download_file(bucket, key, dest)\n", "    return\n", "\n", "\n", "download_from_s3('/tmp/data', True)\n", "download_from_s3('/tmp/data', False)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "prefix = 'mnist'\n", "bucket = sess.default_bucket()\n", "loc = sess.upload_data(path='/tmp/data', bucket=bucket, key_prefix=prefix)\n", "\n", "channels = {\n", "    \"training\": loc,\n", "    \"testing\": loc\n", "}\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "s3://sagemaker-eu-west-1-245582572290/pytorch/mnist\n", "s3://sagemaker-eu-west-1-245582572290/mnist\n", "s3://sagemaker-eu-west-1-245582572290/mnist\n" ] } ], "source": [ "print(output_path)\n", "print(channels.get('training'))\n", "print(channels.get('testing'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Write the PyTorch training script\n", "\n", "The source for the training script is in the cell below. The cell uses the `%%pytorch submit` cell magic to submit the Python code in the cell to a SageMaker PyTorch Estimator.
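\n\nThe values given to `--hyperparameters` on the magic line are forwarded to the script as command-line arguments (and read back with `argparse`), while the `training` and `testing` channels appear inside the container as the `SM_CHANNEL_TRAINING` and `SM_CHANNEL_TESTING` directories. Based on the training logs shown further below, the container therefore invokes the script roughly like this (the temporary file name is generated by the magic):\n\n```\npython <generated-entry-point>.py --backend gloo --batch-size 128 --epochs 20 --learning-rate 1e-3 --log-interval 100\n```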
" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Couldn't call 'get_role' to get Role ARN from role name workshop-sagemaker to get Role path.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "submit:\n", " {\n", " \"channel_testing\": \"s3://sagemaker-eu-west-1-245582572290/mnist\",\n", " \"channel_training\": \"s3://sagemaker-eu-west-1-245582572290/mnist\",\n", " \"enable_sagemaker_metrics\": true,\n", " \"entry_point\": \"/tmp/tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad.py\",\n", " \"estimator_name\": \"___PyTorch_estimator\",\n", " \"framework_version\": \"1.5.0\",\n", " \"hyperparameters\": {\n", " \"backend\": \"gloo\",\n", " \"batch-size\": \"128\",\n", " \"epochs\": \"20\",\n", " \"learning-rate\": \"1e-3\",\n", " \"log-interval\": \"100\"\n", " },\n", " \"instance_count\": 1,\n", " \"instance_type\": \"ml.c4.xlarge\",\n", " \"max_result\": 10,\n", " \"max_wait\": 86400,\n", " \"metric_definitions\": [\n", " {\n", " \"Name\": \"loss\",\n", " \"Regex\": \"Loss: (.*)\"\n", " }\n", " ],\n", " \"name_contains\": \"pytorch\",\n", " \"output_path\": \"s3://sagemaker-eu-west-1-245582572290/pytorch/mnist\",\n", " \"py_version\": \"py3\",\n", " \"role\": \"arn:aws:iam::245582572290:role/workshop-sagemaker\",\n", " \"use_spot_instances\": true\n", "}\n", "{\n", " \"___PyTorch_latest_job_name\": \"pytorch-training-2020-12-17-17-26-21-133\",\n", " \"estimator_variable\": \"___PyTorch_estimator\"\n", "}\n" ] } ], "source": [ "%%pytorch submit --enable_sagemaker_metrics --metric_definitions 'Name: loss, Regex: Loss: (.*)' --use_spot_instances --max_wait 86400 --output_path s3://sagemaker-eu-west-1-245582572290/pytorch/mnist --channel_training s3://sagemaker-eu-west-1-245582572290/mnist --channel_testing s3://sagemaker-eu-west-1-245582572290/mnist --hyperparameters 'batch-size: 128 ,epochs: 20, learning-rate: 1e-3, log-interval: 100, backend: gloo' \n", "# --instance_type ml.g4dn.xlarge --instance_count 2 \n", "# --instance_type ml.c4.xlarge --instance_count 2 \n", "\n", "\n", "import argparse\n", "import gzip\n", "import json\n", "import logging\n", "import os\n", "import sys\n", "\n", "import numpy as np\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import torch.optim as optim\n", "from torch.utils.data import Dataset, DataLoader\n", "import torch.distributed as dist\n", "\n", "\n", "logger = logging.getLogger(__name__)\n", "logger.setLevel(logging.DEBUG)\n", "logger.addHandler(logging.StreamHandler(sys.stdout))\n", "\n", "# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py\n", "class Net(nn.Module):\n", " def __init__(self):\n", " super(Net, self).__init__()\n", " self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n", " self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n", " self.conv2_drop = nn.Dropout2d()\n", " self.fc1 = nn.Linear(320, 50)\n", " self.fc2 = nn.Linear(50, 10)\n", "\n", " def forward(self, x):\n", " x = F.relu(F.max_pool2d(self.conv1(x), 2))\n", " x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))\n", " x = x.view(-1, 320)\n", " x = F.relu(self.fc1(x))\n", " x = F.dropout(x, training=self.training)\n", " x = self.fc2(x)\n", " return F.log_softmax(x, dim=1)\n", "\n", "\n", "# Decode binary data from SM_CHANNEL_TRAINING\n", "# Decode and preprocess data\n", "# Create map dataset\n", "\n", "\n", "def normalize(x, axis):\n", " eps = np.finfo(float).eps\n", " mean = np.mean(x, axis=axis, keepdims=True)\n", " # avoid division 
by zero\n", " std = np.std(x, axis=axis, keepdims=True) + eps\n", " return (x - mean) / std\n", "\n", "def convert_to_tensor(data_dir, images_file, labels_file):\n", " \"\"\"Byte string to torch tensor \n", " \"\"\"\n", " with gzip.open(os.path.join(data_dir, images_file), 'rb') as f:\n", " images = np.frombuffer(f.read(), \n", " np.uint8, offset=16).reshape(-1, 28, 28).astype(np.float32)\n", "\n", " with gzip.open(os.path.join(data_dir, labels_file), 'rb') as f:\n", " labels = np.frombuffer(f.read(), np.uint8, offset=8).astype(\n", " np.int64)\n", " \n", " # normalize the images\n", " images = normalize(images, axis=(1,2))\n", "\n", " # add channel dimension (depth-major)\n", " images = np.expand_dims(images, axis=1)\n", "\n", " # to torch tensor\n", " images = torch.tensor(images, dtype=torch.float32)\n", " labels = torch.tensor(labels, dtype=torch.int64)\n", " return images, labels \n", "\n", " \n", "class MNIST(Dataset):\n", " def __init__(self, data_dir, train=True):\n", "\n", " if train:\n", " images_file=\"train-images-idx3-ubyte.gz\"\n", " labels_file=\"train-labels-idx1-ubyte.gz\"\n", " else:\n", " images_file=\"t10k-images-idx3-ubyte.gz\"\n", " labels_file=\"t10k-labels-idx1-ubyte.gz\"\n", " \n", " self.images, self.labels = convert_to_tensor(\n", " data_dir, images_file, labels_file)\n", " \n", "\n", " def __len__(self):\n", " return len(self.labels)\n", " \n", " def __getitem__(self, idx):\n", " return self.images[idx], self.labels[idx]\n", "\n", "\n", "def train(args):\n", " # Initialize the distributed environment.\n", " if(len(args.hosts)>1):\n", " world_size = len(args.hosts)\n", " os.environ['WORLD_SIZE'] = str(world_size)\n", " host_rank = args.hosts.index(args.current_host)\n", " dist.init_process_group(backend=args.backend, rank=host_rank)\n", " \n", " # GPU, CPU\n", " use_cuda = args.num_gpus > 0\n", " device = torch.device(\"cuda\" if use_cuda > 0 else \"cpu\")\n", "\n", " torch.manual_seed(args.seed)\n", " if use_cuda:\n", " torch.cuda.manual_seed(args.seed)\n", "\n", " train_loader = DataLoader(MNIST(args.train, train=True), \n", " batch_size=args.batch_size, shuffle=True)\n", " test_loader = DataLoader(MNIST(args.test, train=False),\n", " batch_size=args.test_batch_size, shuffle=False)\n", "\n", " net = Net().to(device)\n", " loss_fn = nn.CrossEntropyLoss()\n", " optimizer = optim.Adam(net.parameters(), \n", " betas=(args.beta_1, args.beta_2),\n", " weight_decay=args.weight_decay)\n", "\n", " logger.info(\"Start training ...\") \n", " for epoch in range(1, args.epochs+1):\n", " net.train()\n", " for batch_idx, (imgs, labels) in enumerate(train_loader, 1):\n", " imgs, labels = imgs.to(device), labels.to(device)\n", " output = net(imgs)\n", " loss = loss_fn(output, labels)\n", "\n", " optimizer.zero_grad()\n", " loss.backward()\n", " optimizer.step()\n", " \n", " if batch_idx % args.log_interval == 0:\n", " print('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(\n", " epoch, batch_idx * len(imgs), len(train_loader.sampler),\n", " 100. 
* batch_idx / len(train_loader), loss.item()))\n", " \n", " # test the model\n", " test(net, test_loader, device)\n", "\n", " # save model checkpoint\n", " save_model(net, args.model_dir)\n", " return\n", "\n", "def test(model, test_loader, device):\n", " model.eval()\n", " test_loss = 0\n", " correct = 0\n", " with torch.no_grad():\n", " for imgs, labels in test_loader:\n", " imgs, labels = imgs.to(device), labels.to(device)\n", " output = model(imgs)\n", " test_loss+=F.cross_entropy(output, labels, reduction='sum').item()\n", " \n", " pred = output.max(1, keepdim=True)[1]\n", " correct+=pred.eq(labels.view_as(pred)).sum().item()\n", " \n", " test_loss /= len(test_loader.dataset)\n", " logger.info('Test set: Average loss: {:.4f}, Accuracy: {}/{}, {})\\n'.format(\n", " test_loss, correct, len(test_loader.dataset),\n", " 100.0 * correct / len(test_loader.dataset)\n", " ))\n", " return\n", "\n", "def save_model(model, model_dir):\n", " logger.info('Saving the model')\n", " path = os.path.join(model_dir, 'model.pth')\n", " torch.save(model.cpu().state_dict(), path)\n", " return\n", "\n", "def parse_args():\n", " parser = argparse.ArgumentParser()\n", "\n", " # Data and model checkpoints directories\n", " parser.add_argument('--batch-size', type=int, default=64, metavar='N',\n", " help='input batch size for training (default: 64)')\n", " parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',\n", " help='input batch size for testing (default: 1000)')\n", " parser.add_argument('--epochs', type=int, default=1, metavar='N',\n", " help='number of epochs to train (default: 1)')\n", " parser.add_argument('--learning-rate', type=float, default=0.001, metavar='LR',\n", " help='learning rate (default: 0.01)')\n", " parser.add_argument('--beta_1', type=float, default=0.9, metavar='BETA1',\n", " help='beta1 (default: 0.9)')\n", " parser.add_argument('--beta_2', type=float, default=0.999, metavar='BETA2',\n", " help='beta2 (default: 0.999)')\n", " parser.add_argument('--weight-decay', type=float, default=1e-4, metavar='WD',\n", " help='L2 weight decay (default: 1e-4)')\n", " parser.add_argument('--seed', type=int, default=1, metavar='S',\n", " help='random seed (default: 1)')\n", " parser.add_argument('--log-interval', type=int, default=100, metavar='N',\n", " help='how many batches to wait before logging training status')\n", " parser.add_argument('--backend', type=str, default=None,\n", " help='backend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu)')\n", "\n", " # Container environment\n", " parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))\n", " parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])\n", " parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])\n", " parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])\n", " parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TESTING'])\n", " parser.add_argument('--num-gpus', type=int, default=os.environ['SM_NUM_GPUS'])\n", " \n", " return parser.parse_args()\n", "\n", "\n", "if __name__ == \"__main__\":\n", " args = parse_args()\n", " \n", " train(args)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stop latest traning Job" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"AlgorithmSpecification\": {\n", " \"EnableSageMakerMetricsTimeSeries\": true,\n", " 
\"MetricDefinitions\": [\n", " {\n", " \"Name\": \"loss\",\n", " \"Regex\": \"Loss: (.*)\"\n", " }\n", " ],\n", " \"TrainingImage\": \"763104351884.dkr.ecr.eu-west-1.amazonaws.com/pytorch-training:1.5.0-cpu-py3\",\n", " \"TrainingInputMode\": \"File\"\n", " },\n", " \"CreationTime\": \"2020-12-17 17:26:09.297000+00:00\",\n", " \"DebugHookConfig\": {\n", " \"CollectionConfigurations\": [],\n", " \"S3OutputPath\": \"s3://sagemaker-eu-west-1-245582572290/pytorch/mnist\"\n", " },\n", " \"EnableInterContainerTrafficEncryption\": false,\n", " \"EnableManagedSpotTraining\": true,\n", " \"EnableNetworkIsolation\": false,\n", " \"HyperParameters\": {\n", " \"backend\": \"\\\"gloo\\\"\",\n", " \"batch-size\": \"\\\"128\\\"\",\n", " \"epochs\": \"\\\"20\\\"\",\n", " \"learning-rate\": \"\\\"1e-3\\\"\",\n", " \"log-interval\": \"\\\"100\\\"\",\n", " \"sagemaker_container_log_level\": \"20\",\n", " \"sagemaker_job_name\": \"\\\"pytorch-training-2020-12-17-17-26-08-931\\\"\",\n", " \"sagemaker_program\": \"\\\"tmp-37d932cb-4395-4e37-880d-59eef8acf259.py\\\"\",\n", " \"sagemaker_region\": \"\\\"eu-west-1\\\"\",\n", " \"sagemaker_submit_directory\": \"\\\"s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-17-17-26-08-931/source/sourcedir.tar.gz\\\"\"\n", " },\n", " \"InputDataConfig\": [\n", " {\n", " \"ChannelName\": \"training\",\n", " \"CompressionType\": \"None\",\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"S3DataDistributionType\": \"FullyReplicated\",\n", " \"S3DataType\": \"S3Prefix\",\n", " \"S3Uri\": \"s3://sagemaker-eu-west-1-245582572290/mnist\"\n", " }\n", " },\n", " \"RecordWrapperType\": \"None\"\n", " },\n", " {\n", " \"ChannelName\": \"testing\",\n", " \"CompressionType\": \"None\",\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"S3DataDistributionType\": \"FullyReplicated\",\n", " \"S3DataType\": \"S3Prefix\",\n", " \"S3Uri\": \"s3://sagemaker-eu-west-1-245582572290/mnist\"\n", " }\n", " },\n", " \"RecordWrapperType\": \"None\"\n", " }\n", " ],\n", " \"LastModifiedTime\": \"2020-12-17 17:26:15.642000+00:00\",\n", " \"OutputDataConfig\": {\n", " \"KmsKeyId\": \"\",\n", " \"S3OutputPath\": \"s3://sagemaker-eu-west-1-245582572290/pytorch/mnist\"\n", " },\n", " \"ProfilerConfig\": {\n", " \"ProfilingIntervalInMilliseconds\": 500,\n", " \"S3OutputPath\": \"s3://sagemaker-eu-west-1-245582572290/pytorch/mnist\"\n", " },\n", " \"ProfilerRuleConfigurations\": [\n", " {\n", " \"RuleConfigurationName\": \"ProfilerReport-1608225968\",\n", " \"RuleEvaluatorImage\": \"929884845733.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-debugger-rules:latest\",\n", " \"RuleParameters\": {\n", " \"rule_to_invoke\": \"ProfilerReport\"\n", " },\n", " \"VolumeSizeInGB\": 0\n", " }\n", " ],\n", " \"ProfilerRuleEvaluationStatuses\": [\n", " {\n", " \"LastModifiedTime\": \"2020-12-17 17:26:10.502000+00:00\",\n", " \"RuleConfigurationName\": \"ProfilerReport-1608225968\",\n", " \"RuleEvaluationStatus\": \"InProgress\"\n", " }\n", " ],\n", " \"ProfilingStatus\": \"Enabled\",\n", " \"ResourceConfig\": {\n", " \"InstanceCount\": 1,\n", " \"InstanceType\": \"ml.c4.xlarge\",\n", " \"VolumeSizeInGB\": 30\n", " },\n", " \"ResponseMetadata\": {\n", " \"HTTPHeaders\": {\n", " \"content-length\": \"3225\",\n", " \"content-type\": \"application/x-amz-json-1.1\",\n", " \"date\": \"Thu, 17 Dec 2020 17:26:14 GMT\",\n", " \"x-amzn-requestid\": \"d732e136-2eb3-4d01-8163-06316e305dd8\"\n", " },\n", " \"HTTPStatusCode\": 200,\n", " \"RequestId\": \"d732e136-2eb3-4d01-8163-06316e305dd8\",\n", " 
\"RetryAttempts\": 0\n", " },\n", " \"RoleArn\": \"arn:aws:iam::245582572290:role/workshop-sagemaker\",\n", " \"SecondaryStatus\": \"Starting\",\n", " \"SecondaryStatusTransitions\": [\n", " {\n", " \"StartTime\": \"2020-12-17 17:26:09.297000+00:00\",\n", " \"Status\": \"Starting\",\n", " \"StatusMessage\": \"Launching requested ML instances\"\n", " }\n", " ],\n", " \"StoppingCondition\": {\n", " \"MaxRuntimeInSeconds\": 86400,\n", " \"MaxWaitTimeInSeconds\": 86400\n", " },\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-17-17-26-08-931\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-17-17-26-08-931\",\n", " \"TrainingJobStatus\": \"Stopping\"\n", "}\n" ] } ], "source": [ "%pytorch delete" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Describe latest traning Job" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"AlgorithmSpecification\": {\n", " \"EnableSageMakerMetricsTimeSeries\": true,\n", " \"MetricDefinitions\": [\n", " {\n", " \"Name\": \"loss\",\n", " \"Regex\": \"Loss: (.*)\"\n", " }\n", " ],\n", " \"TrainingImage\": \"763104351884.dkr.ecr.eu-west-1.amazonaws.com/pytorch-training:1.5.0-cpu-py3\",\n", " \"TrainingInputMode\": \"File\"\n", " },\n", " \"BillableTimeInSeconds\": 107,\n", " \"CreationTime\": \"2020-12-17 17:26:21.532000+00:00\",\n", " \"DebugHookConfig\": {\n", " \"CollectionConfigurations\": [],\n", " \"S3OutputPath\": \"s3://sagemaker-eu-west-1-245582572290/pytorch/mnist\"\n", " },\n", " \"EnableInterContainerTrafficEncryption\": false,\n", " \"EnableManagedSpotTraining\": true,\n", " \"EnableNetworkIsolation\": false,\n", " \"FinalMetricDataList\": [\n", " {\n", " \"MetricName\": \"loss\",\n", " \"Timestamp\": \"1970-01-19 14:43:46.439000+00:00\",\n", " \"Value\": 0.10897199809551239\n", " }\n", " ],\n", " \"HyperParameters\": {\n", " \"backend\": \"\\\"gloo\\\"\",\n", " \"batch-size\": \"\\\"128\\\"\",\n", " \"epochs\": \"\\\"20\\\"\",\n", " \"learning-rate\": \"\\\"1e-3\\\"\",\n", " \"log-interval\": \"\\\"100\\\"\",\n", " \"sagemaker_container_log_level\": \"20\",\n", " \"sagemaker_job_name\": \"\\\"pytorch-training-2020-12-17-17-26-21-133\\\"\",\n", " \"sagemaker_program\": \"\\\"tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad.py\\\"\",\n", " \"sagemaker_region\": \"\\\"eu-west-1\\\"\",\n", " \"sagemaker_submit_directory\": \"\\\"s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-17-17-26-21-133/source/sourcedir.tar.gz\\\"\"\n", " },\n", " \"InputDataConfig\": [\n", " {\n", " \"ChannelName\": \"training\",\n", " \"CompressionType\": \"None\",\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"S3DataDistributionType\": \"FullyReplicated\",\n", " \"S3DataType\": \"S3Prefix\",\n", " \"S3Uri\": \"s3://sagemaker-eu-west-1-245582572290/mnist\"\n", " }\n", " },\n", " \"RecordWrapperType\": \"None\"\n", " },\n", " {\n", " \"ChannelName\": \"testing\",\n", " \"CompressionType\": \"None\",\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"S3DataDistributionType\": \"FullyReplicated\",\n", " \"S3DataType\": \"S3Prefix\",\n", " \"S3Uri\": \"s3://sagemaker-eu-west-1-245582572290/mnist\"\n", " }\n", " },\n", " \"RecordWrapperType\": \"None\"\n", " }\n", " ],\n", " \"LastModifiedTime\": \"2020-12-17 17:35:10.892000+00:00\",\n", " \"ModelArtifacts\": {\n", " \"S3ModelArtifacts\": 
\"s3://sagemaker-eu-west-1-245582572290/pytorch/mnist/pytorch-training-2020-12-17-17-26-21-133/output/model.tar.gz\"\n", " },\n", " \"OutputDataConfig\": {\n", " \"KmsKeyId\": \"\",\n", " \"S3OutputPath\": \"s3://sagemaker-eu-west-1-245582572290/pytorch/mnist\"\n", " },\n", " \"ProfilerConfig\": {\n", " \"ProfilingIntervalInMilliseconds\": 500,\n", " \"S3OutputPath\": \"s3://sagemaker-eu-west-1-245582572290/pytorch/mnist\"\n", " },\n", " \"ProfilerRuleConfigurations\": [\n", " {\n", " \"RuleConfigurationName\": \"ProfilerReport-1608225981\",\n", " \"RuleEvaluatorImage\": \"929884845733.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-debugger-rules:latest\",\n", " \"RuleParameters\": {\n", " \"rule_to_invoke\": \"ProfilerReport\"\n", " },\n", " \"VolumeSizeInGB\": 0\n", " }\n", " ],\n", " \"ProfilerRuleEvaluationStatuses\": [\n", " {\n", " \"LastModifiedTime\": \"2020-12-17 17:35:10.879000+00:00\",\n", " \"RuleConfigurationName\": \"ProfilerReport-1608225981\",\n", " \"RuleEvaluationJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:processing-job/pytorch-training-2020-12-1-profilerreport-1608225981-552d153e\",\n", " \"RuleEvaluationStatus\": \"NoIssuesFound\"\n", " }\n", " ],\n", " \"ProfilingStatus\": \"Enabled\",\n", " \"ResourceConfig\": {\n", " \"InstanceCount\": 1,\n", " \"InstanceType\": \"ml.c4.xlarge\",\n", " \"VolumeSizeInGB\": 30\n", " },\n", " \"ResponseMetadata\": {\n", " \"HTTPHeaders\": {\n", " \"content-length\": \"4283\",\n", " \"content-type\": \"application/x-amz-json-1.1\",\n", " \"date\": \"Thu, 17 Dec 2020 17:37:33 GMT\",\n", " \"x-amzn-requestid\": \"fd9372ee-dfc6-4d5e-bc62-b4ad50fe0f7d\"\n", " },\n", " \"HTTPStatusCode\": 200,\n", " \"RequestId\": \"fd9372ee-dfc6-4d5e-bc62-b4ad50fe0f7d\",\n", " \"RetryAttempts\": 0\n", " },\n", " \"RoleArn\": \"arn:aws:iam::245582572290:role/workshop-sagemaker\",\n", " \"SecondaryStatus\": \"Completed\",\n", " \"SecondaryStatusTransitions\": [\n", " {\n", " \"EndTime\": \"2020-12-17 17:28:48.623000+00:00\",\n", " \"StartTime\": \"2020-12-17 17:26:21.532000+00:00\",\n", " \"Status\": \"Starting\",\n", " \"StatusMessage\": \"Preparing the instances for training\"\n", " },\n", " {\n", " \"EndTime\": \"2020-12-17 17:29:25.182000+00:00\",\n", " \"StartTime\": \"2020-12-17 17:28:48.623000+00:00\",\n", " \"Status\": \"Downloading\",\n", " \"StatusMessage\": \"Downloading input data\"\n", " },\n", " {\n", " \"EndTime\": \"2020-12-17 17:34:50.431000+00:00\",\n", " \"StartTime\": \"2020-12-17 17:29:25.182000+00:00\",\n", " \"Status\": \"Training\",\n", " \"StatusMessage\": \"Training image download completed. 
Training in progress.\"\n", " },\n", " {\n", " \"EndTime\": \"2020-12-17 17:34:58.661000+00:00\",\n", " \"StartTime\": \"2020-12-17 17:34:50.431000+00:00\",\n", " \"Status\": \"Uploading\",\n", " \"StatusMessage\": \"Uploading generated training model\"\n", " },\n", " {\n", " \"EndTime\": \"2020-12-17 17:34:58.661000+00:00\",\n", " \"StartTime\": \"2020-12-17 17:34:58.661000+00:00\",\n", " \"Status\": \"Completed\",\n", " \"StatusMessage\": \"Training job completed\"\n", " }\n", " ],\n", " \"StoppingCondition\": {\n", " \"MaxRuntimeInSeconds\": 86400,\n", " \"MaxWaitTimeInSeconds\": 86400\n", " },\n", " \"TrainingEndTime\": \"2020-12-17 17:34:58.661000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-17-17-26-21-133\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-17-17-26-21-133\",\n", " \"TrainingJobStatus\": \"Completed\",\n", " \"TrainingStartTime\": \"2020-12-17 17:28:48.623000+00:00\",\n", " \"TrainingTimeInSeconds\": 370\n", "}\n" ] } ], "source": [ "%pytorch status" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Show logs for latest traning Job" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-12-17 17:30:27 Starting - Preparing the instances for training\n", "2020-12-17 17:30:27 Downloading - Downloading input data\n", "2020-12-17 17:30:27 Training - Training image download completed. Training in progress.\u001b[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device\u001b[0m\n", "\u001b[34mbash: no job control in this shell\u001b[0m\n", "\u001b[34m2020-12-17 17:29:46,602 sagemaker-containers INFO Imported framework sagemaker_pytorch_container.training\u001b[0m\n", "\u001b[34m2020-12-17 17:29:46,605 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34m2020-12-17 17:29:46,618 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.\u001b[0m\n", "\u001b[34m2020-12-17 17:29:49,653 sagemaker_pytorch_container.training INFO Invoking user training script.\u001b[0m\n", "\u001b[34m2020-12-17 17:29:49,959 sagemaker-containers INFO Module default_user_module_name does not provide a setup.py. \u001b[0m\n", "\u001b[34mGenerating setup.py\u001b[0m\n", "\u001b[34m2020-12-17 17:29:49,959 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", "\u001b[34m2020-12-17 17:29:49,959 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", "\u001b[34m2020-12-17 17:29:49,959 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", "\u001b[34m/opt/conda/bin/python -m pip install . 
\u001b[0m\n", "\u001b[34mProcessing /tmp/tmpq4_ly84o/module_dir\u001b[0m\n", "\u001b[34mBuilding wheels for collected packages: default-user-module-name\n", " Building wheel for default-user-module-name (setup.py): started\n", " Building wheel for default-user-module-name (setup.py): finished with status 'done'\n", " Created wheel for default-user-module-name: filename=default_user_module_name-1.0.0-py2.py3-none-any.whl size=8553 sha256=c6f2d39864763d0b20856afd721f2c8aac558e337ac7457bc049438d27c7c477\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-oibqo0zt/wheels/36/0a/f5/fb80b83ca40bce38e01cfea117b06f01a866ea9625eccf9f9c\u001b[0m\n", "\u001b[34mSuccessfully built default-user-module-name\u001b[0m\n", "\u001b[34mInstalling collected packages: default-user-module-name\u001b[0m\n", "\u001b[34mSuccessfully installed default-user-module-name-1.0.0\u001b[0m\n", "\u001b[34mWARNING: You are using pip version 20.1; however, version 20.3.3 is available.\u001b[0m\n", "\u001b[34mYou should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n", "\u001b[34m2020-12-17 17:29:52,358 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34m2020-12-17 17:29:52,372 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34m2020-12-17 17:29:52,388 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34m2020-12-17 17:29:52,400 sagemaker-containers INFO Invoking user script\n", "\u001b[0m\n", "\u001b[34mTraining Env:\n", "\u001b[0m\n", "\u001b[34m{\n", " \"additional_framework_parameters\": {},\n", " \"channel_input_dirs\": {\n", " \"testing\": \"/opt/ml/input/data/testing\",\n", " \"training\": \"/opt/ml/input/data/training\"\n", " },\n", " \"current_host\": \"algo-1\",\n", " \"framework_module\": \"sagemaker_pytorch_container.training:main\",\n", " \"hosts\": [\n", " \"algo-1\"\n", " ],\n", " \"hyperparameters\": {\n", " \"batch-size\": \"128\",\n", " \"log-interval\": \"100\",\n", " \"learning-rate\": \"1e-3\",\n", " \"backend\": \"gloo\",\n", " \"epochs\": \"20\"\n", " },\n", " \"input_config_dir\": \"/opt/ml/input/config\",\n", " \"input_data_config\": {\n", " \"testing\": {\n", " \"TrainingInputMode\": \"File\",\n", " \"S3DistributionType\": \"FullyReplicated\",\n", " \"RecordWrapperType\": \"None\"\n", " },\n", " \"training\": {\n", " \"TrainingInputMode\": \"File\",\n", " \"S3DistributionType\": \"FullyReplicated\",\n", " \"RecordWrapperType\": \"None\"\n", " }\n", " },\n", " \"input_dir\": \"/opt/ml/input\",\n", " \"is_master\": true,\n", " \"job_name\": \"pytorch-training-2020-12-17-17-26-21-133\",\n", " \"log_level\": 20,\n", " \"master_hostname\": \"algo-1\",\n", " \"model_dir\": \"/opt/ml/model\",\n", " \"module_dir\": \"s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-17-17-26-21-133/source/sourcedir.tar.gz\",\n", " \"module_name\": \"tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad\",\n", " \"network_interface_name\": \"eth0\",\n", " \"num_cpus\": 4,\n", " \"num_gpus\": 0,\n", " \"output_data_dir\": \"/opt/ml/output/data\",\n", " \"output_dir\": \"/opt/ml/output\",\n", " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", " \"resource_config\": {\n", " \"current_host\": \"algo-1\",\n", " \"hosts\": [\n", " \"algo-1\"\n", " ],\n", " \"network_interface_name\": \"eth0\"\n", " },\n", " \"user_entry_point\": \"tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad.py\"\u001b[0m\n", "\u001b[34m}\n", "\u001b[0m\n", 
"\u001b[34mEnvironment variables:\n", "\u001b[0m\n", "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", "\u001b[34mSM_HPS={\"backend\":\"gloo\",\"batch-size\":\"128\",\"epochs\":\"20\",\"learning-rate\":\"1e-3\",\"log-interval\":\"100\"}\u001b[0m\n", "\u001b[34mSM_USER_ENTRY_POINT=tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad.py\u001b[0m\n", "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", "\u001b[34mSM_INPUT_DATA_CONFIG={\"testing\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"training\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", "\u001b[34mSM_CHANNELS=[\"testing\",\"training\"]\u001b[0m\n", "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", "\u001b[34mSM_MODULE_NAME=tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad\u001b[0m\n", "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main\u001b[0m\n", "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", "\u001b[34mSM_MODULE_DIR=s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-17-17-26-21-133/source/sourcedir.tar.gz\u001b[0m\n", "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"testing\":\"/opt/ml/input/data/testing\",\"training\":\"/opt/ml/input/data/training\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_pytorch_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"backend\":\"gloo\",\"batch-size\":\"128\",\"epochs\":\"20\",\"learning-rate\":\"1e-3\",\"log-interval\":\"100\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"testing\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"training\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"pytorch-training-2020-12-17-17-26-21-133\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-17-17-26-21-133/source/sourcedir.tar.gz\",\"module_name\":\"tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad.py\"}\u001b[0m\n", "\u001b[34mSM_USER_ARGS=[\"--backend\",\"gloo\",\"--batch-size\",\"128\",\"--epochs\",\"20\",\"--learning-rate\",\"1e-3\",\"--log-interval\",\"100\"]\u001b[0m\n", "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", "\u001b[34mSM_CHANNEL_TESTING=/opt/ml/input/data/testing\u001b[0m\n", 
"\u001b[34mSM_CHANNEL_TRAINING=/opt/ml/input/data/training\u001b[0m\n", "\u001b[34mSM_HP_BATCH-SIZE=128\u001b[0m\n", "\u001b[34mSM_HP_LOG-INTERVAL=100\u001b[0m\n", "\u001b[34mSM_HP_LEARNING-RATE=1e-3\u001b[0m\n", "\u001b[34mSM_HP_BACKEND=gloo\u001b[0m\n", "\u001b[34mSM_HP_EPOCHS=20\u001b[0m\n", "\u001b[34mPYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages\n", "\u001b[0m\n", "\u001b[34mInvoking script with the following command:\n", "\u001b[0m\n", "\u001b[34m/opt/conda/bin/python tmp-497a920d-e287-4f8e-ba53-251c4ffb80ad.py --backend gloo --batch-size 128 --epochs 20 --learning-rate 1e-3 --log-interval 100\n", "\n", "\u001b[0m\n", "\u001b[34mStart training ...\u001b[0m\n", "\u001b[34m[2020-12-17 17:29:55.455 algo-1:44 INFO json_config.py:90] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", "\u001b[34m[2020-12-17 17:29:55.456 algo-1:44 INFO hook.py:183] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", "\u001b[34m[2020-12-17 17:29:55.456 algo-1:44 INFO hook.py:228] Saving to /opt/ml/output/tensors\u001b[0m\n", "\u001b[34m[2020-12-17 17:29:55.457 algo-1:44 INFO hook.py:364] Monitoring the collections: losses\u001b[0m\n", "\u001b[34m[2020-12-17 17:29:55.457 algo-1:44 INFO hook.py:422] Hook is writing from the hook with pid: 44\n", "\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [12800/60000 (21%)] Loss: 0.571117\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [25600/60000 (43%)] Loss: 0.435707\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [38400/60000 (64%)] Loss: 0.278377\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [51200/60000 (85%)] Loss: 0.247071\u001b[0m\n", "\u001b[34mTest set: Average loss: 0.1151, Accuracy: 9642/10000, 96.42)\n", "\u001b[0m\n", "\u001b[34mTrain Epoch: 2 [12800/60000 (21%)] Loss: 0.245921\u001b[0m\n", "\u001b[34mTrain Epoch: 2 [25600/60000 (43%)] Loss: 0.197355\u001b[0m\n", "\u001b[34mTrain Epoch: 2 [38400/60000 (64%)] Loss: 0.285320\u001b[0m\n", "\u001b[34mTrain Epoch: 2 [51200/60000 (85%)] Loss: 0.146226\u001b[0m\n", "\u001b[34mTest set: Average loss: 0.0750, Accuracy: 9772/10000, 97.72)\n", "\u001b[0m\n", "\u001b[34mTrain Epoch: 3 [12800/60000 (21%)] Loss: 0.221168\u001b[0m\n", "\u001b[34mTrain Epoch: 3 [25600/60000 (43%)] Loss: 0.365144\u001b[0m\n", "\u001b[34mTrain Epoch: 3 [38400/60000 (64%)] Loss: 0.299798\u001b[0m\n", "\u001b[34mTrain Epoch: 3 [51200/60000 (85%)] Loss: 0.162260\u001b[0m\n", "null\n" ] } ], "source": [ "%pytorch logs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## List traning jobs" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"NextToken\": \"cIws2QhTXUIa8bi8X9aU7gCAR0Xdc3x9L/Ofg4vsVMTtcNqRqLcpBqE42+cDc29TFQi5WMntyYF8Dtfi7hilXAF3S3jOJ0DmOuxvXC7MuU1Q6+20eQKMbbovB90pwL5DPnINepnlLEmFhvO87tIVNZR4vTy3ef5rgF6dqbA0VVq0m92q6y2SOofBaZP49sRdVnJtTcTQaS5EqeYVIuH5KxlQ5w0j5RQq6GZMUD+Yb2yDOCsvqfy1owdkaN5KZ4FGRNtmDYtMraMQRqc2qtXJqyeGVeTI2ay1LmtZUOEhRDz3vRe0Pt1v0C4FDO31aVce2uKbzUS+TI0u9eGnRVRpTlPz1+OKucA7cCkpc6121h+TvwCWzUFRylDK5bA9jdGxQ5WqvdHLF3P4RVNx8ltlG36ht76Wv1twVDcPQ1wZIMTfWInXZp1IY5V31hwN0wFMHIJj86VYFo1G/D59pwDhXnd9Iyj0APFLa9mhymUH3TrC9JytAPYKP8yt92QVOBs+CVElLY9l+EVZF26L+spMwJMg9NPWqPQ8T+uwHRkAsKxOLnlc6qKLB6EMy7Q8ZrrNUHxUJwbNLQ==\",\n", " \"ResponseMetadata\": {\n", " \"HTTPHeaders\": {\n", " \"content-length\": \"3674\",\n", " \"content-type\": 
\"application/x-amz-json-1.1\",\n", " \"date\": \"Thu, 17 Dec 2020 17:26:39 GMT\",\n", " \"x-amzn-requestid\": \"69a07fc8-5268-4436-a7ee-4ae1a16a3ba8\"\n", " },\n", " \"HTTPStatusCode\": 200,\n", " \"RequestId\": \"69a07fc8-5268-4436-a7ee-4ae1a16a3ba8\",\n", " \"RetryAttempts\": 0\n", " },\n", " \"TrainingJobSummaries\": [\n", " {\n", " \"CreationTime\": \"2020-12-17 17:26:21.532000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-17 17:26:23.319000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-17-17-26-21-133\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-17-17-26-21-133\",\n", " \"TrainingJobStatus\": \"InProgress\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-17 17:26:09.297000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-17 17:26:32.838000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-17-17-26-08-931\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-17-17-26-08-931\",\n", " \"TrainingJobStatus\": \"Stopping\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-10 22:19:13.571000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10 22:26:42.835000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10 22:26:22.416000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-22-19-13-229\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-22-19-13-229\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-10 21:57:52.728000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10 22:05:31.222000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10 22:05:05.448000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-21-57-52-405\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-21-57-52-405\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-10 21:27:46.539000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10 21:32:33.233000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10 21:32:15.664000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-21-27-46-215\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-21-27-46-215\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-10 19:40:15.151000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10 19:44:41.634000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10 19:44:29.995000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-19-40-14-827\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-19-40-14-827\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-10 19:29:48.055000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10 19:34:34.433000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10 19:33:57.428000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-19-29-47-727\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-19-29-47-727\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-10 19:10:22.450000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10 19:18:51.462000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10 
19:18:29.298000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-19-10-22-080\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-19-10-22-080\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-10 16:31:31.233000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10 16:40:00.265000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10 16:39:31.170000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-16-31-30-948\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-16-31-30-948\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"CreationTime\": \"2020-12-10 16:25:52.271000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10 16:33:27.370000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10 16:29:45.916000+00:00\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-16-25-51-909\",\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-16-25-51-909\",\n", " \"TrainingJobStatus\": \"Stopped\"\n", " }\n", " ]\n", "}\n" ] } ], "source": [ "%pytorch list " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Use estimator variable" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'pytorch-training-2020-12-17-17-26-21-133'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "___PyTorch_estimator.training_job_analytics.name" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampmetric_namevalue
00.0loss0.214329
160.0loss0.141576
2120.0loss0.123643
3180.0loss0.117714
4240.0loss0.108972
\n", "
" ], "text/plain": [ " timestamp metric_name value\n", "0 0.0 loss 0.214329\n", "1 60.0 loss 0.141576\n", "2 120.0 loss 0.123643\n", "3 180.0 loss 0.117714\n", "4 240.0 loss 0.108972" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "___PyTorch_estimator.training_job_analytics.dataframe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Notebook metainformation" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{\"AppType\":\"KernelGateway\",\"DomainId\":\"d-yu5msju0ejog\",\"UserProfileName\":\"lblokhin-custom\",\"ResourceArn\":\"arn:aws:sagemaker:eu-west-1:245582572290:app/d-yu5msju0ejog/lblokhin-custom/KernelGateway/lblokhin-ml-t3-medium-879b63d5fe00d0d11cf7ff2a5992\",\"ResourceName\":\"lblokhin-ml-t3-medium-879b63d5fe00d0d11cf7ff2a5992\",\"AppImageVersion\":\"\"}" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%less /opt/ml/metadata/resource-metadata.json" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'LANGUAGE': 'en_US.UTF-8',\n", " 'REGION_NAME': 'eu-west-1',\n", " 'HOSTNAME': 'lblokhin-ml-t3-medium-411fee9dd41a1713fdc0a5c3fa84',\n", " 'HOME': '/home/jovyan',\n", " 'CONDA_VERSION': '4.9.0',\n", " 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/_sagemaker-instance-credentials/6ad65d45a08640ebd438a0158820c7dac41b815a19816ea80cc6f0756a6c2e09',\n", " 'NB_USER': 'jovyan',\n", " 'AWS_DEFAULT_REGION': 'eu-west-1',\n", " 'PATH': '/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tmp/miniconda3/condabin:/tmp/anaconda3/condabin:/tmp/miniconda2/condabin:/tmp/anaconda2/condabin',\n", " 'NB_GID': '100',\n", " 'LANG': 'en_US.UTF-8',\n", " 'AWS_ACCOUNT_ID': '245582572290',\n", " 'DEBIAN_FRONTEND': 'noninteractive',\n", " 'SHELL': '/bin/bash',\n", " 'AWS_REGION': 'eu-west-1',\n", " 'AWS_INTERNAL_IMAGE_OWNER': 'Custom',\n", " 'CONDA_DIR': '/opt/.sagemakerinternal/conda',\n", " 'LC_ALL': 'en_US.UTF-8',\n", " 'PWD': '/home/jovyan/work',\n", " 'SAGEMAKER_LOG_FILE': '/var/log/studio/kernel_gateway.log',\n", " 'NB_UID': '1000',\n", " 'JUPYTER_PATH': '/opt/conda/share/jupyter/',\n", " 'KERNEL_LAUNCH_TIMEOUT': '40',\n", " 'KERNEL_WORKING_PATH': 'Demo',\n", " 'KERNEL_GATEWAY': '1',\n", " 'JPY_PARENT_PID': '7',\n", " 'TERM': 'xterm-color',\n", " 'CLICOLOR': '1',\n", " 'PAGER': 'cat',\n", " 'GIT_PAGER': 'cat',\n", " 'MPLBACKEND': 'module://ipykernel.pylab.backend_inline'}" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%env" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "SageMakerMagic (lblokhin/26)", "language": "python", "name": "sm__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:245582572290:image-version/lblokhin/26" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "sm_kernel", "pygments_lexer": "python" } }, "nbformat": 4, "nbformat_minor": 4 }