{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### SageMaker session\n", "https://sagemaker.readthedocs.io/en/stable/api/utility/session.html" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'TrainingJobName': 'pytorch-training-2020-12-03-11-39-24-085',\n", " 'TrainingJobArn': 'arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-11-39-24-085',\n", " 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-eu-west-1-245582572290/mnist/pytorch-training-2020-12-03-11-39-24-085/output/model.tar.gz'},\n", " 'TrainingJobStatus': 'Completed',\n", " 'SecondaryStatus': 'Completed',\n", " 'HyperParameters': {'sagemaker_container_log_level': '20',\n", " 'sagemaker_job_name': '\"pytorch-training-2020-12-03-11-39-24-085\"',\n", " 'sagemaker_program': '\"tmp-c267a390-383f-46fc-8427-37e62ab98d22.py\"',\n", " 'sagemaker_region': '\"eu-west-1\"',\n", " 'sagemaker_submit_directory': '\"s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-03-11-39-24-085/source/sourcedir.tar.gz\"'},\n", " 'AlgorithmSpecification': {'TrainingImage': '763104351884.dkr.ecr.eu-west-1.amazonaws.com/pytorch-training:1.5.0-cpu-py3',\n", " 'TrainingInputMode': 'File',\n", " 'EnableSageMakerMetricsTimeSeries': True},\n", " 'RoleArn': 'arn:aws:iam::245582572290:role/workshop-sagemaker',\n", " 'InputDataConfig': [{'ChannelName': 'training',\n", " 'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',\n", " 'S3Uri': 's3://sagemaker-eu-west-1-245582572290/mnist',\n", " 'S3DataDistributionType': 'FullyReplicated'}},\n", " 'CompressionType': 'None',\n", " 'RecordWrapperType': 'None'},\n", " {'ChannelName': 'testing',\n", " 'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',\n", " 'S3Uri': 's3://sagemaker-eu-west-1-245582572290/mnist',\n", " 'S3DataDistributionType': 'FullyReplicated'}},\n", " 'CompressionType': 'None',\n", " 'RecordWrapperType': 'None'}],\n", " 'OutputDataConfig': 
{'KmsKeyId': '',\n", " 'S3OutputPath': 's3://sagemaker-eu-west-1-245582572290/mnist'},\n", " 'ResourceConfig': {'InstanceType': 'ml.c4.xlarge',\n", " 'InstanceCount': 1,\n", " 'VolumeSizeInGB': 30},\n", " 'StoppingCondition': {'MaxRuntimeInSeconds': 86400},\n", " 'CreationTime': datetime.datetime(2020, 12, 3, 11, 39, 24, 348000, tzinfo=tzlocal()),\n", " 'TrainingStartTime': datetime.datetime(2020, 12, 3, 11, 41, 36, 107000, tzinfo=tzlocal()),\n", " 'TrainingEndTime': datetime.datetime(2020, 12, 3, 11, 43, 10, 797000, tzinfo=tzlocal()),\n", " 'LastModifiedTime': datetime.datetime(2020, 12, 3, 11, 43, 10, 797000, tzinfo=tzlocal()),\n", " 'SecondaryStatusTransitions': [{'Status': 'Starting',\n", " 'StartTime': datetime.datetime(2020, 12, 3, 11, 39, 24, 348000, tzinfo=tzlocal()),\n", " 'EndTime': datetime.datetime(2020, 12, 3, 11, 41, 36, 107000, tzinfo=tzlocal()),\n", " 'StatusMessage': 'Preparing the instances for training'},\n", " {'Status': 'Downloading',\n", " 'StartTime': datetime.datetime(2020, 12, 3, 11, 41, 36, 107000, tzinfo=tzlocal()),\n", " 'EndTime': datetime.datetime(2020, 12, 3, 11, 42, 23, 361000, tzinfo=tzlocal()),\n", " 'StatusMessage': 'Downloading input data'},\n", " {'Status': 'Training',\n", " 'StartTime': datetime.datetime(2020, 12, 3, 11, 42, 23, 361000, tzinfo=tzlocal()),\n", " 'EndTime': datetime.datetime(2020, 12, 3, 11, 43, 3, 226000, tzinfo=tzlocal()),\n", " 'StatusMessage': 'Training image download completed. 
Training in progress.'},\n", " {'Status': 'Uploading',\n", " 'StartTime': datetime.datetime(2020, 12, 3, 11, 43, 3, 226000, tzinfo=tzlocal()),\n", " 'EndTime': datetime.datetime(2020, 12, 3, 11, 43, 10, 797000, tzinfo=tzlocal()),\n", " 'StatusMessage': 'Uploading generated training model'},\n", " {'Status': 'Completed',\n", " 'StartTime': datetime.datetime(2020, 12, 3, 11, 43, 10, 797000, tzinfo=tzlocal()),\n", " 'EndTime': datetime.datetime(2020, 12, 3, 11, 43, 10, 797000, tzinfo=tzlocal()),\n", " 'StatusMessage': 'Training job completed'}],\n", " 'EnableNetworkIsolation': False,\n", " 'EnableInterContainerTrafficEncryption': False,\n", " 'EnableManagedSpotTraining': False,\n", " 'TrainingTimeInSeconds': 94,\n", " 'BillableTimeInSeconds': 94,\n", " 'DebugHookConfig': {'S3OutputPath': 's3://sagemaker-eu-west-1-245582572290/mnist',\n", " 'CollectionConfigurations': []},\n", " 'ResponseMetadata': {'RequestId': '6be845d4-540c-4a2f-a200-11534c43e069',\n", " 'HTTPStatusCode': 200,\n", " 'HTTPHeaders': {'x-amzn-requestid': '6be845d4-540c-4a2f-a200-11534c43e069',\n", " 'content-type': 'application/x-amz-json-1.1',\n", " 'content-length': '3261',\n", " 'date': 'Thu, 03 Dec 2020 11:55:53 GMT'},\n", " 'RetryAttempts': 0}}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import sagemaker\n", "\n", "\n", "sm_sess = sagemaker.Session()\n", "sm_sess.describe_training_job(\"pytorch-training-2020-12-03-11-39-24-085\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-12-03 11:43:10 Starting - Preparing the instances for training\n", "2020-12-03 11:43:10 Downloading - Downloading input data\n", "2020-12-03 11:43:10 Training - Training image download completed. 
Training in progress.\n", "2020-12-03 11:43:10 Uploading - Uploading generated training model\n", "2020-12-03 11:43:10 Completed - Training job completed\u001b[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device\u001b[0m\n", "\u001b[34mbash: no job control in this shell\u001b[0m\n", "\u001b[34m2020-12-03 11:42:38,903 sagemaker-containers INFO Imported framework sagemaker_pytorch_container.training\u001b[0m\n", "\u001b[34m2020-12-03 11:42:38,907 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34m2020-12-03 11:42:38,919 sagemaker_pytorch_container.training INFO Block until all host DNS lookups succeed.\u001b[0m\n", "\u001b[34m2020-12-03 11:42:38,923 sagemaker_pytorch_container.training INFO Invoking user training script.\u001b[0m\n", "\u001b[34m2020-12-03 11:42:39,257 sagemaker-containers INFO Module default_user_module_name does not provide a setup.py. \u001b[0m\n", "\u001b[34mGenerating setup.py\u001b[0m\n", "\u001b[34m2020-12-03 11:42:39,257 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", "\u001b[34m2020-12-03 11:42:39,257 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", "\u001b[34m2020-12-03 11:42:39,257 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", "\u001b[34m/opt/conda/bin/python -m pip install . 
\u001b[0m\n", "\u001b[34mProcessing /tmp/tmpnq3xfdgz/module_dir\u001b[0m\n", "\u001b[34mBuilding wheels for collected packages: default-user-module-name\n", " Building wheel for default-user-module-name (setup.py): started\n", " Building wheel for default-user-module-name (setup.py): finished with status 'done'\n", " Created wheel for default-user-module-name: filename=default_user_module_name-1.0.0-py2.py3-none-any.whl size=8205 sha256=919bba1c0fd38ec7d467b2e65eaed8bfcf6198a990f95e8156e01a722e097ca0\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-gd47ppjs/wheels/c6/ac/5e/e6295606dc2bbfd85a9da07a0cac492f2fd06b184ba4b38dc6\u001b[0m\n", "\u001b[34mSuccessfully built default-user-module-name\u001b[0m\n", "\u001b[34mInstalling collected packages: default-user-module-name\u001b[0m\n", "\u001b[34mSuccessfully installed default-user-module-name-1.0.0\u001b[0m\n", "\u001b[34mWARNING: You are using pip version 20.1; however, version 20.3.1 is available.\u001b[0m\n", "\u001b[34mYou should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.\u001b[0m\n", "\u001b[34m2020-12-03 11:42:41,748 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34m2020-12-03 11:42:41,766 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34m2020-12-03 11:42:41,783 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", "\u001b[34m2020-12-03 11:42:41,795 sagemaker-containers INFO Invoking user script\n", "\u001b[0m\n", "\u001b[34mTraining Env:\n", "\u001b[0m\n", "\u001b[34m{\n", " \"additional_framework_parameters\": {},\n", " \"channel_input_dirs\": {\n", " \"testing\": \"/opt/ml/input/data/testing\",\n", " \"training\": \"/opt/ml/input/data/training\"\n", " },\n", " \"current_host\": \"algo-1\",\n", " \"framework_module\": \"sagemaker_pytorch_container.training:main\",\n", " \"hosts\": [\n", " \"algo-1\"\n", " ],\n", " \"hyperparameters\": 
{},\n", " \"input_config_dir\": \"/opt/ml/input/config\",\n", " \"input_data_config\": {\n", " \"testing\": {\n", " \"TrainingInputMode\": \"File\",\n", " \"S3DistributionType\": \"FullyReplicated\",\n", " \"RecordWrapperType\": \"None\"\n", " },\n", " \"training\": {\n", " \"TrainingInputMode\": \"File\",\n", " \"S3DistributionType\": \"FullyReplicated\",\n", " \"RecordWrapperType\": \"None\"\n", " }\n", " },\n", " \"input_dir\": \"/opt/ml/input\",\n", " \"is_master\": true,\n", " \"job_name\": \"pytorch-training-2020-12-03-11-39-24-085\",\n", " \"log_level\": 20,\n", " \"master_hostname\": \"algo-1\",\n", " \"model_dir\": \"/opt/ml/model\",\n", " \"module_dir\": \"s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-03-11-39-24-085/source/sourcedir.tar.gz\",\n", " \"module_name\": \"tmp-c267a390-383f-46fc-8427-37e62ab98d22\",\n", " \"network_interface_name\": \"eth0\",\n", " \"num_cpus\": 4,\n", " \"num_gpus\": 0,\n", " \"output_data_dir\": \"/opt/ml/output/data\",\n", " \"output_dir\": \"/opt/ml/output\",\n", " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", " \"resource_config\": {\n", " \"current_host\": \"algo-1\",\n", " \"hosts\": [\n", " \"algo-1\"\n", " ],\n", " \"network_interface_name\": \"eth0\"\n", " },\n", " \"user_entry_point\": \"tmp-c267a390-383f-46fc-8427-37e62ab98d22.py\"\u001b[0m\n", "\u001b[34m}\n", "\u001b[0m\n", "\u001b[34mEnvironment variables:\n", "\u001b[0m\n", "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", "\u001b[34mSM_HPS={}\u001b[0m\n", "\u001b[34mSM_USER_ENTRY_POINT=tmp-c267a390-383f-46fc-8427-37e62ab98d22.py\u001b[0m\n", "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", 
"\u001b[34mSM_INPUT_DATA_CONFIG={\"testing\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"training\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", "\u001b[34mSM_CHANNELS=[\"testing\",\"training\"]\u001b[0m\n", "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", "\u001b[34mSM_MODULE_NAME=tmp-c267a390-383f-46fc-8427-37e62ab98d22\u001b[0m\n", "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_pytorch_container.training:main\u001b[0m\n", "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", "\u001b[34mSM_MODULE_DIR=s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-03-11-39-24-085/source/sourcedir.tar.gz\u001b[0m\n", 
"\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"testing\":\"/opt/ml/input/data/testing\",\"training\":\"/opt/ml/input/data/training\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_pytorch_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"testing\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"training\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"pytorch-training-2020-12-03-11-39-24-085\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-12-03-11-39-24-085/source/sourcedir.tar.gz\",\"module_name\":\"tmp-c267a390-383f-46fc-8427-37e62ab98d22\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"tmp-c267a390-383f-46fc-8427-37e62ab98d22.py\"}\u001b[0m\n", "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", "\u001b[34mSM_CHANNEL_TESTING=/opt/ml/input/data/testing\u001b[0m\n", "\u001b[34mSM_CHANNEL_TRAINING=/opt/ml/input/data/training\u001b[0m\n", "\u001b[34mPYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python36.zip:/opt/conda/lib/python3.6:/opt/conda/lib/python3.6/lib-dynload:/opt/conda/lib/python3.6/site-packages\n", "\u001b[0m\n", "\u001b[34mInvoking script with the following command:\n", "\u001b[0m\n", "\u001b[34m/opt/conda/bin/python 
tmp-c267a390-383f-46fc-8427-37e62ab98d22.py\n", "\n", "\u001b[0m\n", "\u001b[34mStart training ...\u001b[0m\n", "\u001b[34m[2020-12-03 11:42:45.045 algo-1:44 INFO json_config.py:90] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.\u001b[0m\n", "\u001b[34m[2020-12-03 11:42:45.046 algo-1:44 INFO hook.py:183] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.\u001b[0m\n", "\u001b[34m[2020-12-03 11:42:45.046 algo-1:44 INFO hook.py:228] Saving to /opt/ml/output/tensors\u001b[0m\n", "\u001b[34m[2020-12-03 11:42:45.046 algo-1:44 INFO hook.py:364] Monitoring the collections: losses\u001b[0m\n", "\u001b[34m[2020-12-03 11:42:45.047 algo-1:44 INFO hook.py:422] Hook is writing from the hook with pid: 44\n", "\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [6400/60000 (11%)] Loss: 0.743953\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [12800/60000 (21%)] Loss: 0.487840\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [19200/60000 (32%)] Loss: 0.452505\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [25600/60000 (43%)] Loss: 0.265998\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [32000/60000 (53%)] Loss: 0.253175\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [38400/60000 (64%)] Loss: 0.413751\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [44800/60000 (75%)] Loss: 0.235252\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [51200/60000 (85%)] Loss: 0.380865\u001b[0m\n", "\u001b[34mTrain Epoch: 1 [57600/60000 (96%)] Loss: 0.186060\u001b[0m\n", "\u001b[34mTest set: Average loss: 0.1058, Accuracy: 9678/10000, 96.78)\n", "\u001b[0m\n", "\u001b[34mSaving the model\u001b[0m\n", "\u001b[34m[2020-12-03 11:42:58.539 algo-1:44 INFO utils.py:25] The end of training job file will not be written for jobs running under SageMaker.\u001b[0m\n", "\u001b[34m2020-12-03 11:42:58,747 sagemaker-containers INFO Reporting training SUCCESS\u001b[0m\n" ] } ], "source": [ "sm_sess.logs_for_job(\"pytorch-training-2020-12-03-11-39-24-085\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": 
[ "## Boto3\n", "https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.stop_training_job" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'TrainingJobSummaries': [{'TrainingJobName': 'pytorch-training-2020-12-03-11-39-24-085',\n", " 'TrainingJobArn': 'arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-11-39-24-085',\n", " 'CreationTime': datetime.datetime(2020, 12, 3, 11, 39, 24, 348000, tzinfo=tzlocal()),\n", " 'TrainingEndTime': datetime.datetime(2020, 12, 3, 11, 43, 10, 797000, tzinfo=tzlocal()),\n", " 'LastModifiedTime': datetime.datetime(2020, 12, 3, 11, 43, 10, 797000, tzinfo=tzlocal()),\n", " 'TrainingJobStatus': 'Completed'},\n", " {'TrainingJobName': 'pytorch-training-2020-12-03-09-26-00-359',\n", " 'TrainingJobArn': 'arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-09-26-00-359',\n", " 'CreationTime': datetime.datetime(2020, 12, 3, 9, 26, 0, 684000, tzinfo=tzlocal()),\n", " 'TrainingEndTime': datetime.datetime(2020, 12, 3, 9, 34, 39, 980000, tzinfo=tzlocal()),\n", " 'LastModifiedTime': datetime.datetime(2020, 12, 3, 9, 34, 39, 980000, tzinfo=tzlocal()),\n", " 'TrainingJobStatus': 'Completed'}],\n", " 'NextToken': 'cIws2QhTXUIa8bi8X9aU7gCAR0Xdc3x9L/Ofg4vsVMTtcNqRqLcpBqE42+cDc29TFQi5WMy1ST5uSEywRvC9pY03sEYOQa970wjR7qEA0D9ACIPEWDkHh7iBUzvRCcQcHtEvm20Wf3u59Z6PYko/eZciB8jI9LQYs8s5dBb74z1i9SIup31iJwBjqqhwj8WigdYN75WlklujoYSKPkM2mmgVwZ0HLBLOGhn9JfFhtYp6OPAw0QzPlarYFJOygVD7nZePRvsoXJYfhNzszqA13f7rVWqt3xVuC0693HA8bKCqnlkrNMhIBbczjJB5Hv+CIfncOE+WBXX71XAS2zIMytk8kMOKyvWUGG/WnuTFXwdZgiHS4fkZCyoJ9BQg6h/ex8VcnL3yzSShz1DAvs/xi+DLl72X8O7Par1PzjAi+HDNkdweeKAFCq9OGeKnCni9NBdNA/4CXayzLx6J9soWVeh4544hCiNV1Pch8cgdNReJMohkyQjdHGIQiF8wWWmpdbbuvaUAEU8qGXDxEre2RqyeEQspQSCKkUAOR18CmGDkQ24rKL40kdFLd4htdH9XwmY2T2A38UPgsDJ6iZUAyHXxV8Xs',\n", " 'ResponseMetadata': {'RequestId': 
'486ba43b-6d54-4ae2-91a2-f0a170a40636',\n", " 'HTTPStatusCode': 200,\n", " 'HTTPHeaders': {'x-amzn-requestid': '486ba43b-6d54-4ae2-91a2-f0a170a40636',\n", " 'content-type': 'application/x-amz-json-1.1',\n", " 'content-length': '1267',\n", " 'date': 'Thu, 03 Dec 2020 11:58:50 GMT'},\n", " 'RetryAttempts': 0}}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import boto3\n", "\n", "\n", "client = boto3.client('sagemaker')\n", "client.list_training_jobs(NameContains='pytorch' ,MaxResults=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### AWS CLI" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"TrainingJobSummaries\": [\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-11-39-24-085\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-11-39-24-085\",\n", " \"CreationTime\": \"2020-12-03T11:39:24.348000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T11:43:10.797000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T11:43:10.797000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-09-26-00-359\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-09-26-00-359\",\n", " \"CreationTime\": \"2020-12-03T09:26:00.684000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T09:34:39.980000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T09:34:39.980000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-55-27-938\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-55-27-938\",\n", " \"CreationTime\": \"2020-12-02T13:55:28.332000+00:00\",\n", " 
\"TrainingEndTime\": \"2020-12-02T13:59:47.652000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-02T13:59:47.652000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-49-17-846\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-49-17-846\",\n", " \"CreationTime\": \"2020-12-02T13:49:18.243000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-02T13:53:37.409000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-02T13:53:37.409000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-41-48-944\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-41-48-944\",\n", " \"CreationTime\": \"2020-12-02T13:41:49.317000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-02T13:46:00.279000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-02T13:46:00.279000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-02-09-12-06-083\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-02-09-12-06-083\",\n", " \"CreationTime\": \"2020-12-02T09:12:06.405000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-02T09:20:25.365000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-02T09:20:25.365000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-01-13-06-15-847\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-01-13-06-15-847\",\n", " \"CreationTime\": \"2020-12-01T13:06:16.257000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-01T13:14:15.478000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-01T13:14:15.478000+00:00\",\n", " 
\"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-11-30-14-55-09-675\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-11-30-14-55-09-675\",\n", " \"CreationTime\": \"2020-11-30T14:55:10.003000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-30T15:03:47.727000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-30T15:03:47.727000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-19-41-46-057\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-19-41-46-057\",\n", " \"CreationTime\": \"2020-11-24T19:41:46.363000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T19:44:16.565000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T19:44:16.565000+00:00\",\n", " \"TrainingJobStatus\": \"Stopped\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-18-48-23-809\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-18-48-23-809\",\n", " \"CreationTime\": \"2020-11-24T18:48:24.129000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T18:53:53.391000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T18:53:53.391000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-14-21-24-052\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-14-21-24-052\",\n", " \"CreationTime\": \"2020-11-24T14:21:24.431000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T14:27:30.103000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T14:27:30.103000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-13-06-09-277\",\n", " \"TrainingJobArn\": 
\"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-13-06-09-277\",\n", " \"CreationTime\": \"2020-11-24T13:06:09.574000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T13:09:20.272000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T13:09:20.272000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-11-24-13-05-11-920\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-11-24-13-05-11-920\",\n", " \"CreationTime\": \"2020-11-24T13:05:12.280000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-12-53-23-755\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-12-53-23-755\",\n", " \"CreationTime\": \"2020-11-24T12:53:24.083000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T12:56:52.920000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T12:56:52.920000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " }\n", " ]\n", "}\n" ] } ], "source": [ "!aws sagemaker list-training-jobs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!aws sagemaker stop-training-job --training-job-name tensorflow-training-2020-11-24-19-41-46-057" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"TrainingJobName\": \"pytorch-training-2020-11-24-13-05-11-920\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-11-24-13-05-11-920\",\n", " \"ModelArtifacts\": {\n", " \"S3ModelArtifacts\": 
\"s3://sagemaker-eu-west-1-245582572290/mnist/pytorch-training-2020-11-24-13-05-11-920/output/model.tar.gz\"\n", " },\n", " \"TrainingJobStatus\": \"Completed\",\n", " \"SecondaryStatus\": \"Completed\",\n", " \"HyperParameters\": {\n", " \"batch-size\": \"128\",\n", " \"epochs\": \"20\",\n", " \"learning-rate\": \"0.001\",\n", " \"log-interval\": \"100\",\n", " \"sagemaker_container_log_level\": \"20\",\n", " \"sagemaker_job_name\": \"\\\"pytorch-training-2020-11-24-13-05-11-920\\\"\",\n", " \"sagemaker_program\": \"\\\"train.py\\\"\",\n", " \"sagemaker_region\": \"\\\"eu-west-1\\\"\",\n", " \"sagemaker_submit_directory\": \"\\\"s3://sagemaker-eu-west-1-245582572290/pytorch-training-2020-11-24-13-05-11-920/source/sourcedir.tar.gz\\\"\"\n", " },\n", " \"AlgorithmSpecification\": {\n", " \"TrainingImage\": \"763104351884.dkr.ecr.eu-west-1.amazonaws.com/pytorch-training:1.5.0-cpu-py3\",\n", " \"TrainingInputMode\": \"File\",\n", " \"EnableSageMakerMetricsTimeSeries\": true\n", " },\n", " \"RoleArn\": \"arn:aws:iam::245582572290:role/workshop-sagemaker\",\n", " \"InputDataConfig\": [\n", " {\n", " \"ChannelName\": \"training\",\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"S3DataType\": \"S3Prefix\",\n", " \"S3Uri\": \"s3://sagemaker-eu-west-1-245582572290/mnist\",\n", " \"S3DataDistributionType\": \"FullyReplicated\"\n", " }\n", " },\n", " \"CompressionType\": \"None\",\n", " \"RecordWrapperType\": \"None\"\n", " },\n", " {\n", " \"ChannelName\": \"testing\",\n", " \"DataSource\": {\n", " \"S3DataSource\": {\n", " \"S3DataType\": \"S3Prefix\",\n", " \"S3Uri\": \"s3://sagemaker-eu-west-1-245582572290/mnist\",\n", " \"S3DataDistributionType\": \"FullyReplicated\"\n", " }\n", " },\n", " \"CompressionType\": \"None\",\n", " \"RecordWrapperType\": \"None\"\n", " }\n", " ],\n", " \"OutputDataConfig\": {\n", " \"KmsKeyId\": \"\",\n", " \"S3OutputPath\": \"s3://sagemaker-eu-west-1-245582572290/mnist\"\n", " },\n", " \"ResourceConfig\": {\n", " \"InstanceType\": 
\"ml.c4.xlarge\",\n", " \"InstanceCount\": 1,\n", " \"VolumeSizeInGB\": 30\n", " },\n", " \"StoppingCondition\": {\n", " \"MaxRuntimeInSeconds\": 86400\n", " },\n", " \"CreationTime\": \"2020-11-24T13:05:12.280000+00:00\",\n", " \"TrainingStartTime\": \"2020-11-24T13:07:01.241000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"SecondaryStatusTransitions\": [\n", " {\n", " \"Status\": \"Starting\",\n", " \"StartTime\": \"2020-11-24T13:05:12.280000+00:00\",\n", " \"EndTime\": \"2020-11-24T13:07:01.241000+00:00\",\n", " \"StatusMessage\": \"Preparing the instances for training\"\n", " },\n", " {\n", " \"Status\": \"Downloading\",\n", " \"StartTime\": \"2020-11-24T13:07:01.241000+00:00\",\n", " \"EndTime\": \"2020-11-24T13:07:41.870000+00:00\",\n", " \"StatusMessage\": \"Downloading input data\"\n", " },\n", " {\n", " \"Status\": \"Training\",\n", " \"StartTime\": \"2020-11-24T13:07:41.870000+00:00\",\n", " \"EndTime\": \"2020-11-24T13:11:45.630000+00:00\",\n", " \"StatusMessage\": \"Training image download completed. 
Training in progress.\"\n", " },\n", " {\n", " \"Status\": \"Uploading\",\n", " \"StartTime\": \"2020-11-24T13:11:45.630000+00:00\",\n", " \"EndTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"StatusMessage\": \"Uploading generated training model\"\n", " },\n", " {\n", " \"Status\": \"Completed\",\n", " \"StartTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"EndTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"StatusMessage\": \"Training job completed\"\n", " }\n", " ],\n", " \"EnableNetworkIsolation\": false,\n", " \"EnableInterContainerTrafficEncryption\": false,\n", " \"EnableManagedSpotTraining\": false,\n", " \"TrainingTimeInSeconds\": 292,\n", " \"BillableTimeInSeconds\": 292,\n", " \"DebugHookConfig\": {\n", " \"S3OutputPath\": \"s3://sagemaker-eu-west-1-245582572290/mnist\",\n", " \"CollectionConfigurations\": []\n", " }\n", "}\n" ] } ], "source": [ "!aws sagemaker describe-training-job --training-job-name pytorch-training-2020-11-24-13-05-11-920" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SAGEMAKER() SAGEMAKER()\n", "\n", "\n", "\n", "NAME\n", " sagemaker -\n", "\n", "DESCRIPTION\n", " Provides APIs for creating and managing Amazon SageMaker resources.\n", "\n", " Other Resources:\n", "\n", " o Amazon SageMaker Developer Guide\n", "\n", " o Amazon Augmented AI Runtime API Reference\n", "\n", "AVAILABLE COMMANDS\n", " o add-tags\n", "\n", " o associate-trial-component\n", "\n", " o create-algorithm\n", "\n", " o create-app\n", "\n", " o create-app-image-config\n", "\n", " o create-auto-ml-job\n", "\n", " o create-code-repository\n", "\n", " o create-compilation-job\n", "\n", " o create-domain\n", "\n", " o create-endpoint\n", "\n", " o create-endpoint-config\n", "\n", " o create-experiment\n", "\n", " o create-flow-definition\n", "\n", " o create-human-task-ui\n", "\n", " o create-hyper-parameter-tuning-job\n", "\n", " o create-image\n", "\n", " o 
create-image-version\n", "\n", " o create-labeling-job\n", "\n", " o create-model\n", "\n", " o create-model-package\n", "\n", " o create-monitoring-schedule\n", "\n", " o create-notebook-instance\n", "\n", " o create-notebook-instance-lifecycle-config\n", "\n", " o create-presigned-domain-url\n", "\n", " o create-presigned-notebook-instance-url\n", "\n", " o create-processing-job\n", "\n", " o create-training-job\n", "\n", " o create-transform-job\n", "\n", " o create-trial\n", "\n", " o create-trial-component\n", "\n", " o create-user-profile\n", "\n", " o create-workforce\n", "\n", " o create-workteam\n", "\n", " o delete-algorithm\n", "\n", " o delete-app\n", "\n", " o delete-app-image-config\n", "\n", " o delete-code-repository\n", "\n", " o delete-domain\n", "\n", " o delete-endpoint\n", "\n", " o delete-endpoint-config\n", "\n", " o delete-experiment\n", "\n", " o delete-flow-definition\n", "\n", " o delete-human-task-ui\n", "\n", " o delete-image\n", "\n", " o delete-image-version\n", "\n", " o delete-model\n", "\n", " o delete-model-package\n", "\n", " o delete-monitoring-schedule\n", "\n", " o delete-notebook-instance\n", "\n", " o delete-notebook-instance-lifecycle-config\n", "\n", " o delete-tags\n", "\n", " o delete-trial\n", "\n", " o delete-trial-component\n", "\n", " o delete-user-profile\n", "\n", " o delete-workforce\n", "\n", " o delete-workteam\n", "\n", " o describe-algorithm\n", "\n", " o describe-app\n", "\n", " o describe-app-image-config\n", "\n", " o describe-auto-ml-job\n", "\n", " o describe-code-repository\n", "\n", " o describe-compilation-job\n", "\n", " o describe-domain\n", "\n", " o describe-endpoint\n", "\n", " o describe-endpoint-config\n", "\n", " o describe-experiment\n", "\n", " o describe-flow-definition\n", "\n", " o describe-human-task-ui\n", "\n", " o describe-hyper-parameter-tuning-job\n", "\n", " o describe-image\n", "\n", " o describe-image-version\n", "\n", " o describe-labeling-job\n", "\n", " o describe-model\n", 
"\n", " o describe-model-package\n", "\n", " o describe-monitoring-schedule\n", "\n", " o describe-notebook-instance\n", "\n", " o describe-notebook-instance-lifecycle-config\n", "\n", " o describe-processing-job\n", "\n", " o describe-subscribed-workteam\n", "\n", " o describe-training-job\n", "\n", " o describe-transform-job\n", "\n", " o describe-trial\n", "\n", " o describe-trial-component\n", "\n", " o describe-user-profile\n", "\n", " o describe-workforce\n", "\n", " o describe-workteam\n", "\n", " o disassociate-trial-component\n", "\n", " o get-search-suggestions\n", "\n", " o help\n", "\n", " o list-algorithms\n", "\n", " o list-app-image-configs\n", "\n", " o list-apps\n", "\n", " o list-auto-ml-jobs\n", "\n", " o list-candidates-for-auto-ml-job\n", "\n", " o list-code-repositories\n", "\n", " o list-compilation-jobs\n", "\n", " o list-domains\n", "\n", " o list-endpoint-configs\n", "\n", " o list-endpoints\n", "\n", " o list-experiments\n", "\n", " o list-flow-definitions\n", "\n", " o list-human-task-uis\n", "\n", " o list-hyper-parameter-tuning-jobs\n", "\n", " o list-image-versions\n", "\n", " o list-images\n", "\n", " o list-labeling-jobs\n", "\n", " o list-labeling-jobs-for-workteam\n", "\n", " o list-model-packages\n", "\n", " o list-models\n", "\n", " o list-monitoring-executions\n", "\n", " o list-monitoring-schedules\n", "\n", " o list-notebook-instance-lifecycle-configs\n", "\n", " o list-notebook-instances\n", "\n", " o list-processing-jobs\n", "\n", " o list-subscribed-workteams\n", "\n", " o list-tags\n", "\n", " o list-training-jobs\n", "\n", " o list-training-jobs-for-hyper-parameter-tuning-job\n", "\n", " o list-transform-jobs\n", "\n", " o list-trial-components\n", "\n", " o list-trials\n", "\n", " o list-user-profiles\n", "\n", " o list-workforces\n", "\n", " o list-workteams\n", "\n", " o render-ui-template\n", "\n", " o search\n", "\n", " o start-monitoring-schedule\n", "\n", " o start-notebook-instance\n", "\n", " o 
stop-auto-ml-job\n", "\n", " o stop-compilation-job\n", "\n", " o stop-hyper-parameter-tuning-job\n", "\n", " o stop-labeling-job\n", "\n", " o stop-monitoring-schedule\n", "\n", " o stop-notebook-instance\n", "\n", " o stop-processing-job\n", "\n", " o stop-training-job\n", "\n", " o stop-transform-job\n", "\n", " o update-app-image-config\n", "\n", " o update-code-repository\n", "\n", " o update-domain\n", "\n", " o update-endpoint\n", "\n", " o update-endpoint-weights-and-capacities\n", "\n", " o update-experiment\n", "\n", " o update-image\n", "\n", " o update-monitoring-schedule\n", "\n", " o update-notebook-instance\n", "\n", " o update-notebook-instance-lifecycle-config\n", "\n", " o update-trial\n", "\n", " o update-trial-component\n", "\n", " o update-user-profile\n", "\n", " o update-workforce\n", "\n", " o update-workteam\n", "\n", " o wait\n", "\n", "\n", "\n", " SAGEMAKER()\n" ] } ], "source": [ "!aws sagemaker help" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"TrainingJobSummaries\": [\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-02-52-52-166\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-02-52-52-166\",\n", " \"CreationTime\": \"2020-12-10T02:52:52.482000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10T02:55:27.924000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10T02:57:50.278000+00:00\",\n", " \"TrainingJobStatus\": \"Stopped\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-02-31-31-681\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-02-31-31-681\",\n", " \"CreationTime\": \"2020-12-10T02:31:32.012000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10T02:34:25.333000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10T02:39:11.093000+00:00\",\n", " \"TrainingJobStatus\": 
\"Stopped\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-12-10-02-07-09-716\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-12-10-02-07-09-716\",\n", " \"CreationTime\": \"2020-12-10T02:07:10.033000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10T02:10:47.863000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10T02:11:36.057000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-10-02-04-24-474\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-10-02-04-24-474\",\n", " \"CreationTime\": \"2020-12-10T02:04:24.741000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-10T02:07:41.669000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-10T02:11:50.472000+00:00\",\n", " \"TrainingJobStatus\": \"Stopped\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-04-18-03-55-165\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-04-18-03-55-165\",\n", " \"CreationTime\": \"2020-12-04T18:03:55.444000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-04T18:08:44.723000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-04T18:08:44.723000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-12-04-16-28-58-014\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-12-04-16-28-58-014\",\n", " \"CreationTime\": \"2020-12-04T16:28:58.274000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-04T16:33:19.884000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-04T16:33:19.884000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-04-16-25-36-311\",\n", " \"TrainingJobArn\": 
\"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-04-16-25-36-311\",\n", " \"CreationTime\": \"2020-12-04T16:25:36.550000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-04T16:33:06.188000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-04T16:33:06.188000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-04-16-18-30-171\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-04-16-18-30-171\",\n", " \"CreationTime\": \"2020-12-04T16:18:30.428000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-04T16:22:06.122000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-04T16:22:06.122000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-04-16-09-05-905\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-04-16-09-05-905\",\n", " \"CreationTime\": \"2020-12-04T16:09:06.174000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-04T16:16:34.295000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-04T16:16:34.295000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-12-03-14-59-56-531\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-12-03-14-59-56-531\",\n", " \"CreationTime\": \"2020-12-03T14:59:56.804000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T15:03:49.097000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T15:03:49.097000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-12-03-14-51-33-967\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-12-03-14-51-33-967\",\n", " \"CreationTime\": \"2020-12-03T14:51:34.279000+00:00\",\n", " 
\"TrainingEndTime\": \"2020-12-03T14:55:40.434000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T14:55:40.434000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-12-03-14-37-02-426\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-12-03-14-37-02-426\",\n", " \"CreationTime\": \"2020-12-03T14:37:02.696000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T14:40:32.508000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T14:40:32.508000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-12-03-14-36-39-556\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-12-03-14-36-39-556\",\n", " \"CreationTime\": \"2020-12-03T14:36:39.993000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T14:38:47.074000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T14:38:47.074000+00:00\",\n", " \"TrainingJobStatus\": \"Stopped\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-14-21-57-898\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-14-21-57-898\",\n", " \"CreationTime\": \"2020-12-03T14:21:58.300000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T14:23:58.655000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T14:23:58.655000+00:00\",\n", " \"TrainingJobStatus\": \"Stopped\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-12-03-13-31-27-927\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-12-03-13-31-27-927\",\n", " \"CreationTime\": \"2020-12-03T13:31:28.199000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T13:34:41.352000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T13:34:41.352000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " 
{\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-13-18-12-817\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-13-18-12-817\",\n", " \"CreationTime\": \"2020-12-03T13:18:13.106000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T13:21:36.764000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T13:21:36.764000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-13-17-46-041\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-13-17-46-041\",\n", " \"CreationTime\": \"2020-12-03T13:17:46.323000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T13:21:30.375000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T13:21:30.375000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-12-03-13-00-32-988\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-12-03-13-00-32-988\",\n", " \"CreationTime\": \"2020-12-03T13:00:33.281000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T13:03:50.877000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T13:03:50.877000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-12-54-49-359\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-12-54-49-359\",\n", " \"CreationTime\": \"2020-12-03T12:54:49.665000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T12:58:25.443000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T12:58:25.443000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-12-22-33-542\",\n", " \"TrainingJobArn\": 
\"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-12-22-33-542\",\n", " \"CreationTime\": \"2020-12-03T12:22:33.919000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T12:26:17.928000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T12:26:17.928000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-11-39-24-085\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-11-39-24-085\",\n", " \"CreationTime\": \"2020-12-03T11:39:24.348000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T11:43:10.797000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T11:43:10.797000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-03-09-26-00-359\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-03-09-26-00-359\",\n", " \"CreationTime\": \"2020-12-03T09:26:00.684000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-03T09:34:39.980000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-03T09:34:39.980000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-55-27-938\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-55-27-938\",\n", " \"CreationTime\": \"2020-12-02T13:55:28.332000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-02T13:59:47.652000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-02T13:59:47.652000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-49-17-846\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-49-17-846\",\n", " \"CreationTime\": 
\"2020-12-02T13:49:18.243000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-02T13:53:37.409000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-02T13:53:37.409000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-41-48-944\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/sagemaker-studio-d-yu5msju0ejog-2020-12-02-13-41-48-944\",\n", " \"CreationTime\": \"2020-12-02T13:41:49.317000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-02T13:46:00.279000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-02T13:46:00.279000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-02-09-12-06-083\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-02-09-12-06-083\",\n", " \"CreationTime\": \"2020-12-02T09:12:06.405000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-02T09:20:25.365000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-02T09:20:25.365000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-12-01-13-06-15-847\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-12-01-13-06-15-847\",\n", " \"CreationTime\": \"2020-12-01T13:06:16.257000+00:00\",\n", " \"TrainingEndTime\": \"2020-12-01T13:14:15.478000+00:00\",\n", " \"LastModifiedTime\": \"2020-12-01T13:14:15.478000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"pytorch-training-2020-11-30-14-55-09-675\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-11-30-14-55-09-675\",\n", " \"CreationTime\": \"2020-11-30T14:55:10.003000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-30T15:03:47.727000+00:00\",\n", " \"LastModifiedTime\": 
\"2020-11-30T15:03:47.727000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-19-41-46-057\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-19-41-46-057\",\n", " \"CreationTime\": \"2020-11-24T19:41:46.363000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T19:44:16.565000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T19:44:16.565000+00:00\",\n", " \"TrainingJobStatus\": \"Stopped\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-18-48-23-809\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-18-48-23-809\",\n", " \"CreationTime\": \"2020-11-24T18:48:24.129000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T18:53:53.391000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T18:53:53.391000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-14-21-24-052\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-14-21-24-052\",\n", " \"CreationTime\": \"2020-11-24T14:21:24.431000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T14:27:30.103000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T14:27:30.103000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-13-06-09-277\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-13-06-09-277\",\n", " \"CreationTime\": \"2020-11-24T13:06:09.574000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T13:09:20.272000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T13:09:20.272000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " },\n", " {\n", " \"TrainingJobName\": 
\"pytorch-training-2020-11-24-13-05-11-920\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/pytorch-training-2020-11-24-13-05-11-920\",\n", " \"CreationTime\": \"2020-11-24T13:05:12.280000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T13:11:53.014000+00:00\",\n", " \"TrainingJobStatus\": \"Completed\"\n", " },\n", " {\n", " \"TrainingJobName\": \"tensorflow-training-2020-11-24-12-53-23-755\",\n", " \"TrainingJobArn\": \"arn:aws:sagemaker:eu-west-1:245582572290:training-job/tensorflow-training-2020-11-24-12-53-23-755\",\n", " \"CreationTime\": \"2020-11-24T12:53:24.083000+00:00\",\n", " \"TrainingEndTime\": \"2020-11-24T12:56:52.920000+00:00\",\n", " \"LastModifiedTime\": \"2020-11-24T12:56:52.920000+00:00\",\n", " \"TrainingJobStatus\": \"Failed\"\n", " }\n", " ]\n", "}\n"
 ] } ],
 "source": [
 "!aws sagemaker list-training-jobs"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
 "# Cheat sheet: submit a SageMaker training job, then inspect, tail, stop and\n",
 "# list jobs. Each section shows the same operation through the SageMaker\n",
 "# Session, the boto3 client and the AWS CLI.\n",
 "\n",
 "# Imports\n",
 "from sagemaker import get_execution_role, Session\n",
 "from sagemaker.pytorch import PyTorch\n",
 "from sagemaker.tensorflow import TensorFlow\n",
 "import boto3\n",
 "\n",
 "session = Session()\n",
 "\n",
 "client = boto3.client('sagemaker')\n",
 "\n",
 "## submit\n",
 "\n",
 "# The estimator needs an IAM role; use the one attached to this notebook.\n",
 "pytorch_estimator = PyTorch('pytorch-train.py',\n",
 "                            role=get_execution_role(),\n",
 "                            instance_type='ml.p3.2xlarge',\n",
 "                            instance_count=1,\n",
 "                            framework_version='1.5.0',\n",
 "                            py_version='py3',\n",
 "                            hyperparameters = {'epochs': 20, 'batch-size': 64, 'learning-rate': 0.1})\n",
 "\n",
 "pytorch_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data',\n",
 "                       'test': 's3://my-data-bucket/path/to/my/test/data'})\n",
 "\n",
 "# Name of the job started by fit(); reused by the Python calls below.\n",
 "job_name = pytorch_estimator.latest_training_job.name\n",
 "\n",
 "### status\n",
 "session.describe_training_job(job_name)\n",
 "\n",
 "client.describe_training_job(TrainingJobName=job_name)\n",
 "\n",
 "# The CLI examples use literal job names; substitute your own.\n",
 "!aws sagemaker describe-training-job --training-job-name pytorch-training-2020-11-24-13-05-11-920\n",
 "\n",
 "### logs\n",
 "session.logs_for_job(job_name)\n",
 "pytorch_estimator.latest_training_job.sagemaker_session.logs_for_job(job_name)\n",
 "\n",
 "### stop\n",
 "session.stop_training_job(job_name)\n",
 "\n",
 "client.stop_training_job(TrainingJobName=job_name)\n",
 "\n",
 "!aws sagemaker stop-training-job --training-job-name tensorflow-training-2020-11-24-19-41-46-057\n",
 "\n",
 "### list\n",
 "client.list_training_jobs(NameContains='tensorflow' ,MaxResults=15)\n",
 "\n",
 "!aws sagemaker list-training-jobs"
 ] } ],
 "metadata": { "kernelspec": { "display_name": "sm (lblokhin/17)", "language": "python", "name": "sm__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:245582572290:image-version/lblokhin/17" }, "language_info": { "codemirror_mode": { "name": "python", "version": 3 }, "mimetype": "text/x-python", "name": "sm_kernel", "pygments_lexer": "python" } }, "nbformat": 4, "nbformat_minor": 4 }