{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-overview",
   "metadata": {},
   "source": [
    "# Distributed GPU XGBoost training on SageMaker\n",
    "\n",
    "This notebook downloads the `airline_14col` dataset, uploads it to the SageMaker session's default S3 bucket, and launches a two-instance GPU training job that runs `src/train_xgboost_airline.py` inside the SageMaker PyTorch container.\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4ab1da9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "from sagemaker.pytorch import PyTorch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "520c8810",
   "metadata": {},
   "outputs": [],
   "source": [
    "s3_client = boto3.client(\"s3\")\n",
    "sess = sagemaker.session.Session()\n",
    "role = sagemaker.get_execution_role()\n",
    "bucket = sess.default_bucket()\n",
    "# S3 key prefix under which this notebook stores its input data.\n",
    "key_prefix = \"ray_xgboost_distributed\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-data",
   "metadata": {},
   "source": [
    "## Download and upload the training data\n",
    "\n",
    "The archive is roughly 1 GB. `wget -nc` (no-clobber) skips the download when `airline_14col.data.bz2` is already present, so re-running the notebook neither fetches it again nor creates a `.1` duplicate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c838912f-acfd-4958-a4e3-55159d41ace9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# wget treats every positional argument as a URL, so only the real URL is passed;\n",
    "# the file is saved under the URL's basename, airline_14col.data.bz2.\n",
    "!wget -nc http://ee-assets-prod-us-east-1.s3.amazonaws.com/modules/05fa7598d4d44836a42fde79b26568b2/v2/airline_14col.data.bz2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cad5369f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# upload_data returns the S3 location of the uploaded file; it is used as the \"train\" channel below.\n",
    "input_data = sess.upload_data(\"airline_14col.data.bz2\", bucket, key_prefix=f\"{key_prefix}/input\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-estimator",
   "metadata": {},
   "source": [
    "## Configure the estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc1bf8ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional VPC settings forwarded to the estimator; leave both as None when no VPC configuration is needed.\n",
    "subnets = None\n",
    "security_group_ids = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9adec521",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PyTorch image is used to enable distributed GPU training\n",
    "estimator_gpu = PyTorch(\n",
    "    source_dir=\"src\",\n",
    "    entry_point=\"train_xgboost_airline.py\",\n",
    "    subnets=subnets,\n",
    "    security_group_ids=security_group_ids,\n",
    "    role=role,\n",
    "    instance_count=2,\n",
    "    instance_type=\"ml.g5.2xlarge\",\n",
    "    framework_version=\"1.8\",\n",
    "    py_version=\"py36\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-train",
   "metadata": {},
   "source": [
    "## Launch the training job\n",
    "\n",
    "**Note:** a previous run of the cell below failed with `CapacityError: Unable to provision requested ML compute capacity` for `ml.g5.2xlarge`. If that happens, retry later or change `instance_type` in the estimator cell above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36ce59d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Blocks until the job finishes and streams its logs; the uploaded archive is exposed as the \"train\" channel.\n",
    "estimator_gpu.fit({\"train\": input_data})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}