{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Train an object detection model using Tensorflow on SageMaker" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup environment" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sagemaker\n", "from sagemaker.estimator import Framework, Estimator\n", "\n", "role = sagemaker.get_execution_role()\n", "\n", "inputs = {'train': ''} # define s3 training data inputs, this is the output of the processing job\n", "tensorboard_s3_prefix = '' # s3 path for tensorboard events, up to you where to save events " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build and push container" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "git clone https://github.com/tensorflow/models.git docker/models\n", "# get model_main and exporter_main files from TF2 Object Detection GitHub repository\n", "cp docker/models/research/object_detection/exporter_main_v2.py source_dir \n", "cp docker/models/research/object_detection/model_main_tf2.py source_dir" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "image_name = 'tf2-object-detection'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "!sh ./docker/build_and_push.sh $image_name" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open (os.path.join('docker', 'ecr_image_fullname.txt'), 'r') as f:\n", " container = f.readlines()[0][:-1]\n", "\n", "print(container)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get pre-trained model from model zoo" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Download the base model and extract locally" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "%%bash\n", "mkdir /tmp/checkpoint\n", "mkdir source_dir/checkpoint\n", "wget -O /tmp/efficientdet.tar.gz http://download.tensorflow.org/models/object_detection/tf2/20200711/efficientdet_d1_coco17_tpu-32.tar.gz\n", "tar -zxvf /tmp/efficientdet.tar.gz --strip-components 2 --directory source_dir/checkpoint efficientdet_d1_coco17_tpu-32/checkpoint" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create SageMaker Custom Framework and Launch Training job" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we define a custom framework estimator using the Amazon SageMaker Python SDK and run training with that class, which will take care of managing these tasks." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "class CustomFramework(Framework):\n", " def __init__(\n", " self,\n", " entry_point,\n", " framework_version=None,\n", " py_version=None,\n", " source_dir=None,\n", " hyperparameters=None,\n", " image_uri=None,\n", " distribution=None,\n", " **kwargs\n", " ):\n", " super(CustomFramework, self).__init__(\n", " entry_point, source_dir, hyperparameters, image_uri=image_uri, **kwargs\n", " )\n", " self.framework_version = framework_version\n", " self.py_version = None\n", " \n", " def _configure_distribution(self, distributions):\n", " return None\n", "\n", " def create_model(\n", " self,\n", " model_server_workers=None,\n", " role=None,\n", " vpc_config_override=None,\n", " entry_point=None,\n", " source_dir=None,\n", " dependencies=None,\n", " image_uri=None,\n", " **kwargs\n", " ):\n", " return None" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "tensorboard_output_config = sagemaker.debugger.TensorBoardOutputConfig(\n", " s3_output_path=tensorboard_s3_prefix,\n", " container_local_output_path='/opt/training/'\n", ")\n", "\n", "estimator = CustomFramework(\n", " role=role,\n", " image_uri=container,\n", " entry_point='run_training.sh',\n", " source_dir='source_dir/',\n", " hyperparameters={\n", " \"model_dir\":\"/opt/training\", \n", " \"pipeline_config_path\": \"pipeline.config\",\n", " \"num_train_steps\": \"1000\", \n", " \"sample_1_of_n_eval_examples\": \"1\"\n", " },\n", " instance_count=1,\n", " instance_type='ml.g5.2xlarge',\n", " tensorboard_output_config=tensorboard_output_config,\n", " disable_profiler=True,\n", " base_job_name='tf2-object-detection'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "estimator.fit(inputs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Visualize training metrics with Tensorboard" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Due to this issue: https://github.com/ipython/ipykernel/issues/395#issuecomment-479787997\n", "#If you're using a custom conda env, there is a change that the tensorboard executable isn't in the Python path.\n", "#uncomment the following lines\n", "\n", "#bin_env_path = \"/home/ec2-user/anaconda3/envs/myenv/bin/\"\n", "#os.environ[\"PATH\"] += os.pathsep + bin_env_path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "job_artifacts_path = estimator.latest_job_tensorboard_artifacts_path()\n", "job_artifacts_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualize training outputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Careful notebook would be stuck until you stop tensorboard, you can also launch this from a terminal\n", "tensorboard_s3_output_path = f'{job_artifacts_path}/train'\n", "!F_CPP_MIN_LOG_LEVEL=3 AWS_REGION=eu-west-1 tensorboard --logdir=$tensorboard_s3_output_path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualize evaluation outputs\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Careful notebook would be stuck until you stop tensorboard, you can also launch this from a terminal\n", "tensorboard_s3_output_path = f'{job_artifacts_path}/eval'\n", "!F_CPP_MIN_LOG_LEVEL=3 AWS_REGION=eu-west-1 tensorboard 
], "metadata": { "kernelspec": { "display_name": "conda_tensorflow2_p36", "language": "python", "name": "conda_tensorflow2_p36" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }