{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install -e ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "from datetime import datetime\n",
    "\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.tensorflow import TensorFlow\n",
    "\n",
    "import yaml\n",
    "from contextlib import redirect_stdout\n",
    "import boto3\n",
    "from configs import cfg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Set S3 Locations and Job Name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "S3_BUCKET = 'sagemaker-smcv-tutorial' # Don't include s3:// in your bucket name\n",
    "S3_DIR = 'smcv-tensorflow-tutorial'\n",
    "LOCAL_DATA_DIR = '/root/smcv-tensorflow-tutorial' # For reasons detailed in Distributed Training, do not put this dir in the SageMakerCV dir\n",
    "S3_SRC=os.path.join(\"s3://\", S3_BUCKET, S3_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "boto_session = boto3.session.Session()\n",
    "region = boto_session.region_name\n",
    "os.environ['AWS_DEFAULT_REGION'] = region # This is the region we set at the beginning, when creating the S3 bucket for our data\n",
    "\n",
    "# this is all for naming\n",
    "user_id=\"jbsnyder-smcv-tutorial\" # This is used for naming your training job, and organizing your results on S3. It can be anything you like.\n",
    "# Take a single timestamp so date_str and time_str always agree;\n",
    "# two separate now() calls can land on different days around midnight.\n",
    "launch_time = datetime.now()\n",
    "date_str=launch_time.strftime(\"%d-%m-%Y\")\n",
    "time_str=launch_time.strftime(\"%d-%m-%Y-%H-%M-%S\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# specify training type, s3 src and nodes\n",
    "instance_type=\"ml.p4d.24xlarge\" # This can be any of 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge', 'ml.p3.8xlarge', 'ml.p3.2xlarge', 'ml.g4dn.12xlarge'\n",
    "nodes=1\n",
    "role=get_execution_role() #give Sagemaker permission to launch nodes on our behalf\n",
    "source_dir='.'\n",
    "entry_point='train.py'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "dist_config_file = \"configs/1_node.yaml\" # alternative: \"configs/dist-training-config.yaml\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "cfg.LOG_INTERVAL = 50 # Number of training steps between logging interval\n",
    "cfg.MODEL.DENSE.PRE_NMS_TOP_N_TRAIN = 2000 # Top regions of interest to select before NMS\n",
    "cfg.MODEL.DENSE.POST_NMS_TOP_N_TRAIN = 1000 # Top regions of interest to select after NMS\n",
    "cfg.MODEL.RCNN.ROI_HEAD = \"StandardRoIHead\"\n",
    "cfg.MODEL.FRCNN.LOSS_TYPE = \"giou\"\n",
    "cfg.MODEL.FRCNN.LABEL_SMOOTHING = 0.1 # label smoothing for box head\n",
    "cfg.MODEL.FRCNN.CARL = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "cfg.INPUT.TRAIN_BATCH_SIZE = 48 # Training batch size\n",
    "cfg.INPUT.EVAL_BATCH_SIZE = 32 # Evaluation batch size\n",
    "cfg.SOLVER.SCHEDULE = \"CosineDecay\" # Learning rate schedule, either CosineDecay or PiecewiseConstantDecay\n",
    "cfg.SOLVER.OPTIMIZER = \"NovoGrad\" # Optimizer type NovoGrad or Momentum\n",
    "cfg.SOLVER.MOMENTUM = 0.9\n",
    "cfg.SOLVER.WARM_UP_RATIO = 0.01\n",
    "cfg.SOLVER.LR = .008 # Base learning rate after warmup\n",
    "cfg.SOLVER.BETA_1 = 0.9 # NovoGrad beta 1 value\n",
    "cfg.SOLVER.BETA_2 = 0.6 # NovoGrad beta 2 value\n",
    "cfg.SOLVER.MAX_ITERS = 22000 # Total training steps\n",
    "cfg.SOLVER.WARMUP_STEPS = 750 # warmup steps\n",
    "cfg.SOLVER.XLA = True # Train with XLA\n",
    "cfg.SOLVER.FP16 = True # Train with mixed precision enabled\n",
    "cfg.SOLVER.TF32 = True # Train with TF32 data type enabled, only available on Ampere GPUs and TF 2.4 and up\n",
    "cfg.SOLVER.EVAL_EPOCH_EVAL = False # Only run eval at end\n",
    "cfg.SOLVER.ALPHA = 0.025 # final learning rate as multiplier of initial learning rate\n",
    "cfg.SOLVER.WEIGHT_DECAY = 0.0001 # Optimizer weight decay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "cfg.HOOKS=[\"CheckpointHook\",\n",
    "           \"IterTimerHook\",\n",
    "           \"TextLoggerHook\",\n",
    "           \"CocoEvaluator\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick the distribution backend: SageMaker distributed data parallel on the listed\n",
    "# instance types, plain MPI with custom options everywhere else.\n",
    "# NOTE(review): `nodes>0` holds for any valid node count, so in practice only the\n",
    "# instance type selects the backend - confirm this is intended (vs. `nodes>1`).\n",
    "if nodes>0 and instance_type in ['ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge']:\n",
    "    distribution = { \"smdistributed\": { \"dataparallel\": { \"enabled\": True } } }\n",
    "else:\n",
    "    custom_mpi_options = ['-x FI_EFA_USE_DEVICE_RDMA=1',\n",
    "                          '-x OMPI_MCA_btl_vader_single_copy_mechanism=none',\n",
    "                          '-x TF_CUDNN_USE_AUTOTUNE=0',\n",
    "                          '-x NCCL_MIN_NRINGS=0']\n",
    "    distribution = { \"mpi\": { \"enabled\": True, \"custom_mpi_options\": \" \".join(custom_mpi_options)}}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "job_name = f'{user_id}-{time_str}'\n",
    "output_path = os.path.join(S3_SRC, \"sagemaker-output\", date_str, job_name)\n",
    "code_location = os.path.join(S3_SRC, \"sagemaker-code\", date_str, job_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "channels = {'val2017': os.path.join(S3_SRC, 'data', 'coco', 'tfrecord', 'val2017'),\n",
    "            'annotations': os.path.join(S3_SRC, 'data', 'coco', 'annotations'),\n",
    "            'weights': os.path.join(S3_SRC, 'data', 'weights', 'resnet')}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "CHANNELS_DIR='/opt/ml/input/data/' # on node\n",
    "cfg.PATHS.TRAIN_FILE_PATTERN = os.path.join(S3_SRC, 'data', 'coco', 'tfrecord', 'train2017', 'train*')\n",
    "cfg.PATHS.VAL_FILE_PATTERN = os.path.join(CHANNELS_DIR, \"val2017\", \"val*\")\n",
    "cfg.PATHS.WEIGHTS = os.path.join(CHANNELS_DIR, \"weights\", \"resnet50.ckpt\")\n",
    "cfg.PATHS.VAL_ANNOTATIONS = os.path.join(CHANNELS_DIR, \"annotations\", \"instances_val2017.json\")\n",
    "cfg.PATHS.OUT_DIR = '/opt/ml/checkpoints'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Serialize the assembled config to the file that is handed to the entry point\n",
    "# through the `config` hyperparameter. Printing straight to the file writes the\n",
    "# same bytes as redirecting stdout, without swapping the process-wide sys.stdout.\n",
    "with open(dist_config_file, 'w') as outfile:\n",
    "    print(cfg.dump(), file=outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "hyperparameters = {\"config\": dist_config_file}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = TensorFlow(\n",
    "    entry_point=entry_point,\n",
    "    source_dir=source_dir,\n",
    "    py_version='py37',\n",
    "    framework_version='2.4.1', #2.3-2.5 supported\n",
    "    role=role,\n",
    "    instance_count=nodes,\n",
    "    instance_type=instance_type,\n",
    "    distribution=distribution,\n",
    "    output_path=output_path,\n",
    "    checkpoint_s3_uri=output_path,\n",
    "    model_dir=output_path,\n",
    "    hyperparameters=hyperparameters,\n",
    "    volume_size=500,\n",
    "    disable_profiler=True,\n",
    "    debugger_hook_config=False,\n",
    "    code_location=code_location,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator.fit(channels, wait=False, job_name=job_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "instance_type": "ml.g4dn.xlarge",
  "kernelspec": {
   "display_name": "Python 3 (TensorFlow 2.3 Python 3.7 GPU Optimized)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/tensorflow-2.3-gpu-py37-cu110-ubuntu18.04-v3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}