{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional one-time setup: uncomment the line below to install the package in the\n",
    "# current directory in editable mode before running the rest of the notebook.\n",
    "#!pip install -e ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "from datetime import datetime\n",
    "\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.tensorflow import TensorFlow\n",
    "\n",
    "import yaml\n",
    "from contextlib import redirect_stdout\n",
    "import boto3\n",
    "from configs import cfg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Set S3 Locations and Job Name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "S3_BUCKET = 'sagemaker-smcv-tutorial' # Don't include s3:// in your bucket name\n",
    "S3_DIR = 'smcv-tensorflow-tutorial'\n",
    "LOCAL_DATA_DIR = '/root/smcv-tensorflow-tutorial' # For reasons detailed in Distributed Training, do not put this dir in the SageMakerCV dir\n",
    "S3_SRC=os.path.join(\"s3://\", S3_BUCKET, S3_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the AWS region from the default boto3 session and export it for later AWS calls.\n",
    "boto_session = boto3.session.Session()\n",
    "region = boto_session.region_name\n",
    "os.environ['AWS_DEFAULT_REGION'] = region # This is the region we set at the beginning, when creating the S3 bucket for our data\n",
    "\n",
    "# this is all for naming\n",
    "user_id=\"jbsnyder-smcv-tutorial\" # This is used for naming your training job, and organizing your results on S3. It can be anything you like.\n",
    "\n",
    "# Take a single timestamp so date_str and time_str always agree, even when the\n",
    "# cell happens to run across midnight.\n",
    "launch_time = datetime.now()\n",
    "date_str = launch_time.strftime(\"%d-%m-%Y\")\n",
    "time_str = launch_time.strftime(\"%d-%m-%Y-%H-%M-%S\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training hardware, IAM role and code location for the SageMaker job.\n",
    "# instance_type can be any of 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge',\n",
    "# 'ml.p3.8xlarge', 'ml.p3.2xlarge', 'ml.g4dn.12xlarge'.\n",
    "instance_type = \"ml.p4d.24xlarge\"\n",
    "nodes = 1  # number of training instances requested for the job\n",
    "\n",
    "# Give SageMaker permission to launch nodes on our behalf.\n",
    "role = get_execution_role()\n",
    "\n",
    "# Directory and script handed to the estimator as source_dir / entry_point.\n",
    "source_dir, entry_point = '.', 'train.py'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "dist_config_file = f\"configs/1_node.yaml\" # f\"configs/dist-training-config.yaml\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logging cadence and detection-model overrides applied to the imported `cfg` object.\n",
    "cfg.LOG_INTERVAL = 50 # Number of training steps between log messages\n",
    "cfg.MODEL.DENSE.PRE_NMS_TOP_N_TRAIN = 2000 # Top regions of interest to select before NMS\n",
    "cfg.MODEL.DENSE.POST_NMS_TOP_N_TRAIN = 1000 # Top regions of interest to select after NMS\n",
    "cfg.MODEL.RCNN.ROI_HEAD = \"StandardRoIHead\" # RoI head selected by name (presumably resolved by the training code - confirm)\n",
    "cfg.MODEL.FRCNN.LOSS_TYPE = \"giou\" # Box head loss type; \"giou\" presumably selects a generalized IoU loss - confirm\n",
    "cfg.MODEL.FRCNN.LABEL_SMOOTHING = 0.1 # Label smoothing for box head\n",
    "cfg.MODEL.FRCNN.CARL = True # NOTE(review): undocumented flag - confirm its meaning against the configs package"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Batch sizes, optimizer / schedule selection and numeric-precision settings for the run.\n",
    "cfg.INPUT.TRAIN_BATCH_SIZE = 48 # Training batch size\n",
    "cfg.INPUT.EVAL_BATCH_SIZE = 32 # Evaluation batch size\n",
    "cfg.SOLVER.SCHEDULE = \"CosineDecay\" # Learning rate schedule, either CosineDecay or PiecewiseConstantDecay\n",
    "cfg.SOLVER.OPTIMIZER = \"NovoGrad\" # Optimizer type, NovoGrad or Momentum\n",
    "cfg.SOLVER.MOMENTUM = 0.9 # NOTE(review): presumably only read by the Momentum optimizer - confirm\n",
    "cfg.SOLVER.WARM_UP_RATIO = 0.01 # NOTE(review): presumably the warmup starting LR as a fraction of the base LR - confirm\n",
    "cfg.SOLVER.LR = .008 # Base learning rate after warmup\n",
    "cfg.SOLVER.BETA_1 = 0.9 # NovoGrad beta 1 value\n",
    "cfg.SOLVER.BETA_2 = 0.6 # NovoGrad beta 2 value\n",
    "cfg.SOLVER.MAX_ITERS = 22000 # Total training steps\n",
    "cfg.SOLVER.WARMUP_STEPS = 750 # Warmup steps\n",
    "cfg.SOLVER.XLA = True # Train with XLA\n",
    "cfg.SOLVER.FP16 = True # Train with mixed precision enabled\n",
    "cfg.SOLVER.TF32 = True # Train with TF32 data type enabled, only available on Ampere GPUs and TF 2.4 and up\n",
    "cfg.SOLVER.EVAL_EPOCH_EVAL = False # Only run eval at end\n",
    "cfg.SOLVER.ALPHA = 0.025 # Final learning rate as multiplier of initial learning rate\n",
    "cfg.SOLVER.WEIGHT_DECAY = 0.0001 # Optimizer weight decay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hooks to enable for the run, listed by name.\n",
    "cfg.HOOKS = [\n",
    "    \"CheckpointHook\",\n",
    "    \"IterTimerHook\",\n",
    "    \"TextLoggerHook\",\n",
    "    \"CocoEvaluator\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the distributed-training backend handed to the estimator.\n",
    "# The MPI launcher is used unless the instance type is one of the three listed below,\n",
    "# in which case the smdistributed data parallel option is enabled instead.\n",
    "if nodes <= 0 or instance_type not in ('ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge'):\n",
    "    custom_mpi_options = [\n",
    "        '-x FI_EFA_USE_DEVICE_RDMA=1',\n",
    "        '-x OMPI_MCA_btl_vader_single_copy_mechanism=none',\n",
    "        '-x TF_CUDNN_USE_AUTOTUNE=0',\n",
    "        '-x NCCL_MIN_NRINGS=0',\n",
    "    ]\n",
    "    distribution = {\n",
    "        \"mpi\": {\n",
    "            \"enabled\": True,\n",
    "            \"custom_mpi_options\": \" \".join(custom_mpi_options),\n",
    "        }\n",
    "    }\n",
    "else:\n",
    "    # NOTE(review): the node-count test only excludes non-positive values, so this\n",
    "    # branch is also taken for single-node jobs - confirm that is intended.\n",
    "    distribution = {\"smdistributed\": {\"dataparallel\": {\"enabled\": True}}}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Job name (user id + launch timestamp) and the S3 prefixes for this run's outputs\n",
    "# and packaged code, both grouped by launch date and job name.\n",
    "job_name = \"-\".join((user_id, time_str))\n",
    "output_path, code_location = (\n",
    "    os.path.join(S3_SRC, prefix, date_str, job_name)\n",
    "    for prefix in (\"sagemaker-output\", \"sagemaker-code\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input channels for the job: channel name -> S3 location under <S3_SRC>/data.\n",
    "channels = {\n",
    "    channel: os.path.join(S3_SRC, 'data', *subdirs)\n",
    "    for channel, subdirs in (\n",
    "        ('val2017', ('coco', 'tfrecord', 'val2017')),\n",
    "        ('annotations', ('coco', 'annotations')),\n",
    "        ('weights', ('weights', 'resnet')),\n",
    "    )\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data, weight and output paths written into the training config.\n",
    "CHANNELS_DIR='/opt/ml/input/data/' # on node\n",
    "# Training TFRecords are referenced by their S3 URI; `channels` declares no train2017 channel.\n",
    "cfg.PATHS.TRAIN_FILE_PATTERN = os.path.join(S3_SRC, 'data', 'coco', 'tfrecord', 'train2017', 'train*')\n",
    "# Validation records, pretrained weights and annotations are addressed under CHANNELS_DIR,\n",
    "# using the channel names declared in `channels` (val2017, weights, annotations).\n",
    "cfg.PATHS.VAL_FILE_PATTERN = os.path.join(CHANNELS_DIR, \"val2017\", \"val*\")\n",
    "cfg.PATHS.WEIGHTS = os.path.join(CHANNELS_DIR, \"weights\", \"resnet50.ckpt\")\n",
    "cfg.PATHS.VAL_ANNOTATIONS = os.path.join(CHANNELS_DIR, \"annotations\", \"instances_val2017.json\")\n",
    "# NOTE(review): presumably the local directory SageMaker syncs to checkpoint_s3_uri - confirm\n",
    "cfg.PATHS.OUT_DIR = '/opt/ml/checkpoints'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Serialize the customized cfg to dist_config_file, overwriting any file already there.\n",
    "# print() is kept so the dump is followed by a trailing newline.\n",
    "with open(dist_config_file, 'w') as outfile:\n",
    "    print(cfg.dump(), file=outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The only hyperparameter passed to the estimator is the path of the config file.\n",
    "hyperparameters = dict(config=dist_config_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure the SageMaker TensorFlow estimator for the training job.\n",
    "estimator = TensorFlow(\n",
    "    # Code to run and where its packaged copy is uploaded\n",
    "    source_dir=source_dir,\n",
    "    entry_point=entry_point,\n",
    "    code_location=code_location,\n",
    "    hyperparameters=hyperparameters,\n",
    "    # Training container (TF 2.3-2.5 supported)\n",
    "    framework_version='2.4.1',\n",
    "    py_version='py37',\n",
    "    # Cluster\n",
    "    role=role,\n",
    "    instance_type=instance_type,\n",
    "    instance_count=nodes,\n",
    "    volume_size=500,\n",
    "    distribution=distribution,\n",
    "    # Outputs, checkpoints and model_dir all share the same S3 prefix\n",
    "    output_path=output_path,\n",
    "    checkpoint_s3_uri=output_path,\n",
    "    model_dir=output_path,\n",
    "    # Profiler and debugger hooks are switched off\n",
    "    disable_profiler=True,\n",
    "    debugger_hook_config=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Launch the training job under job_name; wait=False returns without blocking the notebook.\n",
    "estimator.fit(inputs=channels, job_name=job_name, wait=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "instance_type": "ml.g4dn.xlarge",
  "kernelspec": {
   "display_name": "Python 3 (TensorFlow 2.3 Python 3.7 GPU Optimized)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/tensorflow-2.3-gpu-py37-cu110-ubuntu18.04-v3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}