{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SageMakerCV\n",
    "\n",
    "This notebook launches a SageMakerCV training job for PyTorch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install -e ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from datetime import datetime\n",
    "import yaml\n",
    "from contextlib import redirect_stdout\n",
    "\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.pytorch import PyTorch\n",
    "import boto3\n",
    "from configs import cfg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "S3_BUCKET = 'sagemaker-smcv-tutorial' # Don't include s3:// in your bucket name\n",
    "S3_DIR = 'smcv-pytorch-tutorial'\n",
    "S3_DATA_LOCATION=os.path.join(\"s3://\", S3_BUCKET, S3_DIR, \"data\", \"coco\")\n",
    "S3_WEIGHTS_LOCATION=os.path.join(\"s3://\", S3_BUCKET, S3_DIR, \"data\", \"weights\")\n",
    "R50_WEIGHTS=\"resnet50.pkl\"\n",
    "\n",
    "user_id=\"username-smcv-tutorial\" # This is used for naming your training job, and organizing your results on S3. It can be anything you like."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "boto_session = boto3.session.Session()\n",
    "region = boto_session.region_name\n",
    "# Make sure that your running training in the same region as your S3 bucket\n",
    "# Generally bad to be running training in Oregon but reading data in Virginia\n",
    "os.environ['AWS_DEFAULT_REGION'] = region # This is the region we set at the beginning, when creating the S3 bucket for our data\n",
    "\n",
    "# this is all for naming\n",
    "date_str=datetime.now().strftime(\"%d-%m-%Y\")\n",
    "time_str=datetime.now().strftime(\"%d-%m-%Y-%H-%M-%S\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# specify training type, s3 src and nodes\n",
    "instance_type=\"ml.p4d.24xlarge\" # This can be any of 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge', 'ml.p3.8xlarge', 'ml.p3.2xlarge', 'ml.g4dn.12xlarge'\n",
    "nodes=2\n",
    "s3_location=os.path.join(\"s3://\", S3_BUCKET, S3_DIR)\n",
    "role=get_execution_role() #give Sagemaker permission to launch nodes on our behalf\n",
    "source_dir='.'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "dist_config_file = f\"configs/distributed_config.yaml\" #f\"configs/dist-training-config.yaml\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataloader settings\n",
    "cfg.DATALOADER.SIZE_DIVISIBILITY=32\n",
    "cfg.DATALOADER.NUM_WORKERS=0\n",
    "\n",
    "cfg.MODEL.META_ARCHITECTURE=\"GeneralizedRCNN\" # The type of model we're training. found in amazon-sagemaker-cv/pytorch/sagemakercv/detection/detector/generalized_rcnn.py\n",
    "cfg.MODEL.RESNETS.TRANS_FUNC=\"BottleneckWithFixedBatchNorm\" # Type of bottleneck function in the Resnet50 backbone. see https://arxiv.org/abs/1512.03385\n",
    "cfg.MODEL.BACKBONE.CONV_BODY=\"R-50-FPN\" # Type of backbone, Resnet50 with feature pyramid network\n",
    "cfg.MODEL.BACKBONE.OUT_CHANNELS=256 # number of channels on the output feature maps from the backbone\n",
    "cfg.MODEL.RPN.USE_FPN=True # Use Feature Pyramid. RPN needs to know this since FPN adds an extra feature map\n",
    "cfg.MODEL.RPN.ANCHOR_STRIDE=(4, 8, 16, 32, 64) # positions of anchors, see blog posts for details\n",
    "cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN=2000 # top N anchors to keep before non-max suppression during training\n",
    "cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST=1000 # top N anchors to keep before non-max suppression during testing\n",
    "cfg.MODEL.RPN.POST_NMS_TOP_N_TEST=1000 # top N anchors to keep after non-max suppression during testing\n",
    "cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN=1000 # top N anchors to keep after non-max suppression during training\n",
    "cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST=1000 # top N anchors to keep before non-max suppression during training\n",
    "cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_PER_IMAGE=True # Run NMS per FPN level\n",
    "cfg.MODEL.RPN.LS=0.1 # label smoothing improves performance on less common categories\n",
    "\n",
    "# ROI Heads\n",
    "cfg.MODEL.ROI_HEADS.USE_FPN=True # Use Feature Pyramid. ROI needs to know this since FPN adds an extra feature map\n",
    "cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS=(10., 10., 5., 5.) # Regression wieghts for bounding boxes, see blog posts\n",
    "cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION=7 # Pixel size of region cropped from feature map\n",
    "cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES=(0.25, 0.125, 0.0625, 0.03125) # Pooling for ROI align\n",
    "cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO=2 # Sampling for ROI Align\n",
    "cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR=\"FPN2MLPFeatureExtractor\" # Type of ROI feature extractor found in SageMakerCV core utils\n",
    "cfg.MODEL.ROI_BOX_HEAD.PREDICTOR=\"FPNPredictor\" # Predictor type used for inference found in SageMakerCV core utils\n",
    "cfg.MODEL.ROI_BOX_HEAD.LOSS=\"GIoULoss\" # Use GIoU loss, improves box performance https://giou.stanford.edu/GIoU.pdf\n",
    "cfg.MODEL.ROI_BOX_HEAD.DECODE=True # Convert boxes to pixel positions\n",
    "cfg.MODEL.ROI_BOX_HEAD.CARL=True # Use carl loss https://arxiv.org/pdf/1904.04821.pdf\n",
    "cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES=(0.25, 0.125, 0.0625, 0.03125) # Mask head ROI align\n",
    "cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR=\"MaskRCNNFPNFeatureExtractor\" # Mask feature extractor type in SageMakerCV core utils\n",
    "cfg.MODEL.ROI_MASK_HEAD.PREDICTOR=\"MaskRCNNC4Predictor\" # Predictor used for inference\n",
    "cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION=14 # Pixel size of region cropped from feature map\n",
    "cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO=2 # ROI align sampling ratio\n",
    "cfg.MODEL.ROI_MASK_HEAD.RESOLUTION=28 # output resolution of mask\n",
    "cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR=False # share feature extractor between box and mask heads\n",
    "cfg.MODEL.MASK_ON=True # use mask head\n",
    "\n",
    "cfg.SOLVER.OPTIMIZER=\"NovoGrad\" # Type of optimizer, NovoGrad, Adam, SGD, Lamb\n",
    "cfg.SOLVER.BASE_LR=0.028 # Learning rate after warmup\n",
    "cfg.SOLVER.BETA1=0.9 # Beta value for Novograd, Adam, and Lamb\n",
    "cfg.SOLVER.BETA2=0.3 # Beta value for Novograd, Adam, and Lamb\n",
    "cfg.SOLVER.ALPHA=.001 # Alpha for final value of cosine decay\n",
    "cfg.SOLVER.LR_SCHEDULE=\"COSINE\" # Decay type, COSINE or MULTISTEP\n",
    "cfg.SOLVER.IMS_PER_BATCH=192 # Global training batch size, must be a multiple of the number of GPUs\n",
    "cfg.SOLVER.WEIGHT_DECAY=0.001 # Training weight decay applied as decoupled weight decay on optimizer\n",
    "cfg.SOLVER.MAX_ITER=9000 # Total number of training steps\n",
    "cfg.SOLVER.WARMUP_FACTOR=.01 # Starting learning rate as a multiple of the BASE_LR\n",
    "cfg.SOLVER.WARMUP_ITERS=625 # Number of warmup steps to reach BASE_LR\n",
    "cfg.SOLVER.GRADIENT_CLIPPING=0.0 # Gradient clipping norm, leave as 0.0 to disable gradient clipping\n",
    "cfg.OPT_LEVEL=\"O4\" # Mixed precision optimization level\n",
    "cfg.TEST.IMS_PER_BATCH=64 # Evaluation batch size, must be a multiple of the number of GPUs\n",
    "cfg.TEST.PER_EPOCH_EVAL=False # Eval after every epoch or only at the end of training\n",
    "\n",
    "job_name = f'{user_id}-{time_str}'\n",
    "output_path = os.path.join(s3_location, \"sagemaker-output\", date_str, job_name)\n",
    "code_location = os.path.join(s3_location, \"sagemaker-code\", date_str, job_name)\n",
    "\n",
    "channels = {'validation': os.path.join(S3_DATA_LOCATION, 'val2017'),\n",
    "            'weights': S3_WEIGHTS_LOCATION,\n",
    "            'annotations': os.path.join(S3_DATA_LOCATION, 'annotations')}\n",
    "\n",
    "CHANNELS_DIR='/opt/ml/input/data/' # on node\n",
    "cfg.INPUT.VAL_INPUT_DIR = os.path.join(CHANNELS_DIR, 'validation') # Corresponds to the vdalidation key in the channels\n",
    "cfg.INPUT.TRAIN_ANNO_DIR = os.path.join(CHANNELS_DIR, 'annotations', 'instances_train2017.json')\n",
    "cfg.INPUT.VAL_ANNO_DIR = os.path.join(CHANNELS_DIR, 'annotations', 'instances_val2017.json')\n",
    "cfg.MODEL.WEIGHT=os.path.join(CHANNELS_DIR, 'weights', R50_WEIGHTS) # backbone weights file\n",
    "cfg.INPUT.TRAIN_INPUT_DIR = os.path.join(S3_DATA_LOCATION, \"train2017\") # Set to S3 location so we use the S3 plugin\n",
    "cfg.OUTPUT_DIR = '/opt/ml/checkpoints'\n",
    "cfg.DATALOADER.NUM_WORKERS=12 \n",
    "\n",
    "cfg.HOOKS=[\"DetectronCheckpointHook\",\n",
    "           \"AMP_Hook\",\n",
    "           \"IterTimerHook\",\n",
    "           \"TextLoggerHook\",\n",
    "           \"COCOEvaluation\"]\n",
    "\n",
    "with open(dist_config_file, 'w') as outfile:\n",
    "    with redirect_stdout(outfile): print(cfg.dump())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "if nodes>1 and instance_type in ['ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge']:\n",
    "    distribution = { \"smdistributed\": { \"dataparallel\": { \"enabled\": True } } } \n",
    "    entry_point = \"train.py\"\n",
    "else:\n",
    "    distribution = None\n",
    "    entry_point = \"launch_ddp.py\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "hyperparameters = {\"config\": dist_config_file}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = PyTorch(\n",
    "                entry_point=entry_point, \n",
    "                source_dir=source_dir, \n",
    "                py_version='py3',\n",
    "                framework_version='1.8.1', # 1.6 - 1.8 supported\n",
    "                role=role,\n",
    "                instance_count=nodes,\n",
    "                instance_type=instance_type,\n",
    "                distribution=distribution,\n",
    "                output_path=output_path,\n",
    "                checkpoint_s3_uri=output_path,\n",
    "                model_dir=output_path,\n",
    "                hyperparameters=hyperparameters,\n",
    "                volume_size=500,\n",
    "                code_location=code_location,\n",
    "                disable_profiler=True, # Reduce number of logs since we don't need profiler or debugger for this training\n",
    "                debugger_hook_config=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator.fit(channels, wait=False, job_name=job_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "instance_type": "ml.g4dn.xlarge",
  "kernelspec": {
   "display_name": "Python 3 (PyTorch 1.6 Python 3.6 GPU Optimized)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/pytorch-1.6-gpu-py36-cu110-ubuntu18.04-v3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}