# Train Detectron2 with SageMaker Training Jobs

In [None]:
# Define IAM role
import boto3
import re

import os
import numpy as np
import pandas as pd
! pip install sagemaker==2.15.0
from sagemaker import get_execution_role

# Look up the IAM role with sagemaker's get_execution_role(); the result is
# passed as `role=` to the PyTorch estimator later in this notebook.
role = get_execution_role()

In [None]:
import sagemaker
from time import gmtime, strftime

# Placeholders: replace with your own bucket name and AWS region.
bucket = '<MY_BUCKET>'
region = "<MY_REGION>"

# S3 key prefix where your COCO data resides.
# Note: upload the COCO data from the previous step to this prefix before
# training. We recommend using `aws s3 sync`.
prefix_input = 'training/data'

# S3 key prefix where you'd like the training output to be stored.
prefix_output = 'training/d2-output'

# SageMaker session used by the estimator.
# LocalSession() can be used instead to run the container locally.
sess = sagemaker.Session()

# Account id of the caller, read from the STS caller identity.
account = (
    sess.boto_session
    .client('sts')
    .get_caller_identity()['Account']
)

# Configure Training Job

Define the algorithm metrics that SageMaker will scrape from the training logs, persist, and render in the training job console.

In [None]:
# 

# Metric definitions passed to the estimator via `metric_definitions=`.
# Each entry pairs a metric name with a regex whose single capture group
# extracts the metric's value.
#
# The patterns are raw strings so that `\s`, `\W`, `\S` and `\.` reach the
# regex engine verbatim. As plain strings they are invalid escape sequences,
# which Python reports with a DeprecationWarning / SyntaxWarning. The pattern
# text itself is unchanged.
#
# NOTE(review): the capture group `[0-9\.]+` accepts only digits and dots, so a
# value logged in another form (e.g. an H:MM:SS duration after `eta:`, or a
# number in scientific notation) would be captured only up to the first other
# character. Confirm against the actual training log format.
metric_definitions = [
    {
        "Name": "total_loss",
        "Regex": r".*total_loss:\s([0-9\.]+)\s*",
    },
    {
        "Name": "loss_cls",
        "Regex": r".*loss_cls:\s([0-9\.]+)\s*",
    },
    {
        "Name": "loss_box_reg",
        "Regex": r".*loss_box_reg:\s([0-9\.]+)\s*",
    },
    {
        "Name": "loss_mask",
        "Regex": r".*loss_mask:\s([0-9\.]+)\s*",
    },
    {
        "Name": "loss_rpn_cls",
        "Regex": r".*loss_rpn_cls:\s([0-9\.]+)\s*",
    },
    {
        "Name": "loss_rpn_loc",
        "Regex": r".*loss_rpn_loc:\s([0-9\.]+)\s*",
    },
    {
        "Name": "overall_training_speed",
        "Regex": r".*Overall training speed:\s([0-9\.]+)\s*",
    },
    {
        "Name": "lr",
        "Regex": r".*lr:\s([0-9\.]+)\s*",
    },
    {
        "Name": "iter",
        "Regex": r".*iter:\s([0-9\.]+)\s*",
    },
    {
        # Template entry: replace <OBJECT1> with one of your object classes to
        # plot its Average Precision on the training job. Duplicate this
        # dictionary as many times as you want, one per class.
        "Name": "AP <OBJECT1>",
        "Regex": r".*<OBJECT1>\s*\W\S\s([0-9\.]+)\s*",
    },
    {
        # Second template entry; same usage as above, for <OBJECT2>.
        "Name": "AP <OBJECT2>",
        "Regex": r".*<OBJECT2>\s*\W\S\s([0-9\.]+)\s*",
    },
    {
        "Name": "Estimated Training Time Left",
        "Regex": r".*eta:\s([0-9\.]+)\s*",
    },
]

# Print the S3 URI built from `bucket` and `prefix_input` (with a trailing slash).
print(f"s3://{bucket}/{prefix_input}/")

In [None]:
# Detectron2 config overrides, written as (key, value) pairs and flattened
# below into one alternating key/value list of strings.
_d2_override_pairs = (
    ('MODEL.ROI_HEADS.NUM_CLASSES', '2'),
    ('SOLVER.REFERENCE_WORLD_SIZE', '8'),
    # Small-experiment setting: caps the run at 100 iterations.
    ('SOLVER.MAX_ITER', '100'),
    ('MODEL.WEIGHTS', 'https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/model_final_f6e8b1.pkl'),
    # There are 5 stages in ResNet. The first is a convolution, and the
    # following stages are each a group of residual blocks.
    ('MODEL.BACKBONE.FREEZE_AT', '2'),
)

# Further overrides that are currently disabled (add a pair above to enable):
#     'INPUT.MIN_SIZE_TRAIN'
#     'INPUT.MAX_SIZE_TRAIN'
#     'INPUT.MIN_SIZE_TEST'
#     'INPUT.MAX_SIZE_TEST'
#     'INPUT.CROP.TYPE', 'relative_range',
#     'INPUT.CROP.SIZE', '(0.9, 0.9)',
#     INPUT.FORMAT -- VERIFY THIS! Needed? Maybe only for seg
#     MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
#     MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
#     SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
#     SOLVER.BASE_LR = 0.001
#     SOLVER.MOMENTUM = 0.9
#     SOLVER.WEIGHT_DECAY = 0.0001
#     SOLVER.WEIGHT_DECAY_NORM = 0.0
#     SOLVER.GAMMA = 0.1
#     SOLVER.STEPS = (30000,)

d2_configs = [token for pair in _d2_override_pairs for token in pair]

# Preview the space-separated string these overrides produce.
' '.join(d2_configs)

In [None]:
from time import gmtime, strftime

from sagemaker.pytorch import PyTorch

# Hyperparameters passed to the estimator for the `train.py` entry point.
hyperparameters = {
    "local-config-file": "faster_rcnn_R_101_FPN_3x.yaml",
    "resume": "True",
    # Detectron2 config overrides as a single space-separated string; see
    # https://detectron2.readthedocs.io/modules/config.html#config-references
    "opts": ' '.join(d2_configs),
}

# PyTorch estimator that runs `train.py` from the local `source` directory.
d2 = PyTorch(
    'train.py',
    role=role,
    max_run=3 * 24 * 60 * 60,  # 3 days in seconds
    source_dir='source',
    framework_version='1.6.0',
    py_version='py3',
    instance_count=1,
    instance_type='ml.p3.16xlarge',
    volume_size=100,
    output_path=f"s3://{bucket}/{prefix_output}",
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    sagemaker_session=sess,
)

# SageMaker training job names must be unique within an account and region,
# so a fixed name makes every re-run of this cell fail. A UTC timestamp suffix
# keeps the "d2-model" prefix while making each launch distinct.
job_name = "d2-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Launch the training job on the COCO data under `prefix_input`; wait=True
# blocks until the job finishes.
d2.fit(
    f"s3://{bucket}/{prefix_input}",
    job_name=job_name,
    wait=True,
)

In [None]:
# Describe the most recent training job started from the `d2` estimator.
d2.latest_training_job.describe()