## SageMakerCV

This notebook launches a SageMakerCV training job for PyTorch.

In [18]:
#!pip install -e .

In [1]:
import os
import posixpath
from contextlib import redirect_stdout
from datetime import datetime

import boto3
import yaml
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

from configs import cfg

In [39]:
# --- S3 layout for this tutorial ---------------------------------------------
S3_BUCKET = 'sagemaker-smcv-tutorial'  # Bucket name only; don't include s3:// here
S3_DIR = 'smcv-pytorch-tutorial'  # Key prefix inside the bucket

# S3 URIs always use forward slashes, so they are joined with posixpath instead of
# os.path (whose separator depends on the machine running the notebook).
S3_DATA_LOCATION = posixpath.join("s3://", S3_BUCKET, S3_DIR, "data", "coco")
S3_WEIGHTS_LOCATION = posixpath.join("s3://", S3_BUCKET, S3_DIR, "data", "weights")
R50_WEIGHTS = "resnet50.pkl"  # File name of the backbone weights inside the weights location

# Used for naming your training job and organizing your results on S3.
# It can be anything you like.
user_id = "username-smcv-tutorial"

In [40]:
boto_session = boto3.session.Session()
region = boto_session.region_name
if region is None:
    # os.environ only accepts strings; fail here with an actionable message
    # instead of an opaque TypeError on the assignment below.
    raise ValueError(
        "No default AWS region is configured for this boto3 session; "
        "configure one (for example via AWS_DEFAULT_REGION) before launching training."
    )
# Make sure that you're running training in the same region as your S3 bucket.
# Generally bad to be running training in Oregon but reading data in Virginia.
os.environ['AWS_DEFAULT_REGION'] = region  # export the session's region as the process-wide default

# This is all for naming. Take a single clock reading so that date_str and
# time_str always describe the same instant (two separate now() calls could
# straddle midnight and disagree on the date).
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")

In [41]:
# Specify training hardware, the S3 root for this tutorial, the IAM role and the code to upload.
# instance_type can be any of 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge',
# 'ml.p3.8xlarge', 'ml.p3.2xlarge', 'ml.g4dn.12xlarge'
instance_type = "ml.p4d.24xlarge"
nodes = 2  # number of training instances
# S3 URIs always use forward slashes, so join with posixpath rather than the
# host-dependent os.path.
s3_location = posixpath.join("s3://", S3_BUCKET, S3_DIR)
role = get_execution_role()  # give SageMaker permission to launch nodes on our behalf
source_dir = '.'  # directory (relative to this notebook) handed to the estimator as source_dir

In [42]:
# Path of the YAML file the training configuration is written to further down;
# the same path is handed to the training job through the "config" hyperparameter.
dist_config_file = "configs/distributed_config.yaml"

In [43]:
# dataloader settings
cfg.DATALOADER.SIZE_DIVISIBILITY=32
# NOTE: DATALOADER.NUM_WORKERS is intentionally not set here. The value that takes
# effect is the one assigned further down in this cell, next to the input paths;
# the `NUM_WORKERS=0` that used to sit here was always overwritten by it.

cfg.MODEL.META_ARCHITECTURE="GeneralizedRCNN" # The type of model we're training. found in amazon-sagemaker-cv/pytorch/sagemakercv/detection/detector/generalized_rcnn.py
cfg.MODEL.RESNETS.TRANS_FUNC="BottleneckWithFixedBatchNorm" # Type of bottleneck function in the Resnet50 backbone. see https://arxiv.org/abs/1512.03385
cfg.MODEL.BACKBONE.CONV_BODY="R-50-FPN" # Type of backbone, Resnet50 with feature pyramid network
cfg.MODEL.BACKBONE.OUT_CHANNELS=256 # number of channels on the output feature maps from the backbone
cfg.MODEL.RPN.USE_FPN=True # Use Feature Pyramid. RPN needs to know this since FPN adds an extra feature map
cfg.MODEL.RPN.ANCHOR_STRIDE=(4, 8, 16, 32, 64) # positions of anchors, see blog posts for details
cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN=2000 # top N anchors to keep before non-max suppression during training
cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST=1000 # top N anchors to keep before non-max suppression during testing
cfg.MODEL.RPN.POST_NMS_TOP_N_TEST=1000 # top N anchors to keep after non-max suppression during testing
cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN=1000 # top N anchors to keep after non-max suppression during training
cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST=1000 # top N anchors to keep after non-max suppression during testing
cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_PER_IMAGE=True # Run NMS per FPN level
cfg.MODEL.RPN.LS=0.1 # label smoothing improves performance on less common categories

# ROI heads: shared settings first, then the box head, then the mask head.
# Each sub-node is bound to a short local name so the assignments read as a group.
roi_heads = cfg.MODEL.ROI_HEADS
roi_heads.USE_FPN = True  # ROI heads need to know about FPN since it adds an extra feature map
roi_heads.BBOX_REG_WEIGHTS = (10., 10., 5., 5.)  # Regression weights for bounding boxes, see blog posts

# ROI align pooling scales, used by both the box head and the mask head
fpn_pooler_scales = (0.25, 0.125, 0.0625, 0.03125)

box_head = cfg.MODEL.ROI_BOX_HEAD
box_head.POOLER_RESOLUTION = 7  # Pixel size of region cropped from feature map
box_head.POOLER_SCALES = fpn_pooler_scales
box_head.POOLER_SAMPLING_RATIO = 2  # Sampling for ROI align
box_head.FEATURE_EXTRACTOR = "FPN2MLPFeatureExtractor"  # ROI feature extractor type, found in SageMakerCV core utils
box_head.PREDICTOR = "FPNPredictor"  # Predictor type used for inference, found in SageMakerCV core utils
box_head.LOSS = "GIoULoss"  # GIoU loss, improves box performance https://giou.stanford.edu/GIoU.pdf
box_head.DECODE = True  # Convert boxes to pixel positions
box_head.CARL = True  # Use CARL loss https://arxiv.org/pdf/1904.04821.pdf

mask_head = cfg.MODEL.ROI_MASK_HEAD
mask_head.POOLER_SCALES = fpn_pooler_scales
mask_head.FEATURE_EXTRACTOR = "MaskRCNNFPNFeatureExtractor"  # Mask feature extractor type in SageMakerCV core utils
mask_head.PREDICTOR = "MaskRCNNC4Predictor"  # Predictor used for inference
mask_head.POOLER_RESOLUTION = 14  # Pixel size of region cropped from feature map
mask_head.POOLER_SAMPLING_RATIO = 2  # ROI align sampling ratio
mask_head.RESOLUTION = 28  # Output resolution of mask
mask_head.SHARE_BOX_FEATURE_EXTRACTOR = False  # Whether box and mask heads share a feature extractor

cfg.MODEL.MASK_ON = True  # Use the mask head

# Optimizer and learning-rate schedule
solver = cfg.SOLVER
solver.OPTIMIZER = "NovoGrad"  # One of NovoGrad, Adam, SGD, Lamb
solver.BASE_LR = 0.028  # Learning rate reached once warmup has finished
solver.BETA1 = 0.9  # Beta value for Novograd, Adam, and Lamb
solver.BETA2 = 0.3  # Beta value for Novograd, Adam, and Lamb
solver.ALPHA = 0.001  # Alpha for final value of cosine decay
solver.LR_SCHEDULE = "COSINE"  # Decay type: COSINE or MULTISTEP
solver.IMS_PER_BATCH = 192  # Global training batch size; must be a multiple of the number of GPUs
solver.WEIGHT_DECAY = 0.001  # Applied as decoupled weight decay on the optimizer
solver.MAX_ITER = 9000  # Total number of training steps
solver.WARMUP_FACTOR = 0.01  # Starting learning rate as a multiple of BASE_LR
solver.WARMUP_ITERS = 625  # Number of warmup steps to reach BASE_LR
solver.GRADIENT_CLIPPING = 0.0  # Clipping norm; 0.0 disables gradient clipping

# Mixed precision optimization level
cfg.OPT_LEVEL = "O4"

# Evaluation
evaluation = cfg.TEST
evaluation.IMS_PER_BATCH = 64  # Evaluation batch size; must be a multiple of the number of GPUs
evaluation.PER_EPOCH_EVAL = False  # Evaluate after every epoch (True) or only at the end of training (False)

# Name of this training job, and the S3 prefixes that receive its outputs and its
# uploaded source code. These are S3 URIs, so they are joined with posixpath
# (always "/") rather than the host-dependent os.path.
job_name = f'{user_id}-{time_str}'
output_path = posixpath.join(s3_location, "sagemaker-output", date_str, job_name)
code_location = posixpath.join(s3_location, "sagemaker-code", date_str, job_name)

# Input channels handed to estimator.fit(): channel name -> S3 URI.
# The URIs are joined with posixpath so the separator is "/" on every platform.
channels = {
    'validation': posixpath.join(S3_DATA_LOCATION, 'val2017'),
    'weights': S3_WEIGHTS_LOCATION,
    'annotations': posixpath.join(S3_DATA_LOCATION, 'annotations'),
}

# Paths as seen from inside the training container. They are joined with posixpath
# because they are container paths / S3 URIs, not paths on the notebook's own machine.
CHANNELS_DIR='/opt/ml/input/data/' # on node
cfg.INPUT.VAL_INPUT_DIR = posixpath.join(CHANNELS_DIR, 'validation') # Corresponds to the validation key in the channels
cfg.INPUT.TRAIN_ANNO_DIR = posixpath.join(CHANNELS_DIR, 'annotations', 'instances_train2017.json')
cfg.INPUT.VAL_ANNO_DIR = posixpath.join(CHANNELS_DIR, 'annotations', 'instances_val2017.json')
cfg.MODEL.WEIGHT = posixpath.join(CHANNELS_DIR, 'weights', R50_WEIGHTS) # backbone weights file
cfg.INPUT.TRAIN_INPUT_DIR = posixpath.join(S3_DATA_LOCATION, "train2017") # Set to S3 location so we use the S3 plugin
cfg.OUTPUT_DIR = '/opt/ml/checkpoints'
cfg.DATALOADER.NUM_WORKERS=12

# Names of the training hooks to enable, in the order they are listed in the config.
cfg.HOOKS = [
    "DetectronCheckpointHook",
    "AMP_Hook",
    "IterTimerHook",
    "TextLoggerHook",
    "COCOEvaluation",
]

# Serialize the assembled configuration to the YAML file passed to the training job.
# Printing straight into the file avoids temporarily swapping the process-wide
# sys.stdout, which would also capture any unrelated output emitted meanwhile.
with open(dist_config_file, 'w') as outfile:
    print(cfg.dump(), file=outfile)

In [44]:
# Instance types for which this notebook enables SageMaker's distributed data parallel
# library; that path is only taken for multi-node jobs. Everything else is launched
# through launch_ddp.py with no distribution config.
SMDDP_INSTANCE_TYPES = ('ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge')
use_smddp = nodes > 1 and instance_type in SMDDP_INSTANCE_TYPES

distribution = {"smdistributed": {"dataparallel": {"enabled": True}}} if use_smddp else None
entry_point = "train.py" if use_smddp else "launch_ddp.py"

In [45]:
# Hyperparameters forwarded to the entry point: just the path of the config file written above.
hyperparameters = dict(config=dist_config_file)

In [46]:
# Gather the estimator arguments in one place, then build the PyTorch estimator from them.
estimator_settings = dict(
    # what to run
    entry_point=entry_point,
    source_dir=source_dir,
    py_version='py3',
    framework_version='1.8.1',  # 1.6 - 1.8 supported
    hyperparameters=hyperparameters,
    # where to run it
    role=role,
    instance_count=nodes,
    instance_type=instance_type,
    distribution=distribution,
    volume_size=500,
    # where results go: outputs, checkpoints and model_dir all share the same S3 prefix
    output_path=output_path,
    checkpoint_s3_uri=output_path,
    model_dir=output_path,
    code_location=code_location,
    # Reduce number of logs since we don't need profiler or debugger for this training
    disable_profiler=True,
    debugger_hook_config=False,
)
estimator = PyTorch(**estimator_settings)

In [47]:
# Submit the training job with the input channels and return immediately (wait=False)
# instead of blocking this cell until the job finishes.
estimator.fit(inputs=channels, wait=False, job_name=job_name)