In [20]:
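# One-time setup: uncomment to install this package into the kernel's environment in editable mode.
# %pip install -e .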

In [1]:
import os
import subprocess
from datetime import datetime

from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow

import yaml
from contextlib import redirect_stdout
import boto3
from configs import cfg

#### Set S3 Locations and Job Name

In [17]:
# S3 locations used for the tutorial data and the training outputs.
S3_BUCKET = 'sagemaker-smcv-tutorial'  # Bucket name only; don't include s3:// in your bucket name
S3_DIR = 'smcv-tensorflow-tutorial'  # Key prefix inside the bucket
# For reasons detailed in Distributed Training, do not put this dir in the SageMakerCV dir
LOCAL_DATA_DIR = '/root/smcv-tensorflow-tutorial'

# Fail early on the mistake the note above warns about, instead of silently
# building a malformed URI such as "s3://s3://bucket/...".
if S3_BUCKET.startswith('s3://'):
    raise ValueError(f"S3_BUCKET must be a bare bucket name, got {S3_BUCKET!r}")

# An S3 URI always uses '/' separators, so build it as a string rather than with
# os.path.join, which follows the path rules of the local operating system.
S3_SRC = f"s3://{S3_BUCKET}/{S3_DIR}"

In [18]:
# Resolve the AWS region from the default boto3 session and export it through
# the AWS_DEFAULT_REGION environment variable.
boto_session = boto3.session.Session()
region = boto_session.region_name
if region is None:
    # Without a configured region, region_name is None and assigning None to
    # os.environ would fail with an unhelpful TypeError.
    raise RuntimeError(
        "No default AWS region is configured for this session; "
        "configure a region before running this notebook."
    )
# Expected to be the region chosen at the beginning, when the S3 bucket for the data was created
os.environ['AWS_DEFAULT_REGION'] = region

# Naming: these values label the training job and organize its results on S3.
user_id = "jbsnyder-smcv-tutorial"  # Free-form identifier; it can be anything you like.
# Take one timestamp and format it twice, so the date and time strings cannot
# disagree when the cell happens to run across midnight.
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")

In [19]:
# Training job resources: code location, instance type, node count and IAM role.
entry_point = 'train.py'  # script handed to the estimator as its entry point
source_dir = '.'  # directory handed to the estimator as its source dir
# instance_type can be any of:
#   'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge',
#   'ml.p3.8xlarge', 'ml.p3.2xlarge', 'ml.g4dn.12xlarge'
instance_type = "ml.p4d.24xlarge"
nodes = 1  # number of instances requested for the job
role = get_execution_role()  # give SageMaker permission to launch nodes on our behalf

In [20]:
# File the training configuration is dumped to; its path is later passed to the
# job as the "config" hyperparameter.
# Alternative file name kept from the original cell: "configs/dist-training-config.yaml"
dist_config_file = "configs/1_node.yaml"  # plain literal: there is nothing to interpolate

In [21]:
# Logging and model options, set on the `cfg` object imported from `configs`.
cfg.LOG_INTERVAL = 50 # Number of training steps between log messages
cfg.MODEL.DENSE.PRE_NMS_TOP_N_TRAIN = 2000 # Top regions of interest to select before NMS
cfg.MODEL.DENSE.POST_NMS_TOP_N_TRAIN = 1000 # Top regions of interest to select after NMS
cfg.MODEL.RCNN.ROI_HEAD = "StandardRoIHead" # RoI head, selected by name
cfg.MODEL.FRCNN.LOSS_TYPE = "giou" # presumably the box regression loss (generalized IoU) — TODO confirm
cfg.MODEL.FRCNN.LABEL_SMOOTHING = 0.1 # label smoothing for box head
cfg.MODEL.FRCNN.CARL = True # NOTE(review): meaning of CARL is not visible here — confirm against the config schema

In [22]:
# Input batch sizes and solver (optimizer, learning-rate schedule, precision) options.
cfg.INPUT.TRAIN_BATCH_SIZE = 48 # Training batch size
cfg.INPUT.EVAL_BATCH_SIZE = 32 # Evaluation batch size
cfg.SOLVER.SCHEDULE = "CosineDecay" # Learning rate schedule, either CosineDecay or PiecewiseConstantDecay
cfg.SOLVER.OPTIMIZER = "NovoGrad" # Optimizer type NovoGrad or Momentum
cfg.SOLVER.MOMENTUM = 0.9 # presumably only used by the Momentum optimizer — TODO confirm
cfg.SOLVER.WARM_UP_RATIO = 0.01 # NOTE(review): WARMUP_STEPS is also set below — confirm how the two interact
cfg.SOLVER.LR = .008 # Base learning rate after warmup
cfg.SOLVER.BETA_1 = 0.9 # NovoGrad beta 1 value
cfg.SOLVER.BETA_2 = 0.6 # NovoGrad beta 2 value
cfg.SOLVER.MAX_ITERS = 22000 # Total training steps
cfg.SOLVER.WARMUP_STEPS = 750 # Warmup steps
cfg.SOLVER.XLA = True # Train with XLA
cfg.SOLVER.FP16 = True # Train with mixed precision enabled
cfg.SOLVER.TF32 = True # Train with TF32 data type enabled, only available on Ampere GPUs and TF 2.4 and up
cfg.SOLVER.EVAL_EPOCH_EVAL = False # Only run eval at end
cfg.SOLVER.ALPHA = 0.025 # Final learning rate as multiplier of initial learning rate
cfg.SOLVER.WEIGHT_DECAY = 0.0001 # Optimizer weight decay

In [23]:
# Hooks enabled for this run, listed by name (one per line so they are easy to toggle).
cfg.HOOKS = [
    "CheckpointHook",
    "IterTimerHook",
    "TextLoggerHook",
    "CocoEvaluator",
]

In [24]:
# Choose the distributed-training setting that is handed to the estimator.
# NOTE(review): `nodes > 0` holds for any valid node count, so the choice is effectively
# made on instance type alone — confirm single-node runs are meant to use smdistributed.
if nodes <= 0 or instance_type not in ('ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge'):
    # Every other case runs over MPI with these custom options.
    custom_mpi_options = [
        f'-x {setting}'
        for setting in (
            'FI_EFA_USE_DEVICE_RDMA=1',
            'OMPI_MCA_btl_vader_single_copy_mechanism=none',
            'TF_CUDNN_USE_AUTOTUNE=0',
            'NCCL_MIN_NRINGS=0',
        )
    ]
    distribution = {
        "mpi": {
            "enabled": True,
            "custom_mpi_options": " ".join(custom_mpi_options),
        }
    }
else:
    # Listed instance types use the smdistributed data-parallel setting.
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

In [25]:
# Job name is "<user_id>-<timestamp>"; outputs and uploaded code are filed on S3
# under <S3_SRC>/<kind>/<date>/<job_name>.
job_name = "-".join((user_id, time_str))
output_path, code_location = (
    os.path.join(S3_SRC, s3_folder, date_str, job_name)
    for s3_folder in ("sagemaker-output", "sagemaker-code")
)

In [26]:
# Input channels for the job: channel name -> S3 location under <S3_SRC>/data.
channels = {
    channel: os.path.join(S3_SRC, 'data', *subpath)
    for channel, subpath in (
        ('val2017', ('coco', 'tfrecord', 'val2017')),
        ('annotations', ('coco', 'annotations')),
        ('weights', ('weights', 'resnet')),
    )
}

In [27]:
# Paths recorded in the config for the training script.
CHANNELS_DIR = '/opt/ml/input/data/'  # on node

# Training records are addressed by S3 URI; no train channel is declared in `channels`.
cfg.PATHS.TRAIN_FILE_PATTERN = os.path.join(
    S3_SRC, 'data', 'coco', 'tfrecord', 'train2017', 'train*'
)

# The remaining inputs are resolved under CHANNELS_DIR, one sub-directory per channel name.
cfg.PATHS.VAL_FILE_PATTERN = os.path.join(
    CHANNELS_DIR, "val2017", "val*"
)
cfg.PATHS.WEIGHTS = os.path.join(
    CHANNELS_DIR, "weights", "resnet50.ckpt"
)
cfg.PATHS.VAL_ANNOTATIONS = os.path.join(
    CHANNELS_DIR, "annotations", "instances_val2017.json"
)

# Output directory recorded in the config.
cfg.PATHS.OUT_DIR = '/opt/ml/checkpoints'

In [28]:
# Dump the assembled config to dist_config_file. stdout is redirected into the
# file while printing, so the dumped text (plus print's trailing newline) lands there.
with open(dist_config_file, 'w') as outfile, redirect_stdout(outfile):
    print(cfg.dump())

In [29]:
# The only hyperparameter handed to the job is the path of the dumped config file.
hyperparameters = dict(config=dist_config_file)

In [30]:
# Configure the TensorFlow estimator; the job itself is submitted by the later fit() call.
estimator = TensorFlow(
    # Code to run and the framework it runs on
    source_dir=source_dir,
    entry_point=entry_point,
    framework_version='2.4.1',  # 2.3-2.5 supported
    py_version='py37',
    hyperparameters=hyperparameters,
    # Compute
    role=role,
    instance_type=instance_type,
    instance_count=nodes,
    volume_size=500,
    distribution=distribution,
    # S3 locations: output, checkpoints and model dir all point at output_path
    output_path=output_path,
    checkpoint_s3_uri=output_path,
    model_dir=output_path,
    code_location=code_location,
    # Profiler and debugger hook are switched off
    disable_profiler=True,
    debugger_hook_config=False,
)

In [31]:
# Submit the training job under job_name with the declared input channels;
# wait=False means the call is not asked to block until the job finishes.
estimator.fit(inputs=channels, job_name=job_name, wait=False)