{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SageMakerCV\n",
    "\n",
    "This notebook launches a SageMakerCV training job for PyTorch.\n",
    "\n",
    "It sets the S3 locations and instance settings, writes the training configuration to a YAML file, and submits the job through the SageMaker `PyTorch` estimator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment to install the package in the current directory in editable mode\n",
    "#%pip install -e ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from datetime import datetime\n",
    "import yaml\n",
    "from contextlib import redirect_stdout\n",
    "\n",
    "from sagemaker import get_execution_role\n",
    "from sagemaker.pytorch import PyTorch\n",
    "import boto3\n",
    "from configs import cfg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### S3 locations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "S3_BUCKET = 'sagemaker-smcv-tutorial' # Don't include s3:// in your bucket name\n",
    "S3_DIR = 'smcv-pytorch-tutorial'\n",
    "S3_DATA_LOCATION=os.path.join(\"s3://\", S3_BUCKET, S3_DIR, \"data\", \"coco\")\n",
    "S3_WEIGHTS_LOCATION=os.path.join(\"s3://\", S3_BUCKET, S3_DIR, \"data\", \"weights\")\n",
    "R50_WEIGHTS=\"resnet50.pkl\"\n",
    "\n",
    "user_id=\"username-smcv-tutorial\" # This is used for naming your training job, and organizing your results on S3. It can be anything you like."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Region and job timestamps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "boto_session = boto3.session.Session()\n",
    "region = boto_session.region_name\n",
    "# region_name is None when no region is configured; fail with a clear message instead of a TypeError below\n",
    "if region is None:\n",
    "    raise RuntimeError(\"No AWS region is configured for this session; configure one before launching training.\")\n",
    "# Make sure that you're running training in the same region as your S3 bucket\n",
    "# Generally bad to be running training in Oregon but reading data in Virginia\n",
    "os.environ['AWS_DEFAULT_REGION'] = region # This is the region we set at the beginning, when creating the S3 bucket for our data\n",
    "\n",
    "# this is all for naming\n",
    "# Use a single timestamp so date_str and time_str cannot disagree when the cell runs across midnight\n",
    "launch_time = datetime.now()\n",
    "date_str = launch_time.strftime(\"%d-%m-%Y\")\n",
    "time_str = launch_time.strftime(\"%d-%m-%Y-%H-%M-%S\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Instance settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# specify training type, s3 src and nodes\n",
    "instance_type=\"ml.p4d.24xlarge\" # This can be any of 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge', 'ml.p3.8xlarge', 'ml.p3.2xlarge', 'ml.g4dn.12xlarge'\n",
    "nodes=2\n",
    "s3_location=os.path.join(\"s3://\", S3_BUCKET, S3_DIR)\n",
    "role=get_execution_role() #give Sagemaker permission to launch nodes on our behalf\n",
    "source_dir='.'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training configuration\n",
    "\n",
    "The cells below fill in `cfg` and write it to `dist_config_file`, which is passed to the training script as the `config` hyperparameter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File the config is written to and that is passed to the entry point as the `config` hyperparameter\n",
    "dist_config_file = \"configs/distributed_config.yaml\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataloader settings\n",
    "cfg.DATALOADER.SIZE_DIVISIBILITY=32\n",
    "cfg.DATALOADER.NUM_WORKERS=12\n",
    "\n",
    "cfg.MODEL.META_ARCHITECTURE=\"GeneralizedRCNN\" # The type of model we're training. found in amazon-sagemaker-cv/pytorch/sagemakercv/detection/detector/generalized_rcnn.py\n",
    "cfg.MODEL.RESNETS.TRANS_FUNC=\"BottleneckWithFixedBatchNorm\" # Type of bottleneck function in the Resnet50 backbone. see https://arxiv.org/abs/1512.03385\n",
    "cfg.MODEL.BACKBONE.CONV_BODY=\"R-50-FPN\" # Type of backbone, Resnet50 with feature pyramid network\n",
    "cfg.MODEL.BACKBONE.OUT_CHANNELS=256 # number of channels on the output feature maps from the backbone\n",
    "cfg.MODEL.RPN.USE_FPN=True # Use Feature Pyramid. RPN needs to know this since FPN adds an extra feature map\n",
    "cfg.MODEL.RPN.ANCHOR_STRIDE=(4, 8, 16, 32, 64) # positions of anchors, see blog posts for details\n",
    "cfg.MODEL.RPN.PRE_NMS_TOP_N_TRAIN=2000 # top N anchors to keep before non-max suppression during training\n",
    "cfg.MODEL.RPN.PRE_NMS_TOP_N_TEST=1000 # top N anchors to keep before non-max suppression during testing\n",
    "cfg.MODEL.RPN.POST_NMS_TOP_N_TEST=1000 # top N anchors to keep after non-max suppression during testing\n",
    "cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN=1000 # top N anchors to keep after non-max suppression during training\n",
    "cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST=1000 # top N anchors to keep after non-max suppression during testing\n",
    "cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_PER_IMAGE=True # Run NMS per FPN level\n",
    "cfg.MODEL.RPN.LS=0.1 # label smoothing improves performance on less common categories\n",
    "\n",
    "# ROI Heads\n",
    "cfg.MODEL.ROI_HEADS.USE_FPN=True # Use Feature Pyramid. ROI needs to know this since FPN adds an extra feature map\n",
    "cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS=(10., 10., 5., 5.) # Regression weights for bounding boxes, see blog posts\n",
    "cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION=7 # Pixel size of region cropped from feature map\n",
    "cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES=(0.25, 0.125, 0.0625, 0.03125) # Pooling for ROI align\n",
    "cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO=2 # Sampling for ROI Align\n",
    "cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR=\"FPN2MLPFeatureExtractor\" # Type of ROI feature extractor found in SageMakerCV core utils\n",
    "cfg.MODEL.ROI_BOX_HEAD.PREDICTOR=\"FPNPredictor\" # Predictor type used for inference found in SageMakerCV core utils\n",
    "cfg.MODEL.ROI_BOX_HEAD.LOSS=\"GIoULoss\" # Use GIoU loss, improves box performance https://giou.stanford.edu/GIoU.pdf\n",
    "cfg.MODEL.ROI_BOX_HEAD.DECODE=True # Convert boxes to pixel positions\n",
    "cfg.MODEL.ROI_BOX_HEAD.CARL=True # Use carl loss https://arxiv.org/pdf/1904.04821.pdf\n",
    "cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES=(0.25, 0.125, 0.0625, 0.03125) # Mask head ROI align\n",
    "cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR=\"MaskRCNNFPNFeatureExtractor\" # Mask feature extractor type in SageMakerCV core utils\n",
    "cfg.MODEL.ROI_MASK_HEAD.PREDICTOR=\"MaskRCNNC4Predictor\" # Predictor used for inference\n",
    "cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION=14 # Pixel size of region cropped from feature map\n",
    "cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO=2 # ROI align sampling ratio\n",
    "cfg.MODEL.ROI_MASK_HEAD.RESOLUTION=28 # output resolution of mask\n",
    "cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR=False # share feature extractor between box and mask heads\n",
    "cfg.MODEL.MASK_ON=True # use mask head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cfg.SOLVER.OPTIMIZER=\"NovoGrad\" # Type of optimizer, NovoGrad, Adam, SGD, Lamb\n",
    "cfg.SOLVER.BASE_LR=0.028 # Learning rate after warmup\n",
    "cfg.SOLVER.BETA1=0.9 # Beta value for Novograd, Adam, and Lamb\n",
    "cfg.SOLVER.BETA2=0.3 # Beta value for Novograd, Adam, and Lamb\n",
    "cfg.SOLVER.ALPHA=.001 # Alpha for final value of cosine decay\n",
    "cfg.SOLVER.LR_SCHEDULE=\"COSINE\" # Decay type, COSINE or MULTISTEP\n",
    "cfg.SOLVER.IMS_PER_BATCH=192 # Global training batch size, must be a multiple of the number of GPUs\n",
    "cfg.SOLVER.WEIGHT_DECAY=0.001 # Training weight decay applied as decoupled weight decay on optimizer\n",
    "cfg.SOLVER.MAX_ITER=9000 # Total number of training steps\n",
    "cfg.SOLVER.WARMUP_FACTOR=.01 # Starting learning rate as a multiple of the BASE_LR\n",
    "cfg.SOLVER.WARMUP_ITERS=625 # Number of warmup steps to reach BASE_LR\n",
    "cfg.SOLVER.GRADIENT_CLIPPING=0.0 # Gradient clipping norm, leave as 0.0 to disable gradient clipping\n",
    "cfg.OPT_LEVEL=\"O4\" # Mixed precision optimization level\n",
    "cfg.TEST.IMS_PER_BATCH=64 # Evaluation batch size, must be a multiple of the number of GPUs\n",
    "cfg.TEST.PER_EPOCH_EVAL=False # Eval after every epoch or only at the end of training"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Job name, input channels and paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "job_name = f'{user_id}-{time_str}'\n",
    "output_path = os.path.join(s3_location, \"sagemaker-output\", date_str, job_name)\n",
    "code_location = os.path.join(s3_location, \"sagemaker-code\", date_str, job_name)\n",
    "\n",
    "# S3 inputs handed to estimator.fit(); the training images are not a channel, they are read from S3 directly (see TRAIN_INPUT_DIR below)\n",
    "channels = {'validation': os.path.join(S3_DATA_LOCATION, 'val2017'),\n",
    "            'weights': S3_WEIGHTS_LOCATION,\n",
    "            'annotations': os.path.join(S3_DATA_LOCATION, 'annotations')}\n",
    "\n",
    "CHANNELS_DIR='/opt/ml/input/data/' # on node\n",
    "cfg.INPUT.VAL_INPUT_DIR = os.path.join(CHANNELS_DIR, 'validation') # Corresponds to the validation key in the channels\n",
    "cfg.INPUT.TRAIN_ANNO_DIR = os.path.join(CHANNELS_DIR, 'annotations', 'instances_train2017.json')\n",
    "cfg.INPUT.VAL_ANNO_DIR = os.path.join(CHANNELS_DIR, 'annotations', 'instances_val2017.json')\n",
    "cfg.MODEL.WEIGHT=os.path.join(CHANNELS_DIR, 'weights', R50_WEIGHTS) # backbone weights file\n",
    "cfg.INPUT.TRAIN_INPUT_DIR = os.path.join(S3_DATA_LOCATION, \"train2017\") # Set to S3 location so we use the S3 plugin\n",
    "cfg.OUTPUT_DIR = '/opt/ml/checkpoints'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cfg.HOOKS=[\"DetectronCheckpointHook\",\n",
    "           \"AMP_Hook\",\n",
    "           \"IterTimerHook\",\n",
    "           \"TextLoggerHook\",\n",
    "           \"COCOEvaluation\"]\n",
    "\n",
    "# Write the config dump to dist_config_file; print() adds the same trailing newline as before, without redirecting sys.stdout\n",
    "with open(dist_config_file, 'w') as outfile:\n",
    "    print(cfg.dump(), file=outfile)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Launch the training job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Multi-node jobs on these instance types enable the smdistributed dataparallel option and run train.py;\n",
    "# every other combination runs launch_ddp.py with no distribution setting\n",
    "if nodes>1 and instance_type in ['ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'ml.p3.16xlarge']:\n",
    "    distribution = { \"smdistributed\": { \"dataparallel\": { \"enabled\": True } } }\n",
    "    entry_point = \"train.py\"\n",
    "else:\n",
    "    distribution = None\n",
    "    entry_point = \"launch_ddp.py\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hyperparameters = {\"config\": dist_config_file}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = PyTorch(\n",
    "    entry_point=entry_point,\n",
    "    source_dir=source_dir,\n",
    "    py_version='py3',\n",
    "    framework_version='1.8.1', # 1.6 - 1.8 supported\n",
    "    role=role,\n",
    "    instance_count=nodes,\n",
    "    instance_type=instance_type,\n",
    "    distribution=distribution,\n",
    "    output_path=output_path,\n",
    "    checkpoint_s3_uri=output_path,\n",
    "    model_dir=output_path,\n",
    "    hyperparameters=hyperparameters,\n",
    "    volume_size=500,\n",
    "    code_location=code_location,\n",
    "    disable_profiler=True, # Reduce number of logs since we don't need profiler or debugger for this training\n",
    "    debugger_hook_config=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# wait=False submits the job without blocking the notebook until it finishes\n",
    "estimator.fit(channels, wait=False, job_name=job_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "instance_type": "ml.g4dn.xlarge",
  "kernelspec": {
   "display_name": "Python 3 (PyTorch 1.6 Python 3.6 GPU Optimized)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/pytorch-1.6-gpu-py36-cu110-ubuntu18.04-v3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}