# Configuration values for the Mask R-CNN distributed training job.
# Two top-level sections: `global` (settings shared across the deployment)
# and `maskrcnn` (settings for the training job itself).

global:
  namespace: kubeflow
  shared_fs: efs
  shared_pvc: tensorpack-efs-gp-bursting
  # Public IP source CIDR.
  # NOTE(review): 0.0.0.0/0 matches every IPv4 address; confirm how this
  # value is consumed and narrow it if it gates network access.
  source_cidr: "0.0.0.0/0"

maskrcnn:
  name: maskrcnn
  # In this configuration gpus == gpu_nodes * gpus_per_node (2 * 8 = 16).
  # NOTE(review): presumably these three must be kept in sync - confirm.
  gpus: 16
  gpu_nodes: 2
  gpus_per_node: 8
  # Left unset (null) here.
  # NOTE(review): presumably the training container image must be supplied
  # before deployment - confirm against the consumer of this file.
  image:
  train_script: /tensorpack/examples/FasterRCNN/train.py
  data_fs: efs
  data_dir: data
  # Must be equal to 120000/gpus (120000 / 16 = 7500).
  steps_per_epoch: 7500
  # The list/tuple-looking values below are deliberately quoted so they are
  # loaded as plain strings rather than parsed as YAML flow collections.
  lr_schedule: "[240000,320000,360000]"
  eval_period_in_epochs: 1
  data_train: "[\"coco_train2017\"]"
  data_val: "(\"coco_val2017\")"
  # Quoted so these stay strings rather than YAML booleans.
  mode_fpn: 'True'
  mode_mask: 'True'
  backbone_norm: FreezeBN
  backbone_weights: data/pretrained-models/ImageNet-R50-AlignPadding.npz
  image_pull_policy: Always
  # Quoted so the numeric-looking values are kept as strings.
  horovod_cycle_time: "0.5"
  horovod_fusion_threshold: "67108864"
  # NOTE(review): nccl_socket_ifname and if_exclude both name lo and docker0;
  # presumably both exclude those interfaces - verify against the consumer.
  nccl_socket_ifname: ^lo,docker0
  if_exclude: lo,docker0
  nccl_min_rings: 8
  nccl_debug: INFO
  extra_config: 'TRAIN.CHECKPOINT_PERIOD=2'
  backoff_limit: 20