# Values for the Mask R-CNN training job.
# Restored to block style, one key per line: YAML cannot express nested
# mappings on a single line, and an inline `#` comment runs to the end of the
# physical line, hiding every key that follows it.

# Settings under `global`.
global:
  namespace: kubeflow
  shared_fs: efs
  shared_pvc: tensorpack-efs-gp-bursting
  # Public IP source CIDR. "0.0.0.0/0" matches every IPv4 address.
  # NOTE(review): narrow this range if access is meant to be restricted.
  source_cidr: '0.0.0.0/0'

# Settings under `maskrcnn`.
maskrcnn:
  name: mask-rcnn-tensorflow

  # 16 = gpu_nodes (2) x gpus_per_node (8) with the values below.
  # NOTE(review): presumably these three must be kept in sync - confirm.
  gpus: 16
  gpu_nodes: 2
  gpus_per_node: 8

  # No value is set here; written as an explicit null rather than a bare
  # `image:` so the emptiness is clearly intentional.
  # NOTE(review): presumably supplied at install time - confirm.
  image: null
  image_pull_policy: Always

  train_script: /mask-rcnn-tensorflow/MaskRCNN/train.py
  working_dir: /mask-rcnn-tensorflow
  tp_16: 1
  batch_size_per_gpu: 4

  data_fs: efs
  data_dir: data
  images_per_epoch: 120000
  eval_period_in_epochs: 1

  # The next entries are deliberately quoted strings holding Python-style
  # literals (including 'True'); they are not YAML lists or booleans.
  lr_epoch_schedule: '[(16, 0.1), (20, 0.01), (24, None)]'
  data_train: '["train2017"]'
  data_val: '("val2017")'
  mode_fpn: 'True'
  mode_mask: 'True'

  backbone_norm: FreezeBN
  backbone_weights: data/pretrained-models/ImageNet-R50-AlignPadding.npz

  # Numeric-looking values kept as quoted strings, as in the original.
  horovod_cycle_time: '0.5'
  # 67108864 = 64 * 1024 * 1024.
  horovod_fusion_threshold: '67108864'

  # Quoted so the leading `^` and the comma are unmistakably string content.
  nccl_socket_ifname: '^lo,docker0'
  if_exclude: 'lo,docker0'
  nccl_min_rings: 8
  nccl_debug: INFO

  extra_config: 'TRAIN.GRADIENT_CLIP=0.36'
  backoff_limit: 20