---
# NOTE(review): this document was reconstructed from a single collapsed line.
# The nesting below (`maskrcnn` as a top-level sibling of `global`) is the
# natural reading of the keys -- confirm against the consuming templates.

# Settings grouped under `global`.
global:
  namespace: kubeflow
  shared_fs: fsx
  shared_pvc: tensorpack-fsx  # tensorpack-efs-gp-bursting
  source_cidr: "0.0.0.0/0"  # Public IP source CIDR

# Settings for the `maskrcnn` training job.
maskrcnn:
  name: maskrcnn

  # GPU topology: gpus (16) equals gpu_nodes (2) x gpus_per_node (8).
  # Keep the three values consistent when changing any of them.
  gpus: 16
  gpu_nodes: 2
  gpus_per_node: 8

  # Left empty (null) in this file.
  # NOTE(review): presumably must be set to the training container image
  # before deploying -- confirm.
  image:
  image_pull_policy: Always
  train_script: /tensorpack/examples/FasterRCNN/train.py

  # Data location.
  data_fs: fsx  # efs
  data_dir: mask-rcnn/eks/input/data

  # Training schedule.
  steps_per_epoch: 7500  # Must be equal to 120000/gpus (120000 / 16 = 7500)
  # Quoted on purpose: these are passed along as literal strings, not parsed
  # as YAML sequences.
  lr_schedule: "[240000,320000,360000]"
  eval_period_in_epochs: 1
  data_train: '["coco_train2017"]'
  data_val: '("coco_val2017")'

  # Model options. 'True' is a quoted string here, not a YAML boolean.
  mode_fpn: 'True'
  mode_mask: 'True'
  backbone_norm: FreezeBN
  backbone_weights: mask-rcnn/eks/input/data/pretrained-models/ImageNet-R50-AlignPadding.npz

  # Horovod / NCCL tuning. Numeric-looking values are quoted so they stay
  # strings after parsing.
  horovod_cycle_time: "0.5"
  horovod_fusion_threshold: "67108864"
  nccl_socket_ifname: ^lo,docker0
  if_exclude: lo,docker0
  tf_device_min_sys_mem_mb: 4096
  nccl_debug: INFO
  nccl_buffsize: "4194304"

  extra_config: 'TRAIN.CHECKPOINT_PERIOD=2'
  backoff_limit: 2000