# Settings shared across components.
global:
  namespace: kubeflow
  shared_fs: fsx  # alternative: efs
  shared_pvc: tensorpack-fsx  # alternative: tensorpack-efs-gp-bursting
  # NOTE(review): 0.0.0.0/0 matches every IPv4 address. If this value feeds
  # an ingress/allow-list rule, narrow it to your own public IP range.
  source_cidr: "0.0.0.0/0"  # Public IP source CIDR

# Settings for the Mask R-CNN training job.
maskrcnn:
  name: mask-rcnn-tensorflow

  # gpus equals gpu_nodes * gpus_per_node (2 * 8 = 16). Presumably the three
  # values must be kept in sync when scaling -- TODO confirm with consumer.
  gpus: 16
  gpu_nodes: 2
  gpus_per_node: 8

  # No image is configured here; written as an explicit null instead of a
  # bare key so the empty value is visibly intentional.
  image: null
  image_pull_policy: Always

  train_script: /mask-rcnn-tensorflow/MaskRCNN/train.py
  working_dir: /mask-rcnn-tensorflow
  batch_size_per_gpu: 4

  data_fs: fsx  # alternative: efs
  data_dir: mask-rcnn/eks/input/data  # alternative: data
  images_per_epoch: 120000

  # Quoted so YAML keeps these as strings instead of parsing the leading
  # "[" as a flow sequence. Presumably (epoch, value) pairs -- TODO confirm.
  lr_epoch_schedule: "[(16, 0.1), (20, 0.01), (24, None)]"
  eval_period_in_epochs: 1
  data_train: "[\"train2017\"]"
  # NOTE(review): unlike data_train this is a parenthesised string, not a
  # bracketed list; if the consumer evaluates it as Python, ("val2017") is a
  # plain string rather than a tuple -- confirm that is intended.
  data_val: "(\"val2017\")"

  # Quoted on purpose: these stay the strings 'True', not YAML booleans.
  mode_fpn: 'True'
  mode_mask: 'True'
  backbone_norm: FreezeBN
  backbone_weights: mask-rcnn/eks/input/data/pretrained-models/ImageNet-R50-AlignPadding.npz
  extra_config: 'TRAIN.GRADIENT_CLIP=0.36'

  # Presumably exported as NCCL_* environment variables; in NCCL a leading
  # "^" on the interface list means "exclude these" -- TODO confirm mapping.
  nccl_socket_ifname: ^lo,docker0
  if_exclude: lo,docker0
  nccl_debug: INFO
  # 4194304 = 4 * 1024 * 1024; kept as a quoted string.
  nccl_buffsize: "4194304"
  tf_device_min_sys_mem_mb: 4096

  backoff_limit: 2000