#!/bin/bash source .env #torchx run -s kubernetes -c queue=default,image_repo=${REGISTRY}${IMAGE} dist.ddp --gpu 0 --memMB 4096 --script idle.py data # etcd is required when using dist_ddp.py:ddp # etcd is not required when using dist.ddp #torchx run -s kubernetes -c queue=default,image_repo=${REGISTRY}${IMAGE} dist.ddp --debug False -j 1:2x1 --gpu 1 --memMB 40960 --script /workspace/imagenet-elastic.py -- --arch resnet18 --epochs 10 --batch-size 32 --print-freq 10 --workers 0 /workspace/data/tiny-imagenet-200 torchx run -s kubernetes -c queue=default,image_repo=${REGISTRY}${IMAGE} ./dist_ddp.py:ddp --debug False -j 1:2x1 --gpu 1 --memMB 40960 --script /workspace/imagenet-elastic.py -- --arch resnet18 --epochs 10 --batch-size 32 --print-freq 10 --workers 0 /workspace/data/tiny-imagenet-200