In [28]:
!pip uninstall kubeflow-pytorchjob --y

[0m

In [29]:
!pip install kubeflow-training

Defaulting to user installation because normal site-packages is not writeable


In [30]:
# List outdated packages (-o) and keep only the kubeflow-training row, if any.
!pip list -o | grep kubeflow-training

In [48]:
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubernetes.client import V1VolumeMount
from kubernetes.client import V1Volume
from kubernetes.client import V1PersistentVolumeClaimVolumeSource

from kubeflow.training import constants
from kubeflow.training.utils import utils
from kubeflow.training import V1ReplicaSpec
from kubeflow.training import V1PyTorchJob
from kubeflow.training import V1PyTorchJobSpec
from kubeflow.training import PyTorchJobClient
from kubeflow.training import V1RunPolicy


In [49]:
# Resolve the target namespace through the SDK helper; it is reused when the job is defined and queried.
namespace = utils.get_default_target_namespace()
# Bare expression so the notebook displays the resolved value.
namespace

'kubeflow-user-example-com'

In [50]:
# Print the kubectl client version and the API-server version of the target cluster.
# NOTE(review): the recorded output shows client v1.24 against server v1.21 — more than
# one minor version apart; confirm this skew is acceptable for the commands used here.
!kubectl version

Client Version: version.Info{Major:"1", Minor:"24", GitVersion:"v1.24.2", GitCommit:"f66044f4361b9f1f96f0053dd46cb7dce5e990a8", GitTreeState:"clean", BuildDate:"2022-06-15T14:22:29Z", GoVersion:"go1.18.3", Compiler:"gc", Platform:"linux/amd64"}
Kustomize Version: v4.5.4
Server Version: version.Info{Major:"1", Minor:"21+", GitVersion:"v1.21.13-eks-84b4fe6", GitCommit:"e1318dce57b3e319a2e3fecf343677d1c4d4aa75", GitTreeState:"clean", BuildDate:"2022-06-09T18:22:07Z", GoVersion:"go1.16.15", Compiler:"gc", Platform:"linux/amd64"}


In [51]:
# Shared identifiers for the EFS-backed storage. The mount and the pod volume
# both carry the volume name, so it is defined once to keep the two in sync.
EFS_VOLUME_NAME = "efs-pv"
EFS_CLAIM_NAME = "efs-pvc"
EFS_MOUNT_PATH = "/efs-shared"

# Container-side view: where the volume is mounted inside the container.
efsvolumemount = V1VolumeMount(
    mount_path=EFS_MOUNT_PATH,
    name=EFS_VOLUME_NAME,
)

# Reference to the PersistentVolumeClaim that backs the volume.
# NOTE(review): nothing in the visible cells creates this claim — presumably it
# must already exist in the job's namespace; confirm.
persistent_vol_claim = V1PersistentVolumeClaimVolumeSource(
    claim_name=EFS_CLAIM_NAME,
)

# Pod-side view: the volume definition that the mount above refers to by name.
efs_volume = V1Volume(
    name=EFS_VOLUME_NAME,
    persistent_volume_claim=persistent_vol_claim,
)



In [52]:
# Training container shared by the master and worker replicas.
# NOTE(review): the script path is relative ("./efs-shared/...") while the volume
# mount in this notebook uses "/efs-shared"; the two only line up when the
# container's working directory is "/". Consider an absolute path — confirm
# against the image before changing.
container = V1Container(
    name="pytorch",
    image="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3",
    args=[
        "python", "./efs-shared/pipeline/mnist.py",
        "--epochs", "3",
        "--seed", "7",
        "--log-interval", "60",
    ],
    volume_mounts=[efsvolumemount],
    # One GPU per container; with one master and one worker the job asks for two in total.
    resources=V1ResourceRequirements(limits={'nvidia.com/gpu': '1'}),
)

# Pod metadata applied to every replica: sets the Istio sidecar-injection annotation to "false".
metadata = V1ObjectMeta(
    annotations={'sidecar.istio.io/inject': 'false'}
)


def build_replica_spec(replicas=1):
    """Return a replica spec that runs `container` with the EFS volume attached.

    Args:
        replicas: Number of pods for this replica type (defaults to 1).

    Returns:
        A V1ReplicaSpec with restart policy "OnFailure" whose pod template
        uses the shared `metadata`, `container` and `efs_volume` objects.
    """
    return V1ReplicaSpec(
        replicas=replicas,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(
            metadata=metadata,
            spec=V1PodSpec(
                containers=[container],
                volumes=[efs_volume],
            ),
        ),
    )


# Master and worker are configured identically; only the key they are
# registered under in the job's replica-spec mapping differs.
master = build_replica_spec()
worker = build_replica_spec()


In [53]:
# Display the worker replica spec so it can be checked before the job is submitted.
worker

{'replicas': 1,
 'restart_policy': 'OnFailure',
 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'},
 'cluster_name': None,
 'creation_timestamp': None,
 'deletion_grace_period_seconds': None,
 'deletion_timestamp': None,
 'finalizers': None,
 'generate_name': None,
 'generation': None,
 'labels': None,
 'managed_fields': None,
 'name': None,
 'namespace': None,
 'owner_references': None,
 'resource_version': None,
 'self_link': None,
 'uid': None},
 'spec': {'active_deadline_seconds': None,
 'affinity': None,
 'automount_service_account_token': None,
 'containers': [{'args': ['python',
 './efs-shared/pipeline/mnist.py',
 '--epochs',
 '3',
 '--seed',
 '7',
 '--log-interval',
 '60'],
 'command': None,
 'env': None,
 'env_from': None,
 'image': '763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3',
 'image_pull_policy': None,
 'lifecycle': None,
 'liveness_probe': None,
 'name': 'pytorch',
 'ports': None,
 'readine

In [54]:
# Assemble the PyTorchJob custom resource from the master and worker replica specs.
pytorchjob = V1PyTorchJob(
    api_version="kubeflow.org/v1",
    kind="PyTorchJob",
    metadata=V1ObjectMeta(
        name="pytorch-dist-mnist-gloo3",
        namespace=namespace,
    ),
    spec=V1PyTorchJobSpec(
        # The policy value is the string "None", not Python's None.
        # NOTE(review): presumably this keeps the pods after completion so their
        # logs remain readable — confirm against the operator documentation.
        run_policy=V1RunPolicy(clean_pod_policy="None"),
        pytorch_replica_specs={
            "Master": master,
            "Worker": worker,
        },
    ),
)

In [55]:
# Create the SDK client with its default configuration and submit the job.
# The namespace is passed explicitly, matching how the later status/log cells address the job.
# NOTE(review): re-running this cell while a job with the same name still exists
# will presumably be rejected by the API server — delete the job first.
pytorchjob_client = PyTorchJobClient()
pytorchjob_client.create(pytorchjob, namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'PyTorchJob',
 'metadata': {'creationTimestamp': '2022-07-19T22:45:10Z',
 'generation': 1,
 'managedFields': [{'apiVersion': 'kubeflow.org/v1',
 'fieldsType': 'FieldsV1',
 'fieldsV1': {'f:spec': {'.': {},
 'f:pytorchReplicaSpecs': {'.': {},
 'f:Master': {'.': {},
 'f:replicas': {},
 'f:restartPolicy': {},
 'f:template': {'.': {},
 'f:metadata': {'.': {},
 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},
 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}},
 'f:Worker': {'.': {},
 'f:replicas': {},
 'f:restartPolicy': {},
 'f:template': {'.': {},
 'f:metadata': {'.': {},
 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},
 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}}},
 'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}}}},
 'manager': 'OpenAPI-Generator',
 'operation': 'Update',
 'time': '2022-07-19T22:45:10Z'}],
 'name': 'pytorch-dist-mnist-gloo3',
 'namespace': 'kubeflow-user-example-com',
 'resource

In [56]:
# Fetch the job object back from the cluster. The name is taken from the job
# definition rather than repeated as a literal, and the namespace is passed
# explicitly like in the other client calls.
pytorchjob_client.get(pytorchjob.metadata.name, namespace=namespace)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'PyTorchJob',
 'metadata': {'creationTimestamp': '2022-07-19T22:45:10Z',
 'generation': 1,
 'managedFields': [{'apiVersion': 'kubeflow.org/v1',
 'fieldsType': 'FieldsV1',
 'fieldsV1': {'f:spec': {'.': {},
 'f:pytorchReplicaSpecs': {'.': {},
 'f:Master': {'.': {},
 'f:replicas': {},
 'f:restartPolicy': {},
 'f:template': {'.': {},
 'f:metadata': {'.': {},
 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},
 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}},
 'f:Worker': {'.': {},
 'f:replicas': {},
 'f:restartPolicy': {},
 'f:template': {'.': {},
 'f:metadata': {'.': {},
 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},
 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}}},
 'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}}}},
 'manager': 'OpenAPI-Generator',
 'operation': 'Update',
 'time': '2022-07-19T22:45:10Z'},
 {'apiVersion': 'kubeflow.org/v1',
 'fieldsType': 'FieldsV1',
 'fieldsV1': {'f:status': {'.

In [59]:
# Query the job's current status string (the recorded run shows 'Succeeded').
# The name comes from the job definition so it cannot drift from the submitted job.
pytorchjob_client.get_job_status(pytorchjob.metadata.name, namespace=namespace)

'Succeeded'

In [61]:
# Wait on the job through the SDK helper and display the job object it returns.
# watch=False is passed explicitly; the name comes from the job definition.
pytorchjob_client.wait_for_job(pytorchjob.metadata.name, namespace=namespace, watch=False)

{'apiVersion': 'kubeflow.org/v1',
 'kind': 'PyTorchJob',
 'metadata': {'creationTimestamp': '2022-07-19T22:45:10Z',
 'generation': 1,
 'managedFields': [{'apiVersion': 'kubeflow.org/v1',
 'fieldsType': 'FieldsV1',
 'fieldsV1': {'f:spec': {'.': {},
 'f:pytorchReplicaSpecs': {'.': {},
 'f:Master': {'.': {},
 'f:replicas': {},
 'f:restartPolicy': {},
 'f:template': {'.': {},
 'f:metadata': {'.': {},
 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},
 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}},
 'f:Worker': {'.': {},
 'f:replicas': {},
 'f:restartPolicy': {},
 'f:template': {'.': {},
 'f:metadata': {'.': {},
 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},
 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}}},
 'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}}}},
 'manager': 'OpenAPI-Generator',
 'operation': 'Update',
 'time': '2022-07-19T22:45:10Z'},
 {'apiVersion': 'kubeflow.org/v1',
 'fieldsType': 'FieldsV1',
 'fieldsV1': {'f:status': {'.

In [62]:
# Print pod logs for the job. master=False is passed explicitly.
# NOTE(review): presumably master=False widens retrieval beyond the master pod;
# the recorded output shows the worker-0 pod's log — confirm.
pytorchjob_client.get_logs(pytorchjob.metadata.name, namespace=namespace, master=False)

The logs of Pod pytorch-dist-mnist-gloo3-worker-0:
 INFO:root:WORLD_SIZE- 2
INFO:root:distribution availible: True
INFO:root:args.no_cuda: False
INFO:root:torch.cuda.is_available: True
INFO:root:Use Cudo: True
INFO:root:Using distributed PyTorch with gloo backend
INFO:torch.distributed.distributed_c10d:Added key: store_based_barrier_key:1 to store for rank: 1
INFO:torch.distributed.distributed_c10d:Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
INFO:root:here11
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw/train-images-idx3-ubyte.gz
Extracting sc-claim-dlc/MNIST/raw/train-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to sc-claim-dlc/MNIST/raw/train-labels-idx1-ubyte.gz
100%|████████

In [63]:
# Clean up: delete the job. The name is taken from the job definition and the
# namespace is passed explicitly, consistent with the status and log calls.
pytorchjob_client.delete(pytorchjob.metadata.name, namespace=namespace)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'pytorch-dist-mnist-gloo3',
 'group': 'kubeflow.org',
 'kind': 'pytorchjobs',
 'uid': 'b618f675-b089-4ccf-8d92-675330266b27'}}