{ "cells": [ { "cell_type": "code", "execution_count": 28, "id": "4c03f330-3c58-4da4-bc56-8ce312fa4dcd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Skipping kubeflow-pytorchjob as it is not installed.\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "# Remove the kubeflow-pytorchjob package (if present) before installing kubeflow-training.\n", "# %pip (rather than !pip) runs pip against the environment of the running kernel.\n", "%pip uninstall -y kubeflow-pytorchjob" ] }, { "cell_type": "code", "execution_count": 29, "id": "3c768c75-4a3d-47c3-8096-6e1b0c22d698", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Defaulting to user installation because normal site-packages is not writeable\n", "Requirement already satisfied: kubeflow-training in /opt/conda/lib/python3.8/site-packages (1.4.0)\n", "Requirement already satisfied: urllib3>=1.15.1 in /opt/conda/lib/python3.8/site-packages (from kubeflow-training) (1.26.9)\n", "Requirement already satisfied: setuptools>=21.0.0 in /opt/conda/lib/python3.8/site-packages (from kubeflow-training) (62.3.4)\n", "Requirement already satisfied: retrying>=1.3.3 in /opt/conda/lib/python3.8/site-packages (from kubeflow-training) (1.3.3)\n", "Requirement already satisfied: table-logger>=0.3.5 in /opt/conda/lib/python3.8/site-packages (from kubeflow-training) (0.3.6)\n", "Requirement already satisfied: certifi>=14.05.14 in /opt/conda/lib/python3.8/site-packages (from kubeflow-training) (2022.6.15)\n", "Requirement already satisfied: python-dateutil>=2.5.3 in /opt/conda/lib/python3.8/site-packages (from kubeflow-training) (2.8.2)\n", "Requirement already satisfied: kubernetes>=12.0.0 in /opt/conda/lib/python3.8/site-packages (from kubeflow-training) (18.20.0)\n", "Requirement already satisfied: six>=1.10 in /opt/conda/lib/python3.8/site-packages (from kubeflow-training) (1.16.0)\n", "Requirement already satisfied: pyyaml>=5.4.1 in /opt/conda/lib/python3.8/site-packages (from kubernetes>=12.0.0->kubeflow-training) (5.4.1)\n", "Requirement already satisfied: requests in 
/opt/conda/lib/python3.8/site-packages (from kubernetes>=12.0.0->kubeflow-training) (2.28.0)\n", "Requirement already satisfied: requests-oauthlib in /opt/conda/lib/python3.8/site-packages (from kubernetes>=12.0.0->kubeflow-training) (1.3.1)\n", "Requirement already satisfied: google-auth>=1.0.1 in /opt/conda/lib/python3.8/site-packages (from kubernetes>=12.0.0->kubeflow-training) (1.35.0)\n", "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.8/site-packages (from kubernetes>=12.0.0->kubeflow-training) (1.3.2)\n", "Requirement already satisfied: numpy in /opt/conda/lib/python3.8/site-packages (from table-logger>=0.3.5->kubeflow-training) (1.22.2)\n", "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /opt/conda/lib/python3.8/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-training) (4.2.4)\n", "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.8/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-training) (0.2.8)\n", "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.8/site-packages (from google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-training) (4.7.2)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/conda/lib/python3.8/site-packages (from requests->kubernetes>=12.0.0->kubeflow-training) (2.0.12)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests->kubernetes>=12.0.0->kubeflow-training) (3.3)\n", "Requirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.8/site-packages (from requests-oauthlib->kubernetes>=12.0.0->kubeflow-training) (3.2.0)\n", "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=12.0.0->kubeflow-training) (0.4.8)\n" ] } ], "source": [ "# Pinned to the version this notebook was last executed with (see the recorded output).\n", "%pip install kubeflow-training==1.4.0" ] }, { "cell_type": "code", 
"execution_count": 30, "id": "90707fe6-a6e9-42a8-9acc-b363803e5763", "metadata": {}, "outputs": [], "source": [ "!pip list -o | grep kubeflow-training" ] }, { "cell_type": "code", "execution_count": 48, "id": "b1f168f6-0c51-4681-b41b-9f6c7788b452", "metadata": {}, "outputs": [], "source": [ "from kubernetes.client import V1PodTemplateSpec\n", "from kubernetes.client import V1ObjectMeta\n", "from kubernetes.client import V1PodSpec\n", "from kubernetes.client import V1Container\n", "from kubernetes.client import V1ResourceRequirements\n", "\n", "from kubernetes.client import V1VolumeMount\n", "from kubernetes.client import V1Volume\n", "from kubernetes.client import V1PersistentVolumeClaimVolumeSource\n", "\n", "from kubeflow.training import constants\n", "from kubeflow.training.utils import utils\n", "from kubeflow.training import V1ReplicaSpec\n", "from kubeflow.training import V1PyTorchJob\n", "from kubeflow.training import V1PyTorchJobSpec\n", "from kubeflow.training import PyTorchJobClient\n", "from kubeflow.training import V1RunPolicy\n" ] }, { "cell_type": "code", "execution_count": 49, "id": "518aef80-7ac9-4304-9d81-363562e2c736", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'kubeflow-user-example-com'" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "namespace = utils.get_default_target_namespace()\n", "namespace" ] }, { "cell_type": "code", "execution_count": 50, "id": "b2436e13-c163-4442-aa65-65ce8e6ceeab", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING: This version information is deprecated and will be replaced with the output from kubectl version --short. 
Use --output=yaml|json to get the full version.\n", "Client Version: version.Info{Major:\"1\", Minor:\"24\", GitVersion:\"v1.24.2\", GitCommit:\"f66044f4361b9f1f96f0053dd46cb7dce5e990a8\", GitTreeState:\"clean\", BuildDate:\"2022-06-15T14:22:29Z\", GoVersion:\"go1.18.3\", Compiler:\"gc\", Platform:\"linux/amd64\"}\n", "Kustomize Version: v4.5.4\n", "Server Version: version.Info{Major:\"1\", Minor:\"21+\", GitVersion:\"v1.21.13-eks-84b4fe6\", GitCommit:\"e1318dce57b3e319a2e3fecf343677d1c4d4aa75\", GitTreeState:\"clean\", BuildDate:\"2022-06-09T18:22:07Z\", GoVersion:\"go1.16.15\", Compiler:\"gc\", Platform:\"linux/amd64\"}\n", "WARNING: version difference between client (1.24) and server (1.21) exceeds the supported minor version skew of +/-1\n" ] } ], "source": [ "!kubectl version" ] }, { "cell_type": "code", "execution_count": 51, "id": "f4225ab9-3700-4a21-a993-9c73b6d2a463", "metadata": { "tags": [] }, "outputs": [], "source": [ "efsvolumemount = V1VolumeMount(\n", " mount_path=\"/efs-shared\",\n", " name=\"efs-pv\"\n", ")\n", "\n", "persistent_vol_claim = V1PersistentVolumeClaimVolumeSource(\n", " claim_name=\"efs-pvc\"\n", ")\n", "\n", "efs_volume = V1Volume(\n", " name=\"efs-pv\",\n", " persistent_volume_claim=persistent_vol_claim\n", ")\n", "\n" ] }, { "cell_type": "code", "execution_count": 52, "id": "542515e9-d85e-4c0f-ba75-9bc2a171e553", "metadata": {}, "outputs": [], "source": [
 "# Training container used by both the Master and the Worker replica.\n",
 "# NOTE(review): the script path in `args` is relative, so it only resolves to the\n",
 "# /efs-shared mount if the container's working directory is / -- confirm, or make it absolute.\n",
 "container = V1Container(\n",
 "    name=\"pytorch\",\n",
 "    image=\"763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3\",\n",
 "    args=[\"python\",\"./efs-shared/pipeline/mnist.py\",\"--epochs\",\"3\",\"--seed\",\"7\",\"--log-interval\",\"60\"],\n",
 "    volume_mounts=[efsvolumemount],\n",
 "    resources=V1ResourceRequirements(\n",
 "        limits={'nvidia.com/gpu': '1'}\n",
 "    )\n",
 ")\n",
 "\n",
 "# Pod metadata shared by every replica; the annotation turns off Istio sidecar injection.\n",
 "metadata = V1ObjectMeta(\n",
 "    annotations={'sidecar.istio.io/inject': 'false'}\n",
 ")\n",
 "\n",
 "\n",
 "def build_replica_spec(replicas=1, restart_policy=\"OnFailure\"):\n",
 "    \"\"\"Build the replica spec used for both the Master and the Worker role.\n",
 "\n",
 "    Args:\n",
 "        replicas: Value passed to V1ReplicaSpec(replicas=...).\n",
 "        restart_policy: Value passed to V1ReplicaSpec(restart_policy=...).\n",
 "\n",
 "    Returns:\n",
 "        A new V1ReplicaSpec whose pod template carries `metadata` and runs\n",
 "        `container` with `efs_volume` attached.\n",
 "    \"\"\"\n",
 "    return V1ReplicaSpec(\n",
 "        replicas=replicas,\n",
 "        restart_policy=restart_policy,\n",
 "        template=V1PodTemplateSpec(\n",
 "            metadata=metadata,\n",
 "            spec=V1PodSpec(\n",
 "                containers=[container],\n",
 "                volumes=[efs_volume]\n",
 "            )\n",
 "        )\n",
 "    )\n",
 "\n",
 "\n",
 "# Master and Worker are configured identically: one replica each, restarted on failure.\n",
 "master = build_replica_spec()\n",
 "worker = build_replica_spec()\n"
 ] }, { "cell_type": "code", "execution_count": 53, "id": "aa05964a-3a1f-410f-afe5-bbf47f16a7bf", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'replicas': 1,\n", " 'restart_policy': 'OnFailure',\n", " 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'},\n", " 'cluster_name': None,\n", " 'creation_timestamp': None,\n", " 'deletion_grace_period_seconds': None,\n", " 'deletion_timestamp': None,\n", " 'finalizers': None,\n", " 'generate_name': None,\n", " 'generation': None,\n", " 'labels': None,\n", " 'managed_fields': None,\n", " 'name': None,\n", " 'namespace': None,\n", " 'owner_references': None,\n", " 'resource_version': None,\n", " 'self_link': None,\n", " 'uid': None},\n", " 'spec': {'active_deadline_seconds': None,\n", " 'affinity': None,\n", " 'automount_service_account_token': None,\n", " 'containers': [{'args': ['python',\n", " './efs-shared/pipeline/mnist.py',\n", " '--epochs',\n", " '3',\n", " '--seed',\n", " '7',\n", " '--log-interval',\n", " '60'],\n", " 'command': None,\n", " 'env': None,\n", " 'env_from': None,\n", " 'image': '763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3',\n", " 'image_pull_policy': None,\n", " 'lifecycle': None,\n", " 'liveness_probe': None,\n", " 'name': 'pytorch',\n", " 'ports': None,\n", " 'readiness_probe': None,\n", " 'resources': {'limits': 
{'nvidia.com/gpu': '1'},\n", " 'requests': None},\n", " 'security_context': None,\n", " 'startup_probe': None,\n", " 'stdin': None,\n", " 'stdin_once': None,\n", " 'termination_message_path': None,\n", " 'termination_message_policy': None,\n", " 'tty': None,\n", " 'volume_devices': None,\n", " 'volume_mounts': [{'mount_path': '/efs-shared',\n", " 'mount_propagation': None,\n", " 'name': 'efs-pv',\n", " 'read_only': None,\n", " 'sub_path': None,\n", " 'sub_path_expr': None}],\n", " 'working_dir': None}],\n", " 'dns_config': None,\n", " 'dns_policy': None,\n", " 'enable_service_links': None,\n", " 'ephemeral_containers': None,\n", " 'host_aliases': None,\n", " 'host_ipc': None,\n", " 'host_network': None,\n", " 'host_pid': None,\n", " 'hostname': None,\n", " 'image_pull_secrets': None,\n", " 'init_containers': None,\n", " 'node_name': None,\n", " 'node_selector': None,\n", " 'overhead': None,\n", " 'preemption_policy': None,\n", " 'priority': None,\n", " 'priority_class_name': None,\n", " 'readiness_gates': None,\n", " 'restart_policy': None,\n", " 'runtime_class_name': None,\n", " 'scheduler_name': None,\n", " 'security_context': None,\n", " 'service_account': None,\n", " 'service_account_name': None,\n", " 'share_process_namespace': None,\n", " 'subdomain': None,\n", " 'termination_grace_period_seconds': None,\n", " 'tolerations': None,\n", " 'topology_spread_constraints': None,\n", " 'volumes': [{'aws_elastic_block_store': None,\n", " 'azure_disk': None,\n", " 'azure_file': None,\n", " 'cephfs': None,\n", " 'cinder': None,\n", " 'config_map': None,\n", " 'csi': None,\n", " 'downward_api': None,\n", " 'empty_dir': None,\n", " 'fc': None,\n", " 'flex_volume': None,\n", " 'flocker': None,\n", " 'gce_persistent_disk': None,\n", " 'git_repo': None,\n", " 'glusterfs': None,\n", " 'host_path': None,\n", " 'iscsi': None,\n", " 'name': 'efs-pv',\n", " 'nfs': None,\n", " 'persistent_volume_claim': {'claim_name': 'efs-pvc',\n", " 'read_only': None},\n", " 
'photon_persistent_disk': None,\n", " 'portworx_volume': None,\n", " 'projected': None,\n", " 'quobyte': None,\n", " 'rbd': None,\n", " 'scale_io': None,\n", " 'secret': None,\n", " 'storageos': None,\n", " 'vsphere_volume': None}]}}}" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "worker" ] }, { "cell_type": "code", "execution_count": 54, "id": "98336984-fb80-4561-b59d-c048f76399ec", "metadata": {}, "outputs": [], "source": [
 "# Assemble the PyTorchJob custom resource from the Master and Worker replica specs.\n",
 "pytorchjob = V1PyTorchJob(\n",
 "    api_version=\"kubeflow.org/v1\",\n",
 "    kind=\"PyTorchJob\",\n",
 "    metadata=V1ObjectMeta(\n",
 "        name=\"pytorch-dist-mnist-gloo3\",\n",
 "        namespace=namespace,\n",
 "    ),\n",
 "    spec=V1PyTorchJobSpec(\n",
 "        # NOTE(review): clean_pod_policy \"None\" presumably leaves the pods in place after\n",
 "        # the job finishes so their logs can still be read -- confirm against the operator docs.\n",
 "        run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n",
 "        pytorch_replica_specs=dict(Master=master, Worker=worker),\n",
 "    ),\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 55, "id": "aaf9e511-a850-4f86-8e17-b804f1426ab2", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'apiVersion': 'kubeflow.org/v1',\n", " 'kind': 'PyTorchJob',\n", " 'metadata': {'creationTimestamp': '2022-07-19T22:45:10Z',\n", " 'generation': 1,\n", " 'managedFields': [{'apiVersion': 'kubeflow.org/v1',\n", " 'fieldsType': 'FieldsV1',\n", " 'fieldsV1': {'f:spec': {'.': {},\n", " 'f:pytorchReplicaSpecs': {'.': {},\n", " 'f:Master': {'.': {},\n", " 'f:replicas': {},\n", " 'f:restartPolicy': {},\n", " 'f:template': {'.': {},\n", " 'f:metadata': {'.': {},\n", " 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},\n", " 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}},\n", " 'f:Worker': {'.': {},\n", " 'f:replicas': {},\n", " 'f:restartPolicy': {},\n", " 'f:template': {'.': {},\n", " 'f:metadata': {'.': {},\n", " 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},\n", " 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}}},\n", " 'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}}}},\n", " 'manager': 'OpenAPI-Generator',\n", " 'operation': 'Update',\n", 
" 'time': '2022-07-19T22:45:10Z'}],\n", " 'name': 'pytorch-dist-mnist-gloo3',\n", " 'namespace': 'kubeflow-user-example-com',\n", " 'resourceVersion': '16621748',\n", " 'uid': 'b618f675-b089-4ccf-8d92-675330266b27'},\n", " 'spec': {'pytorchReplicaSpecs': {'Master': {'replicas': 1,\n", " 'restartPolicy': 'OnFailure',\n", " 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", " 'spec': {'containers': [{'args': ['python',\n", " './efs-shared/pipeline/mnist.py',\n", " '--epochs',\n", " '3',\n", " '--seed',\n", " '7',\n", " '--log-interval',\n", " '60'],\n", " 'image': '763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3',\n", " 'name': 'pytorch',\n", " 'resources': {'limits': {'nvidia.com/gpu': '1'}},\n", " 'volumeMounts': [{'mountPath': '/efs-shared', 'name': 'efs-pv'}]}],\n", " 'volumes': [{'name': 'efs-pv',\n", " 'persistentVolumeClaim': {'claimName': 'efs-pvc'}}]}}},\n", " 'Worker': {'replicas': 1,\n", " 'restartPolicy': 'OnFailure',\n", " 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", " 'spec': {'containers': [{'args': ['python',\n", " './efs-shared/pipeline/mnist.py',\n", " '--epochs',\n", " '3',\n", " '--seed',\n", " '7',\n", " '--log-interval',\n", " '60'],\n", " 'image': '763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3',\n", " 'name': 'pytorch',\n", " 'resources': {'limits': {'nvidia.com/gpu': '1'}},\n", " 'volumeMounts': [{'mountPath': '/efs-shared', 'name': 'efs-pv'}]}],\n", " 'volumes': [{'name': 'efs-pv',\n", " 'persistentVolumeClaim': {'claimName': 'efs-pvc'}}]}}}},\n", " 'runPolicy': {'cleanPodPolicy': 'None'}}}" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pytorchjob_client = PyTorchJobClient()\n", "pytorchjob_client.create(pytorchjob)" ] }, { "cell_type": "code", "execution_count": 56, "id": "79a1ba45-a145-4076-85a4-a3b2a06658ae", 
"metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'apiVersion': 'kubeflow.org/v1',\n", " 'kind': 'PyTorchJob',\n", " 'metadata': {'creationTimestamp': '2022-07-19T22:45:10Z',\n", " 'generation': 1,\n", " 'managedFields': [{'apiVersion': 'kubeflow.org/v1',\n", " 'fieldsType': 'FieldsV1',\n", " 'fieldsV1': {'f:spec': {'.': {},\n", " 'f:pytorchReplicaSpecs': {'.': {},\n", " 'f:Master': {'.': {},\n", " 'f:replicas': {},\n", " 'f:restartPolicy': {},\n", " 'f:template': {'.': {},\n", " 'f:metadata': {'.': {},\n", " 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},\n", " 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}},\n", " 'f:Worker': {'.': {},\n", " 'f:replicas': {},\n", " 'f:restartPolicy': {},\n", " 'f:template': {'.': {},\n", " 'f:metadata': {'.': {},\n", " 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},\n", " 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}}},\n", " 'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}}}},\n", " 'manager': 'OpenAPI-Generator',\n", " 'operation': 'Update',\n", " 'time': '2022-07-19T22:45:10Z'},\n", " {'apiVersion': 'kubeflow.org/v1',\n", " 'fieldsType': 'FieldsV1',\n", " 'fieldsV1': {'f:status': {'.': {},\n", " 'f:conditions': {},\n", " 'f:replicaStatuses': {'.': {}, 'f:Master': {}, 'f:Worker': {}}}},\n", " 'manager': 'manager',\n", " 'operation': 'Update',\n", " 'time': '2022-07-19T22:45:11Z'}],\n", " 'name': 'pytorch-dist-mnist-gloo3',\n", " 'namespace': 'kubeflow-user-example-com',\n", " 'resourceVersion': '16621790',\n", " 'uid': 'b618f675-b089-4ccf-8d92-675330266b27'},\n", " 'spec': {'pytorchReplicaSpecs': {'Master': {'replicas': 1,\n", " 'restartPolicy': 'OnFailure',\n", " 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", " 'spec': {'containers': [{'args': ['python',\n", " './efs-shared/pipeline/mnist.py',\n", " '--epochs',\n", " '3',\n", " '--seed',\n", " '7',\n", " '--log-interval',\n", " '60'],\n", " 'image': 
'763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3',\n", " 'name': 'pytorch',\n", " 'resources': {'limits': {'nvidia.com/gpu': '1'}},\n", " 'volumeMounts': [{'mountPath': '/efs-shared', 'name': 'efs-pv'}]}],\n", " 'volumes': [{'name': 'efs-pv',\n", " 'persistentVolumeClaim': {'claimName': 'efs-pvc'}}]}}},\n", " 'Worker': {'replicas': 1,\n", " 'restartPolicy': 'OnFailure',\n", " 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", " 'spec': {'containers': [{'args': ['python',\n", " './efs-shared/pipeline/mnist.py',\n", " '--epochs',\n", " '3',\n", " '--seed',\n", " '7',\n", " '--log-interval',\n", " '60'],\n", " 'image': '763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3',\n", " 'name': 'pytorch',\n", " 'resources': {'limits': {'nvidia.com/gpu': '1'}},\n", " 'volumeMounts': [{'mountPath': '/efs-shared', 'name': 'efs-pv'}]}],\n", " 'volumes': [{'name': 'efs-pv',\n", " 'persistentVolumeClaim': {'claimName': 'efs-pvc'}}]}}}},\n", " 'runPolicy': {'cleanPodPolicy': 'None'}},\n", " 'status': {'conditions': [{'lastTransitionTime': '2022-07-19T22:45:10Z',\n", " 'lastUpdateTime': '2022-07-19T22:45:10Z',\n", " 'message': 'PyTorchJob pytorch-dist-mnist-gloo3 is created.',\n", " 'reason': 'PyTorchJobCreated',\n", " 'status': 'True',\n", " 'type': 'Created'},\n", " {'lastTransitionTime': '2022-07-19T22:45:11Z',\n", " 'lastUpdateTime': '2022-07-19T22:45:11Z',\n", " 'message': 'PyTorchJob pytorch-dist-mnist-gloo3 is running.',\n", " 'reason': 'JobRunning',\n", " 'status': 'True',\n", " 'type': 'Running'}],\n", " 'replicaStatuses': {'Master': {}, 'Worker': {}}}}" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look the job up in the namespace it was created in, rather than the client's default,\n", "# so this keeps working if `namespace` is ever set to something else.\n", "pytorchjob_client.get('pytorch-dist-mnist-gloo3', namespace=namespace)" ] }, { "cell_type": "code", "execution_count": 59, "id": "d6a151c7-0108-477e-ac3f-1027179a1116", "metadata": {}, "outputs": [ { "data": { "text/plain": 
[ "'Succeeded'" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pytorchjob_client.get_job_status('pytorch-dist-mnist-gloo3', namespace=namespace)" ] }, { "cell_type": "code", "execution_count": 61, "id": "7dc32f64-9797-404f-a75e-c5d54671a56d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'apiVersion': 'kubeflow.org/v1',\n", " 'kind': 'PyTorchJob',\n", " 'metadata': {'creationTimestamp': '2022-07-19T22:45:10Z',\n", " 'generation': 1,\n", " 'managedFields': [{'apiVersion': 'kubeflow.org/v1',\n", " 'fieldsType': 'FieldsV1',\n", " 'fieldsV1': {'f:spec': {'.': {},\n", " 'f:pytorchReplicaSpecs': {'.': {},\n", " 'f:Master': {'.': {},\n", " 'f:replicas': {},\n", " 'f:restartPolicy': {},\n", " 'f:template': {'.': {},\n", " 'f:metadata': {'.': {},\n", " 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},\n", " 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}},\n", " 'f:Worker': {'.': {},\n", " 'f:replicas': {},\n", " 'f:restartPolicy': {},\n", " 'f:template': {'.': {},\n", " 'f:metadata': {'.': {},\n", " 'f:annotations': {'.': {}, 'f:sidecar.istio.io/inject': {}}},\n", " 'f:spec': {'.': {}, 'f:containers': {}, 'f:volumes': {}}}}},\n", " 'f:runPolicy': {'.': {}, 'f:cleanPodPolicy': {}}}},\n", " 'manager': 'OpenAPI-Generator',\n", " 'operation': 'Update',\n", " 'time': '2022-07-19T22:45:10Z'},\n", " {'apiVersion': 'kubeflow.org/v1',\n", " 'fieldsType': 'FieldsV1',\n", " 'fieldsV1': {'f:status': {'.': {},\n", " 'f:completionTime': {},\n", " 'f:conditions': {},\n", " 'f:replicaStatuses': {'.': {},\n", " 'f:Master': {'.': {}, 'f:succeeded': {}},\n", " 'f:Worker': {'.': {}, 'f:succeeded': {}}}}},\n", " 'manager': 'manager',\n", " 'operation': 'Update',\n", " 'time': '2022-07-19T22:50:02Z'}],\n", " 'name': 'pytorch-dist-mnist-gloo3',\n", " 'namespace': 'kubeflow-user-example-com',\n", " 'resourceVersion': '16628940',\n", " 'uid': 'b618f675-b089-4ccf-8d92-675330266b27'},\n", " 'spec': 
{'pytorchReplicaSpecs': {'Master': {'replicas': 1,\n", " 'restartPolicy': 'OnFailure',\n", " 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", " 'spec': {'containers': [{'args': ['python',\n", " './efs-shared/pipeline/mnist.py',\n", " '--epochs',\n", " '3',\n", " '--seed',\n", " '7',\n", " '--log-interval',\n", " '60'],\n", " 'image': '763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3',\n", " 'name': 'pytorch',\n", " 'resources': {'limits': {'nvidia.com/gpu': '1'}},\n", " 'volumeMounts': [{'mountPath': '/efs-shared', 'name': 'efs-pv'}]}],\n", " 'volumes': [{'name': 'efs-pv',\n", " 'persistentVolumeClaim': {'claimName': 'efs-pvc'}}]}}},\n", " 'Worker': {'replicas': 1,\n", " 'restartPolicy': 'OnFailure',\n", " 'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", " 'spec': {'containers': [{'args': ['python',\n", " './efs-shared/pipeline/mnist.py',\n", " '--epochs',\n", " '3',\n", " '--seed',\n", " '7',\n", " '--log-interval',\n", " '60'],\n", " 'image': '763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12.0-gpu-py38-cu116-ubuntu20.04-e3',\n", " 'name': 'pytorch',\n", " 'resources': {'limits': {'nvidia.com/gpu': '1'}},\n", " 'volumeMounts': [{'mountPath': '/efs-shared', 'name': 'efs-pv'}]}],\n", " 'volumes': [{'name': 'efs-pv',\n", " 'persistentVolumeClaim': {'claimName': 'efs-pvc'}}]}}}},\n", " 'runPolicy': {'cleanPodPolicy': 'None'}},\n", " 'status': {'completionTime': '2022-07-19T22:50:02Z',\n", " 'conditions': [{'lastTransitionTime': '2022-07-19T22:45:10Z',\n", " 'lastUpdateTime': '2022-07-19T22:45:10Z',\n", " 'message': 'PyTorchJob pytorch-dist-mnist-gloo3 is created.',\n", " 'reason': 'PyTorchJobCreated',\n", " 'status': 'True',\n", " 'type': 'Created'},\n", " {'lastTransitionTime': '2022-07-19T22:45:11Z',\n", " 'lastUpdateTime': '2022-07-19T22:45:11Z',\n", " 'message': 'PyTorchJob pytorch-dist-mnist-gloo3 is running.',\n", " 'reason': 
'JobRunning',\n", " 'status': 'False',\n", " 'type': 'Running'},\n", " {'lastTransitionTime': '2022-07-19T22:50:02Z',\n", " 'lastUpdateTime': '2022-07-19T22:50:02Z',\n", " 'message': 'PyTorchJob pytorch-dist-mnist-gloo3 is successfully completed.',\n", " 'reason': 'JobSucceeded',\n", " 'status': 'True',\n", " 'type': 'Succeeded'}],\n", " 'replicaStatuses': {'Master': {'succeeded': 1}, 'Worker': {'succeeded': 1}}}}" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pytorchjob_client.wait_for_job('pytorch-dist-mnist-gloo3', namespace=namespace, watch=False)" ] }, { "cell_type": "code", "execution_count": 62, "id": "276b6348-974c-4af8-96db-4b5a6a16a064", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The logs of Pod pytorch-dist-mnist-gloo3-worker-0:\n", " INFO:root:WORLD_SIZE- 2\n", "INFO:root:distribution availible: True\n", "INFO:root:args.no_cuda: False\n", "INFO:root:torch.cuda.is_available: True\n", "INFO:root:Use Cudo: True\n", "INFO:root:Using distributed PyTorch with gloo backend\n", "INFO:torch.distributed.distributed_c10d:Added key: store_based_barrier_key:1 to store for rank: 1\n", "INFO:torch.distributed.distributed_c10d:Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.\n", "INFO:root:here11\n", "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw/train-images-idx3-ubyte.gz\n", "Extracting sc-claim-dlc/MNIST/raw/train-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw\n", "\n", "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to sc-claim-dlc/MNIST/raw/train-labels-idx1-ubyte.gz\n", "100%|██████████| 9912422/9912422 [00:00<00:00, 86222495.33it/s]\n", "Extracting sc-claim-dlc/MNIST/raw/train-labels-idx1-ubyte.gz to 
sc-claim-dlc/MNIST/raw\n", "\n", "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw/t10k-images-idx3-ubyte.gz\n", "100%|██████████| 28881/28881 [00:00<00:00, 138125078.48it/s]\n", "100%|██████████| 1648877/1648877 [00:00<00:00, 180110719.22it/s]\n", "Extracting sc-claim-dlc/MNIST/raw/t10k-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw\n", "\n", "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to sc-claim-dlc/MNIST/raw/t10k-labels-idx1-ubyte.gz\n", "100%|██████████| 4542/4542 [00:00<00:00, 49481892.90it/s]\n", "INFO:root:here1\n", "INFO:root:here2\n", "INFO:root:here3\n", "INFO:root:here4\n", "INFO:root:dist.is_available(): True \n", "INFO:root:dist.is_initialized(): True backend\n", "INFO:root:is_distributed(): True\n", "INFO:root:use_cuda:True\n", "INFO:root:dist.is_available(): True \n", "INFO:root:dist.is_initialized(): True backend\n", "INFO:root:train_loader - 469\n", "Extracting sc-claim-dlc/MNIST/raw/t10k-labels-idx1-ubyte.gz to sc-claim-dlc/MNIST/raw\n", "\n", "/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py:1331: UserWarning: dropout2d: Received a 2-D input to dropout2d, which is deprecated and will result in an error in a future release. To retain the behavior and silence this warning, please use dropout instead. Note that dropout2d exists to provide channel-wise dropout on inputs with 2 spatial dimensions, a channel dimension, and an optional batch dimension (i.e. 
3D or 4D inputs).\n", " warnings.warn(warn_msg)\n", "INFO:root:Epoch: 1 ( 0.0%) - Loss: 2.2982213497161865\n", "INFO:torch.nn.parallel.distributed:Reducer buckets have been rebuilt in this iteration.\n", "INFO:root:Epoch: 1 ( 12.8%) - Loss: 0.2094886749982834\n", "INFO:root:Epoch: 1 ( 25.6%) - Loss: 0.16655616462230682\n", "INFO:root:Epoch: 1 ( 38.4%) - Loss: 0.11170735955238342\n", "INFO:root:Epoch: 1 ( 51.2%) - Loss: 0.0438513457775116\n", "INFO:root:Epoch: 1 ( 64.0%) - Loss: 0.207422137260437\n", "INFO:root:Epoch: 1 ( 76.8%) - Loss: 0.1903132200241089\n", "INFO:root:Epoch: 1 ( 89.6%) - Loss: 0.10344178974628448\n", "INFO:root:data_counter - 30000\n", "INFO:root:Test accuracy: 9832/10000 ( 98.3%)\n", "INFO:root:loss=0.0481\n", "INFO:root:accuracy=0.9832\n", "INFO:root:train_loader - 469\n", "INFO:root:Epoch: 2 ( 0.0%) - Loss: 0.1635887771844864\n", "INFO:root:Epoch: 2 ( 12.8%) - Loss: 0.054971203207969666\n", "INFO:root:Epoch: 2 ( 25.6%) - Loss: 0.05949727073311806\n", "INFO:root:Epoch: 2 ( 38.4%) - Loss: 0.10091497749090195\n", "INFO:root:Epoch: 2 ( 51.2%) - Loss: 0.03360436484217644\n", "INFO:root:Epoch: 2 ( 64.0%) - Loss: 0.11781998723745346\n", "INFO:root:Epoch: 2 ( 76.8%) - Loss: 0.04402787610888481\n", "INFO:root:Epoch: 2 ( 89.6%) - Loss: 0.04582521691918373\n", "INFO:root:data_counter - 30000\n", "INFO:root:Test accuracy: 9851/10000 ( 98.5%)\n", "INFO:root:loss=0.0425\n", "INFO:root:accuracy=0.9851\n", "INFO:root:train_loader - 469\n", "INFO:root:Epoch: 3 ( 0.0%) - Loss: 0.0673985555768013\n", "INFO:root:Epoch: 3 ( 12.8%) - Loss: 0.041666429489851\n", "INFO:root:Epoch: 3 ( 25.6%) - Loss: 0.014612806029617786\n", "INFO:root:Epoch: 3 ( 38.4%) - Loss: 0.10124501585960388\n", "INFO:root:Epoch: 3 ( 51.2%) - Loss: 0.009830977767705917\n", "INFO:root:Epoch: 3 ( 64.0%) - Loss: 0.07772887498140335\n", "INFO:root:Epoch: 3 ( 76.8%) - Loss: 0.07046407461166382\n", "INFO:root:Epoch: 3 ( 89.6%) - Loss: 0.016354750841856003\n", "INFO:root:data_counter - 30000\n", 
"INFO:root:Test accuracy: 9896/10000 ( 99.0%)\n", "INFO:root:loss=0.0312\n", "INFO:root:accuracy=0.9896\n", "\n", "The logs of Pod pytorch-dist-mnist-gloo3-master-0:\n", " INFO:root:WORLD_SIZE- 2\n", "INFO:root:distribution availible: True\n", "INFO:root:args.no_cuda: False\n", "INFO:root:torch.cuda.is_available: True\n", "INFO:root:Use Cudo: True\n", "INFO:root:Using distributed PyTorch with gloo backend\n", "INFO:torch.distributed.distributed_c10d:Added key: store_based_barrier_key:1 to store for rank: 0\n", "INFO:torch.distributed.distributed_c10d:Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.\n", "INFO:root:here11\n", "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw/train-images-idx3-ubyte.gz\n", "100%|██████████| 9912422/9912422 [00:00<00:00, 95481305.38it/s]\n", "Extracting sc-claim-dlc/MNIST/raw/train-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw\n", "\n", "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to sc-claim-dlc/MNIST/raw/train-labels-idx1-ubyte.gz\n", "100%|██████████| 28881/28881 [00:00<00:00, 147546521.10it/s]\n", "Extracting sc-claim-dlc/MNIST/raw/train-labels-idx1-ubyte.gz to sc-claim-dlc/MNIST/raw\n", "\n", "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw/t10k-images-idx3-ubyte.gz\n", "100%|██████████| 1648877/1648877 [00:00<00:00, 111266674.12it/s]\n", "Extracting sc-claim-dlc/MNIST/raw/t10k-images-idx3-ubyte.gz to sc-claim-dlc/MNIST/raw\n", "\n", "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to sc-claim-dlc/MNIST/raw/t10k-labels-idx1-ubyte.gz\n", 
"100%|██████████| 4542/4542 [00:00<00:00, 51487915.59it/s]\n", "INFO:root:here1\n", "INFO:root:here2\n", "INFO:root:here3\n", "INFO:root:here4\n", "INFO:root:dist.is_available(): True \n", "INFO:root:dist.is_initialized(): True backend\n", "INFO:root:is_distributed(): True\n", "INFO:root:use_cuda:True\n", "INFO:root:dist.is_available(): True \n", "INFO:root:dist.is_initialized(): True backend\n", "INFO:root:train_loader - 469\n", "Extracting sc-claim-dlc/MNIST/raw/t10k-labels-idx1-ubyte.gz to sc-claim-dlc/MNIST/raw\n", "\n", "/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py:1331: UserWarning: dropout2d: Received a 2-D input to dropout2d, which is deprecated and will result in an error in a future release. To retain the behavior and silence this warning, please use dropout instead. Note that dropout2d exists to provide channel-wise dropout on inputs with 2 spatial dimensions, a channel dimension, and an optional batch dimension (i.e. 3D or 4D inputs).\n", " warnings.warn(warn_msg)\n", "INFO:root:Epoch: 1 ( 0.0%) - Loss: 2.3083343505859375\n", "INFO:torch.nn.parallel.distributed:Reducer buckets have been rebuilt in this iteration.\n", "INFO:root:Epoch: 1 ( 12.8%) - Loss: 0.26069486141204834\n", "INFO:root:Epoch: 1 ( 25.6%) - Loss: 0.13111376762390137\n", "INFO:root:Epoch: 1 ( 38.4%) - Loss: 0.2366747409105301\n", "INFO:root:Epoch: 1 ( 51.2%) - Loss: 0.03386534005403519\n", "INFO:root:Epoch: 1 ( 64.0%) - Loss: 0.08485321700572968\n", "INFO:root:Epoch: 1 ( 76.8%) - Loss: 0.11484973132610321\n", "INFO:root:Epoch: 1 ( 89.6%) - Loss: 0.1472587138414383\n", "INFO:root:data_counter - 30000\n", "INFO:root:Test accuracy: 9832/10000 ( 98.3%)\n", "INFO:root:loss=0.0481\n", "INFO:root:accuracy=0.9832\n", "INFO:root:train_loader - 469\n", "INFO:root:Epoch: 2 ( 0.0%) - Loss: 0.0748339518904686\n", "INFO:root:Epoch: 2 ( 12.8%) - Loss: 0.1949663758277893\n", "INFO:root:Epoch: 2 ( 25.6%) - Loss: 0.055954597890377045\n", "INFO:root:Epoch: 2 ( 38.4%) - Loss: 
0.08764931559562683\n", "INFO:root:Epoch: 2 ( 51.2%) - Loss: 0.01219240203499794\n", "INFO:root:Epoch: 2 ( 64.0%) - Loss: 0.05545560643076897\n", "INFO:root:Epoch: 2 ( 76.8%) - Loss: 0.08434510231018066\n", "INFO:root:Epoch: 2 ( 89.6%) - Loss: 0.15671400725841522\n", "INFO:root:data_counter - 30000\n", "INFO:root:Test accuracy: 9851/10000 ( 98.5%)\n", "INFO:root:loss=0.0425\n", "INFO:root:accuracy=0.9851\n", "INFO:root:train_loader - 469\n", "INFO:root:Epoch: 3 ( 0.0%) - Loss: 0.05795310437679291\n", "INFO:root:Epoch: 3 ( 12.8%) - Loss: 0.09435995668172836\n", "INFO:root:Epoch: 3 ( 25.6%) - Loss: 0.06416410952806473\n", "INFO:root:Epoch: 3 ( 38.4%) - Loss: 0.041010383516550064\n", "INFO:root:Epoch: 3 ( 51.2%) - Loss: 0.01622803322970867\n", "INFO:root:Epoch: 3 ( 64.0%) - Loss: 0.10800038278102875\n", "INFO:root:Epoch: 3 ( 76.8%) - Loss: 0.018832189962267876\n", "INFO:root:Epoch: 3 ( 89.6%) - Loss: 0.060872942209243774\n", "INFO:root:data_counter - 30000\n", "INFO:root:Test accuracy: 9896/10000 ( 99.0%)\n", "INFO:root:loss=0.0312\n", "INFO:root:accuracy=0.9896\n", "\n" ] } ], "source": [ "pytorchjob_client.get_logs('pytorch-dist-mnist-gloo3', namespace=namespace, master=False)" ] }, { "cell_type": "code", "execution_count": 63, "id": "624bee1e-d983-4669-ab3f-a95b4124fb11", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'kind': 'Status',\n", " 'apiVersion': 'v1',\n", " 'metadata': {},\n", " 'status': 'Success',\n", " 'details': {'name': 'pytorch-dist-mnist-gloo3',\n", " 'group': 'kubeflow.org',\n", " 'kind': 'pytorchjobs',\n", " 'uid': 'b618f675-b089-4ccf-8d92-675330266b27'}}" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Delete from the namespace the job was created in; without it the client falls back to\n", "# its default namespace, which is the wrong target if `namespace` was overridden.\n", "pytorchjob_client.delete('pytorch-dist-mnist-gloo3', namespace=namespace)" ] }, { "cell_type": "code", "execution_count": null, "id": "160f35da-2530-4a90-bbde-00c317baa40c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": 
"python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 }