---
# Kubeflow PyTorchJob launcher component.
# Runs /ml/launch_pytorchjob.py inside the container image below and passes
# each declared input to it as a command-line flag.
name: 'PyTorch Operator - Training Job'
description: Kubeflow PyTorchJob launcher

inputs:
  # Inputs without `optional: true` are always passed to the launcher.
  - name: name
    type: String
    description: 'PyTorchJob name.'
  - name: namespace
    type: String
    default: kubeflow
    description: 'PyTorchJob namespace (likely your current namespace).'
  - name: version
    type: String
    default: v1
    description: 'PyTorchJob version.'
  - name: master_spec
    type: JsonObject
    # Quoted so the default is the string '{}' rather than an empty mapping.
    default: '{}'
    description: 'PyTorchJob Master replicaSpecs.'
  - name: worker_spec
    type: JsonObject
    default: '{}'
    description: 'PyTorchJob Worker replicaSpecs.'
  - name: job_timeout_minutes
    type: Integer
    default: 1440
    description: 'Time in minutes to wait for the job to complete.'
  - name: delete_after_done
    type: Boolean
    # Quoted so the default stays the string 'True', not a YAML boolean.
    default: 'True'
    description: 'Whether to delete the job after it is finished.'
  - name: clean_pod_policy
    type: String
    default: Running
    description: 'Defines the policy for cleaning up pods after the PyTorchJob completes.'

  # Optional inputs: each one is forwarded only when a value is supplied
  # (see the `if` / `isPresent` entries under `args`).
  - name: active_deadline_seconds
    type: Integer
    optional: true
    description: >-
      Specifies the duration (in seconds) since startTime during which the
      job can remain active before it is terminated. Must be a positive
      integer. This setting applies only to pods where restartPolicy is
      OnFailure or Always.
  - name: backoff_limit
    type: Integer
    optional: true
    description: 'Number of retries before marking this job as failed.'
  - name: ttl_seconds_after_finished
    type: Integer
    optional: true
    description: 'Defines the TTL for cleaning up finished PyTorchJobs.'

implementation:
  container:
    image: cascribner/kubeflow-pytorchjob-launcher:v1
    command: [python, /ml/launch_pytorchjob.py]
    args:
      # Flag/value pairs that are always emitted.
      - --name
      - inputValue: name
      - --namespace
      - inputValue: namespace
      - --version
      - inputValue: version
      - --masterSpec
      - inputValue: master_spec
      - --workerSpec
      - inputValue: worker_spec
      - --jobTimeoutMinutes
      - inputValue: job_timeout_minutes
      - --deleteAfterDone
      - inputValue: delete_after_done
      - --cleanPodPolicy
      - inputValue: clean_pod_policy
      # Flag/value pairs emitted only when the matching input is present.
      - if:
          cond:
            isPresent: active_deadline_seconds
          then:
            - --activeDeadlineSeconds
            - inputValue: active_deadline_seconds
      - if:
          cond:
            isPresent: backoff_limit
          then:
            - --backoffLimit
            - inputValue: backoff_limit
      - if:
          cond:
            isPresent: ttl_seconds_after_finished
          then:
            - --ttlSecondsAfterFinished
            - inputValue: ttl_seconds_after_finished