# DaemonSet that runs the AWS virtual GPU device plugin together with an
# NVIDIA MPS container on every node selected by the
# `k8s.amazonaws.com/accelerator: vgpu` node label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: aws-virtual-gpu-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: aws-virtual-gpu-device-plugin
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      # This annotation is deprecated. Kept here for backward compatibility.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        # Must match spec.selector.matchLabels above.
        name: aws-virtual-gpu-device-plugin
    spec:
      # NOTE(review): presumably required so that CUDA clients in other pods
      # can talk to the MPS daemon over host IPC -- confirm before removing.
      hostIPC: true
      nodeSelector:
        k8s.amazonaws.com/accelerator: vgpu
      tolerations:
        # This toleration is deprecated. Kept here for backward compatibility.
        # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
        - key: CriticalAddonsOnly
          operator: Exists
        # Allow scheduling onto nodes tainted with k8s.amazonaws.com/vgpu.
        - key: k8s.amazonaws.com/vgpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: system-node-critical
      # Before the plugin and MPS containers start, switch the GPU compute
      # mode to EXCLUSIVE_PROCESS (`nvidia-smi -c EXCLUSIVE_PROCESS`).
      initContainers:
        - name: set-compute-mode
          image: nvidia/cuda:11.2.1-runtime
          command: ['nvidia-smi', '-c', 'EXCLUSIVE_PROCESS']
          securityContext:
            capabilities:
              add: ['SYS_ADMIN']
      containers:
        # Device plugin container; it mounts the kubelet device-plugins
        # directory from the host.
        - image: amazon/aws-virtual-gpu-device-plugin:v0.1.0
          name: aws-virtual-gpu-device-plugin-ctr
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ['ALL']
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
        # MPS container; it mounts /tmp/nvidia-mps from the host.
        # NOTE(review): the image has no tag, so it resolves to `latest`;
        # consider pinning a specific version for reproducible rollouts.
        - image: nvidia/mps
          name: mps
          volumeMounts:
            - name: nvidia-mps
              mountPath: /tmp/nvidia-mps
          env:
            # Kept as a quoted string: container env values must be strings.
            # NOTE(review): per the NVIDIA MPS docs this caps the share of GPU
            # threads available to each MPS client -- confirm 10 is intended.
            - name: CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
              value: "10"
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: nvidia-mps
          hostPath:
            path: /tmp/nvidia-mps