// Package gpu implements the GPU plugin.
package gpu

import (
	"context"
	"errors"
	"fmt"
	"io"
	"reflect"
	"strings"
	"time"

	"github.com/aws/aws-k8s-tester/ec2config"
	"github.com/aws/aws-k8s-tester/eksconfig"
	"github.com/aws/aws-k8s-tester/pkg/fileutil"
	k8s_client "github.com/aws/aws-k8s-tester/pkg/k8s-client"
	"github.com/aws/aws-sdk-go/service/eks"
	"go.uber.org/zap"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/exec"
)

// Config defines GPU configuration.
type Config struct {
	Logger    *zap.Logger
	LogWriter io.Writer
	Stopc     chan struct{}
	EKSConfig *eksconfig.Config
	K8SClient k8s_client.EKS
}

// Tester defines GPU tester.
type Tester interface {
	// Name returns the name of the tester.
	Name() string
	// InstallNvidiaDriver installs the Nvidia device plugin for Kubernetes.
	// After GPU worker nodes join the cluster, one must apply the Nvidia
	// device plugin for Kubernetes as a DaemonSet.
	// ref. https://docs.aws.amazon.com/eks/latest/userguide/create-managed-node-group.html
	// ref. https://docs.aws.amazon.com/eks/latest/userguide/gpu-ami.html
	// ref. https://github.com/NVIDIA/k8s-device-plugin
	InstallNvidiaDriver() error
	// CreateNvidiaSMI launches a pod that runs a CUDA container
	// executing "nvidia-smi" on a GPU worker node.
	// ref. https://docs.aws.amazon.com/eks/latest/userguide/gpu-ami.html
	CreateNvidiaSMI() error
}

var pkgName = reflect.TypeOf(tester{}).PkgPath()

func (ts *tester) Name() string { return pkgName }

// New creates a new GPU tester.
func New(cfg Config) Tester {
	cfg.Logger.Info("creating tester", zap.String("tester", pkgName))
	return &tester{cfg: cfg}
}

type tester struct {
	cfg Config
}
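// A minimal usage sketch (not part of the package API): a caller constructs
// the tester from an already-populated eksconfig.Config and Kubernetes client,
// then runs the two steps in order. The variable names here (lg, stopc,
// eksCfg, k8sClient) are hypothetical placeholders.
//
//	cfg := gpu.Config{
//		Logger:    lg,        // *zap.Logger
//		LogWriter: os.Stderr,
//		Stopc:     stopc,     // closed to abort retries
//		EKSConfig: eksCfg,    // with KubectlPath and KubeConfigPath set
//		K8SClient: k8sClient, // k8s_client.EKS
//	}
//	ts := gpu.New(cfg)
//	if err := ts.InstallNvidiaDriver(); err != nil {
//		lg.Fatal("driver install failed", zap.Error(err))
//	}
//	if err := ts.CreateNvidiaSMI(); err != nil {
//		lg.Fatal("nvidia-smi check failed", zap.Error(err))
//	}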
// https://github.com/NVIDIA/k8s-device-plugin/blob/master/nvidia-device-plugin.yml
// https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta5/nvidia-device-plugin.yml
// kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta5/nvidia-device-plugin.yml
const nvidiaDriverTemplate = `
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      # This annotation is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      # This toleration is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      - key: CriticalAddonsOnly
        operator: Exists
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
      - image: nvidia/k8s-device-plugin:1.0.0-beta5
        name: nvidia-device-plugin-ctr
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
          - name: device-plugin
            mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
`
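// Once the DaemonSet above is applied, a quick manual verification (outside
// this tester) could be, assuming kubectl points at the same cluster:
//
//	kubectl -n kube-system rollout status ds/nvidia-device-plugin-daemonset
//	kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"
//
// Nodes where the plugin registered successfully advertise a non-zero
// "nvidia.com/gpu" allocatable resource.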
// https://github.com/NVIDIA/k8s-device-plugin/releases
// https://docs.aws.amazon.com/eks/latest/userguide/create-managed-node-group.html
// https://docs.aws.amazon.com/eks/latest/userguide/gpu-ami.html
// https://github.com/NVIDIA/k8s-device-plugin
// https://github.com/NVIDIA/k8s-device-plugin/blob/master/nvidia-device-plugin.yml
// https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta5/nvidia-device-plugin.yml
func (ts *tester) InstallNvidiaDriver() (err error) {
	if !ts.cfg.EKSConfig.IsEnabledAddOnNodeGroups() && !ts.cfg.EKSConfig.IsEnabledAddOnManagedNodeGroups() {
		ts.cfg.Logger.Info("skipping nvidia driver install")
		return nil
	}

	ts.cfg.Logger.Info("starting tester.InstallNvidiaDriver", zap.String("tester", pkgName))
	fpath, err := fileutil.WriteTempFile([]byte(nvidiaDriverTemplate))
	if err != nil {
		return err
	}
	applyArgs := []string{
		ts.cfg.EKSConfig.KubectlPath,
		"--kubeconfig=" + ts.cfg.EKSConfig.KubeConfigPath,
		"apply",
		"-f",
		fpath,
	}
	applyCmd := strings.Join(applyArgs, " ")

	applied := false
	retryStart, waitDur := time.Now(), 3*time.Minute
	for time.Since(retryStart) < waitDur {
		select {
		case <-ts.cfg.Stopc:
			ts.cfg.Logger.Warn("install nvidia GPU driver stopped")
			return nil
		case <-time.After(5 * time.Second):
		}

		ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
		output, err := exec.New().CommandContext(ctx, applyArgs[0], applyArgs[1:]...).CombinedOutput()
		cancel()
		out := strings.TrimSpace(string(output))
		fmt.Fprintf(ts.cfg.LogWriter, "\n\n'%s' output:\n\n%s\n\n", applyCmd, out)
		if err != nil {
			ts.cfg.Logger.Warn("failed to create nvidia GPU driver", zap.Error(err))
			time.Sleep(5 * time.Second)
			continue
		}

		applied = true
		ts.cfg.Logger.Info("created nvidia GPU driver")
		break
	}
	if !applied {
		return errors.New("failed to install nvidia GPU driver")
	}

	if ts.cfg.EKSConfig.IsEnabledAddOnNodeGroups() {
		cnt := 0
		for _, cur := range ts.cfg.EKSConfig.AddOnNodeGroups.ASGs {
			if cur.AMIType == ec2config.AMITypeAL2X8664GPU {
				cnt++
			}
		}

		waitDur := 5 * time.Minute
		var items []v1.Node
		retryStart := time.Now()

		readyNGs := make(map[string]struct{})
		for time.Since(retryStart) < waitDur {
			if len(readyNGs) == cnt {
				break
			}
			for ngName, cur := range ts.cfg.EKSConfig.AddOnNodeGroups.ASGs {
				if cur.AMIType != ec2config.AMITypeAL2X8664GPU {
					ts.cfg.Logger.Warn("skipping non-GPU AMI", zap.String("ng-name", ngName))
					continue
				}
				if _, ok := readyNGs[ngName]; ok {
					ts.cfg.Logger.Info("skipping already ready ng", zap.String("ng-name", ngName))
					continue
				}
				ts.cfg.Logger.Info("listing GPU nodes via client-go", zap.String("ng-name", ngName))
				ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
				nodes, err := ts.cfg.K8SClient.KubernetesClientSet().CoreV1().Nodes().List(
					ctx,
					metav1.ListOptions{
						// TODO: filter by GPU?
						// FieldSelector: fields.OneTermEqualSelector("metadata.name", "GPU").String(),
					},
				)
				cancel()
				if err != nil {
					ts.cfg.Logger.Warn("get nodes failed", zap.Error(err))
					time.Sleep(5 * time.Second)
					continue
				}
				items = nodes.Items
				ts.cfg.Logger.Info("listed GPU nodes via client-go", zap.String("ng-name", ngName), zap.Int("nodes", len(items)))

				foundReady := int32(0)
				for _, node := range items {
					labels := node.GetLabels()
					if labels["NGName"] != ngName {
						continue
					}
					nodeName := node.GetName()
					ts.cfg.Logger.Info("checking node-info conditions",
						zap.String("node-name", nodeName),
						zap.String("labels", fmt.Sprintf("%+v", labels)),
						zap.String("allocatable", fmt.Sprintf("%+v", node.Status.Allocatable)),
					)
					for _, cond := range node.Status.Conditions {
						if cond.Type != v1.NodeReady {
							continue
						}
						ts.cfg.Logger.Info("node info",
							zap.String("node-name", nodeName),
							zap.String("type", fmt.Sprintf("%s", cond.Type)),
							zap.String("status", fmt.Sprintf("%s", cond.Status)),
						)
						if cond.Status == v1.ConditionTrue {
							foundReady++
						}
					}
				}
				ts.cfg.Logger.Info("nodes",
					zap.Int32("current-ready-nodes", foundReady),
					zap.Int32("min-ready-nodes", cur.ASGMinSize),
					zap.Int32("desired-ready-nodes", cur.ASGDesiredCapacity),
				)
				time.Sleep(5 * time.Second)

				if foundReady >= cur.ASGMinSize {
					readyNGs[ngName] = struct{}{}
					break
				}
			}
		}
	}

	if ts.cfg.EKSConfig.IsEnabledAddOnManagedNodeGroups() {
		cnt := 0
		for _, cur := range ts.cfg.EKSConfig.AddOnManagedNodeGroups.MNGs {
			if cur.AMIType == eks.AMITypesAl2X8664Gpu {
				cnt++
			}
		}

		waitDur := 5 * time.Minute
		var items []v1.Node
		retryStart := time.Now()

		readyMNGs := make(map[string]struct{})
		for time.Since(retryStart) < waitDur {
			if len(readyMNGs) == cnt {
				break
			}
			for mngName, cur := range ts.cfg.EKSConfig.AddOnManagedNodeGroups.MNGs {
				if cur.AMIType != eks.AMITypesAl2X8664Gpu {
					ts.cfg.Logger.Warn("skipping non-GPU AMI", zap.String("mng-name", mngName))
					continue
				}
				if _, ok := readyMNGs[mngName]; ok {
					ts.cfg.Logger.Info("skipping already ready mng", zap.String("mng-name", mngName))
					continue
				}
				ts.cfg.Logger.Info("listing GPU nodes via client-go", zap.String("mng-name", mngName))
				ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
				nodes, err := ts.cfg.K8SClient.KubernetesClientSet().CoreV1().Nodes().List(
					ctx,
					metav1.ListOptions{
						// TODO: filter by GPU?
						// FieldSelector: fields.OneTermEqualSelector("metadata.name", "GPU").String(),
					},
				)
				cancel()
				if err != nil {
					ts.cfg.Logger.Warn("get nodes failed", zap.Error(err))
					time.Sleep(5 * time.Second)
					continue
				}
				items = nodes.Items
				ts.cfg.Logger.Info("listed GPU nodes via client-go", zap.String("mng-name", mngName), zap.Int("nodes", len(items)))

				foundReady := 0
				for _, node := range items {
					labels := node.GetLabels()
					if labels["NGName"] != mngName {
						continue
					}
					nodeName := node.GetName()
					ts.cfg.Logger.Info("checking node-info conditions", zap.String("node-name", nodeName), zap.String("labels", fmt.Sprintf("%+v", labels)))
					for _, cond := range node.Status.Conditions {
						if cond.Type != v1.NodeReady {
							continue
						}
						ts.cfg.Logger.Info("node info",
							zap.String("node-name", nodeName),
							zap.String("type", fmt.Sprintf("%s", cond.Type)),
							zap.String("status", fmt.Sprintf("%s", cond.Status)),
						)
						if cond.Status == v1.ConditionTrue {
							foundReady++
						}
					}
				}
				ts.cfg.Logger.Info("nodes",
					zap.Int("current-ready-nodes", foundReady),
					zap.Int("min-ready-nodes", cur.ASGMinSize),
					zap.Int("desired-ready-nodes", cur.ASGDesiredCapacity),
				)
				time.Sleep(5 * time.Second)

				if foundReady >= cur.ASGMinSize {
					readyMNGs[mngName] = struct{}{}
					break
				}
			}
		}
	}

	ts.cfg.EKSConfig.Sync()
	return nil
}
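// The List calls in InstallNvidiaDriver fetch every node and filter
// client-side on the "NGName" label. A sketch of the server-side alternative
// hinted at by the in-code TODOs, using a label selector instead of a field
// selector; illustrative only, not the package's current behavior:
//
//	import "k8s.io/apimachinery/pkg/labels"
//
//	selector := labels.SelectorFromSet(labels.Set{"NGName": ngName}).String()
//	nodes, err := ts.cfg.K8SClient.KubernetesClientSet().CoreV1().Nodes().List(
//		ctx,
//		metav1.ListOptions{LabelSelector: selector},
//	)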
/*
https://docs.aws.amazon.com/eks/latest/userguide/gpu-ami.html
takes about 1-min to finish

kubectl apply -f nvidia-smi.yaml
kubectl logs nvidia-smi

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.00    Driver Version: 418.87.00    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000000:00:1D.0 Off |                    0 |
| N/A   43C    P0    41W / 300W |      0MiB / 16130MiB |      1%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
*/
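// A sketch of what the "nvidia-smi.yaml" referenced above could contain; the
// file name comes from the AWS docs, and the manifest body mirrors the pod
// spec that CreateNvidiaSMI builds below (illustrative, not shipped by this
// package):
//
//	apiVersion: v1
//	kind: Pod
//	metadata:
//	  name: nvidia-smi
//	spec:
//	  restartPolicy: OnFailure
//	  containers:
//	  - name: nvidia-smi
//	    image: nvidia/cuda:9.2-devel
//	    args: ["nvidia-smi"]
//	    resources:
//	      limits:
//	        nvidia.com/gpu: 1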
func (ts *tester) CreateNvidiaSMI() error {
	ts.cfg.Logger.Info("starting tester.CreateNvidiaSMI", zap.String("tester", pkgName))

	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	_, err := ts.cfg.K8SClient.
		KubernetesClientSet().
		CoreV1().
		Pods("default").
		Create(
			ctx,
			&v1.Pod{
				TypeMeta: metav1.TypeMeta{
					APIVersion: "v1",
					Kind:       "Pod",
				},
				ObjectMeta: metav1.ObjectMeta{
					Name: "nvidia-smi",
				},
				Spec: v1.PodSpec{
					RestartPolicy: v1.RestartPolicyOnFailure,
					Containers: []v1.Container{
						{
							Name:  "nvidia-smi",
							Image: "nvidia/cuda:9.2-devel",
							Args:  []string{"nvidia-smi"},
							Resources: v1.ResourceRequirements{
								Limits: map[v1.ResourceName]resource.Quantity{
									v1.ResourceName("nvidia.com/gpu"): resource.MustParse("1"),
								},
							},
						},
					},
					// DO NOT SET node selector, it fails with
					// "Warning  FailedScheduling  20s (x2 over 91s)  default-scheduler  0/5 nodes are available: 3 node(s) didn't match node selector, 5 Insufficient nvidia.com/gpu."
					// NodeSelector: map[string]string{
					// 	"AMIType": ec2config.AMITypeAL2X8664GPU,
					// },
				},
			},
			metav1.CreateOptions{},
		)
	cancel()
	if err != nil {
		return err
	}

	ts.cfg.Logger.Info("checking nvidia-smi")
	select {
	case <-ts.cfg.Stopc:
		return errors.New("nvidia-smi install aborted")
	case <-time.After(time.Minute):
	}

	descDsArgs := []string{
		ts.cfg.EKSConfig.KubectlPath,
		"--kubeconfig=" + ts.cfg.EKSConfig.KubeConfigPath,
		"--namespace=kube-system",
		"describe",
		"daemonset.apps/nvidia-device-plugin-daemonset",
	}
	descDsCmd := strings.Join(descDsArgs, " ")

	descPoArgs := []string{
		ts.cfg.EKSConfig.KubectlPath,
		"--kubeconfig=" + ts.cfg.EKSConfig.KubeConfigPath,
		"--namespace=default",
		"describe",
		"pod/nvidia-smi",
	}
	descPoCmd := strings.Join(descPoArgs, " ")

	logsArgs := []string{
		ts.cfg.EKSConfig.KubectlPath,
		"--kubeconfig=" + ts.cfg.EKSConfig.KubeConfigPath,
		"--namespace=default",
		"logs",
		"nvidia-smi",
		"--timestamps",
	}
	logsCmd := strings.Join(logsArgs, " ")

	installed := false
	retryStart, waitDur := time.Now(), 3*time.Minute
	for time.Since(retryStart) < waitDur {
		select {
		case <-ts.cfg.Stopc:
			return errors.New("nvidia-smi check aborted")
		case <-time.After(5 * time.Second):
		}

		ts.cfg.Logger.Info("querying nvidia-smi")

		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
		out, err := exec.New().CommandContext(ctx, descDsArgs[0], descDsArgs[1:]...).CombinedOutput()
		cancel()
		output := strings.TrimSpace(string(out))
		if err != nil {
			ts.cfg.Logger.Warn("failed to kubectl describe daemonset.apps/nvidia-device-plugin-daemonset", zap.Error(err))
		}
		fmt.Fprintf(ts.cfg.LogWriter, "\n\n'%s' output:\n\n%s\n\n", descDsCmd, output)

		ctx, cancel = context.WithTimeout(context.Background(), 15*time.Second)
		out, err = exec.New().CommandContext(ctx, descPoArgs[0], descPoArgs[1:]...).CombinedOutput()
		cancel()
		output = strings.TrimSpace(string(out))
		if err != nil {
			ts.cfg.Logger.Warn("failed to kubectl describe pod/nvidia-smi", zap.Error(err))
		}
		fmt.Fprintf(ts.cfg.LogWriter, "\n\n'%s' output:\n\n%s\n\n", descPoCmd, output)

		ctx, cancel = context.WithTimeout(context.Background(), 15*time.Second)
		out, err = exec.New().CommandContext(ctx, logsArgs[0], logsArgs[1:]...).CombinedOutput()
		cancel()
		output = strings.TrimSpace(string(out))
		if err != nil {
			ts.cfg.Logger.Warn("failed to kubectl logs", zap.Error(err))
		}
		fmt.Fprintf(ts.cfg.LogWriter, "\n\n'%s' output:\n\n%s\n\n", logsCmd, output)

		if strings.Contains(output, "NVIDIA-SMI") && strings.Contains(output, "GPU-Util") {
			installed = true
			break
		}
	}

	if installed {
		ts.cfg.Logger.Info("checked nvidia-smi")
		ts.cfg.EKSConfig.Sync()
		return nil
	}

	ts.cfg.Logger.Warn("failed to test nvidia-smi")
	return errors.New("nvidia-smi failed")
}
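// Note: CreateNvidiaSMI leaves the "nvidia-smi" pod behind in the "default"
// namespace; re-running it against the same cluster would make the Create
// call above fail with an "already exists" error. A manual cleanup could be:
//
//	kubectl --namespace=default delete pod nvidia-smi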