{
  "_comment": "Packer amazon-ebs template that bakes a GPU AMI: NVIDIA driver, Fabric Manager, CUDA toolkit, GDRCopy, Docker with the nvidia runtime, the AWS EFA installer, GPU boost-clock and MIG helper scripts, and the CloudWatch agent. The first provisioner reboots the instance, so the provisioner after it sets pause_before to avoid starting while the machine is still shutting down.",
  "variables": {
    "region": "us-east-1",
    "flag": "eks-1.23",
    "subnet_id": "subnet-baff22e5",
    "security_groupids": "sg-053996a563511a3c6,sg-050407dfb1c555723",
    "build_ami": "ami-0c9424a408e18bcc9",
    "efa_pkg": "aws-efa-installer-latest.tar.gz",
    "intel_mkl_version": "intel-mkl-2020.0-088",
    "nvidia_version": "525.60.13",
    "cuda_version": "cuda-toolkit-11-8",
    "nccl_version": "v2.16.2-1"
  },
  "builders": [
    {
      "type": "amazon-ebs",
      "region": "{{user `region`}}",
      "source_ami": "{{user `build_ami`}}",
      "run_tags": {
        "Name": "packer-gpu-processor-{{user `flag`}}"
      },
      "subnet_id": "{{user `subnet_id`}}",
      "security_group_ids": "{{user `security_groupids`}}",
      "instance_type": "g5.8xlarge",
      "ssh_username": "ec2-user",
      "ami_name": "aws_{{user `flag`}}_{{user `nvidia_version`}}-{{timestamp}}",
      "launch_block_device_mappings": [
        {
          "delete_on_termination": true,
          "device_name": "/dev/xvda",
          "volume_size": 100,
          "throughput": 1000,
          "iops": 10000,
          "volume_type": "gp3"
        }
      ]
    }
  ],
  "provisioners": [
    {
      "type": "shell",
      "expect_disconnect": true,
      "inline": [
        "sudo yum update -y",
        "sudo amazon-linux-extras install lustre2.10 epel -y",
        "sudo yum install rsync yum-utils cmake sipcalc dkms mdadm git htop hwloc iftop kernel-tools rpm-build rpmdevtools numactl parallel pigz python2-pip python3-pip wget kernel-devel kernel-headers check check-devel subunit subunit-devel -y",
        "sudo yum groupinstall 'Development Tools' -y",
        "uname -r",
        "echo 'blacklist nouveau' | sudo tee /etc/modprobe.d/nvidia-graphics-drivers.conf",
        "echo 'blacklist lbm-nouveau' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
        "echo 'alias nouveau off' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
        "echo 'alias lbm-nouveau off' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
        "sudo shutdown -r now"
      ]
    },
    {
      "type": "shell",
      "pause_before": "30s",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "echo ' StrictHostKeyChecking no' | sudo tee -a /etc/ssh/ssh_config",
        "echo ' HostbasedAuthentication no' | sudo tee -a /etc/ssh/ssh_config",
        "echo ' CheckHostIP no' | sudo tee -a /etc/ssh/ssh_config"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "cd /tmp",
        "sudo yum install gcc10 kernel-devel kernel-headers -y",
        "sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo",
        "sudo yum clean all",
        "sudo sed -i '/^\\[main\\]/a\\exclude=kernel*' /etc/yum.conf",
        "sudo wget -O /tmp/NVIDIA-Linux-driver.run 'https://us.download.nvidia.com/tesla/{{user `nvidia_version`}}/NVIDIA-Linux-x86_64-{{user `nvidia_version`}}.run'",
        "sudo CC=gcc10-cc sh /tmp/NVIDIA-Linux-driver.run -q -a --ui=none",
        "sudo curl -O https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive.tar.xz",
        "sudo tar xf fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive.tar.xz -C /tmp",
        "sudo rsync -al /tmp/fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive/ /usr/ --exclude LICENSE",
        "sudo mv /usr/systemd/nvidia-fabricmanager.service /usr/lib/systemd/system",
        "sudo systemctl enable nvidia-fabricmanager",
        "sudo yum -y install {{user `cuda_version`}}",
        "echo -e 'options nvidia NVreg_EnableGpuFirmware=0' | sudo tee /etc/modprobe.d/nvidia-gsp.conf",
        "echo -e '#!/bin/sh\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64\nexport PATH=$PATH:/usr/local/cuda/bin' | sudo tee /etc/profile.d/cuda.sh",
        "sudo chmod +x /etc/rc.local"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "git clone https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy",
        "cd /tmp/gdrcopy/packages",
        "CUDA=/usr/local/cuda PATH=$PATH:/sbin ./build-rpm-packages.sh",
        "sudo rpm -Uvh *.rpm"
      ]
    },
    {
      "type": "shell",
      "inline": [
        "echo 'net.core.default_qdisc = fq' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.ipv4.tcp_congestion_control = bbr' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.ipv4.tcp_timestamps = 0' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.core.rmem_max = 67108864' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.core.wmem_max = 67108864' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.ipv4.tcp_rmem = 4096 87380 67108864' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.ipv4.tcp_wmem = 4096 65536 67108864' | sudo tee -a /etc/sysctl.conf",
        "sudo sysctl -p"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "distribution=$(. /etc/os-release;echo $ID$VERSION_ID)",
        "sudo amazon-linux-extras install docker -y",
        "sudo systemctl enable docker",
        "curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo",
        "sudo yum install -y nvidia-container-toolkit nvidia-docker2",
        "echo -e '{\"live-restore\": true,\"max-concurrent-downloads\": 10,\"default-ulimits\":{\"memlock\":{\"Name\":\"memlock\",\"Soft\":-1,\"Hard\":-1}},\"default-runtime\":\"nvidia\",\"runtimes\":{\"nvidia\":{\"path\":\"nvidia-container-runtime\",\"runtimeArgs\":[]}}}' | sudo tee /etc/docker/daemon.json",
        "sudo systemctl restart docker",
        "sudo usermod -aG docker ec2-user"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "wget -O /tmp/aws-efa-installer.tar.gz https://efa-installer.amazonaws.com/{{user `efa_pkg`}}",
        "tar -xf /tmp/aws-efa-installer.tar.gz -C /tmp",
        "cd /tmp/aws-efa-installer",
        "sudo ./efa_installer.sh -y -g",
        "echo -e '#!/bin/sh\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib64\nexport PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin' | sudo tee /etc/profile.d/amazon_efa.sh"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "sudo mkdir -p /opt/aws",
        "wget -O /tmp/aws-gpu-boost-clock.sh 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/boost/aws-gpu-boost-clock.sh'",
        "wget -O /tmp/aws-gpu-boost-clock.service 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/boost/aws-gpu-boost-clock.service'",
        "sudo mv /tmp/aws-gpu-boost-clock.sh /opt/aws/ && sudo chmod +x /opt/aws/aws-gpu-boost-clock.sh",
        "sudo mv /tmp/aws-gpu-boost-clock.service /lib/systemd/system",
        "sudo systemctl enable aws-gpu-boost-clock.service && sudo systemctl start aws-gpu-boost-clock.service"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "sudo mkdir -p /opt/mig",
        "echo -e '#!/bin/bash\nMIG_PARTITION=7' | sudo tee /etc/default/mig",
        "wget -O /tmp/create_mig.sh 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/mig/create_mig.sh'",
        "wget -O /tmp/aws-gpu-mig.service 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/mig/aws-gpu-mig.service'",
        "sudo mv /tmp/create_mig.sh /opt/mig/ && sudo chmod +x /opt/mig/create_mig.sh",
        "sudo mv /tmp/aws-gpu-mig.service /lib/systemd/system"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "sudo /usr/bin/pip3 install boto3",
        "sudo mkdir -p /opt/aws",
        "sudo wget -O /tmp/cwa.rpm 'https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm'",
        "sudo rpm -iUhv /tmp/cwa.rpm",
        "git clone https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline.git /tmp/aws-efa-nccl-baseami",
        "sudo mv /tmp/aws-efa-nccl-baseami/nvidia-efa-ami_base/cloudwatch /opt/aws/",
        "sudo mv /opt/aws/cloudwatch/aws-hw-monitor.service /lib/systemd/system",
        "echo -e '#!/bin/sh\n' | sudo tee /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/nvidia/aws-hwaccel-event-parser.py &' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/nvidia/accel-to-cw.py /opt/aws/cloudwatch/nvidia/nvidia-exporter >> /dev/null 2>&1 &\n' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/efa/efa-to-cw.py /opt/aws/cloudwatch/efa/efa-exporter >> /dev/null 2>&1 &\n' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "sudo chmod +x /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "sudo cp /opt/aws/cloudwatch/nvidia/cwa-config.json /opt/aws/amazon-cloudwatch-agent/bin/config.json",
        "sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json -s",
        "sudo systemctl enable aws-hw-monitor.service",
        "sudo systemctl restart amazon-cloudwatch-agent.service"
      ]
    }
  ]
}