{
  "variables": {
    "region": "us-east-1",
    "flag": "al2022-base",
    "subnet_id": "subnet-baff22e5",
    "security_groupids": "sg-053996a563511a3c6,sg-050407dfb1c555723",
    "build_ami": "ami-0b45a5b26d1df8ed7",
    "efa_pkg": "aws-efa-installer-latest.tar.gz",
    "intel_mkl_version": "intel-mkl-2020.0-088",
    "nvidia_version": "515.86.01",
    "cuda_version": "cuda-toolkit-11-6",
    "cudnn_version": "libcudnn8",
    "nccl_version": "v2.12.7-1"
  },
  "builders": [
    {
      "type": "amazon-ebs",
      "region": "{{user `region`}}",
      "source_ami": "{{user `build_ami`}}",
      "run_tags": {
        "Name": "packer-gpu-processor_{{user `flag`}}"
      },
      "subnet_id": "{{user `subnet_id`}}",
      "security_group_ids": "{{user `security_groupids`}}",
      "instance_type": "g5.2xlarge",
      "ssh_username": "ec2-user",
      "ami_name": "aws_{{user `flag`}}_{{user `nvidia_version`}}-{{timestamp}}",
      "launch_block_device_mappings": [
        {
          "delete_on_termination": true,
          "device_name": "/dev/xvda",
          "volume_size": 100,
          "throughput": 1000,
          "iops": 10000,
          "volume_type": "gp3"
        }
      ]
    }
  ],
  "provisioners": [
    {
      "type": "shell",
      "expect_disconnect": true,
      "inline": [
        "sudo dnf update -y",
        "sudo dnf install amazon-cloudwatch-agent python3 yum-utils rsync ldconfig cmake dkms mdadm git htop hwloc kernel-tools rpm-build rpmdevtools numactl parallel pigz wget kernel-devel kernel-headers make check check-devel -y",
        "sudo dnf groupinstall 'Development Tools' -y",
        "curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py && sudo python3 /tmp/get-pip.py",
        "uname -r",
        "echo 'blacklist nouveau' | sudo tee /etc/modprobe.d/nvidia-graphics-drivers.conf",
        "echo 'blacklist lbm-nouveau' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
        "echo 'alias nouveau off' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
        "echo 'alias lbm-nouveau off' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf",
        "sudo shutdown -r now"
      ]
    },
    {
      "type": "shell",
      "pause_before": "30s",
      "inline": [
        "echo 'net.core.default_qdisc = fq' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.ipv4.tcp_congestion_control = bbr' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.ipv4.tcp_timestamps = 0' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.core.rmem_max = 67108864' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.core.wmem_max = 67108864' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.ipv4.tcp_rmem = 4096 87380 67108864' | sudo tee -a /etc/sysctl.conf",
        "echo 'net.ipv4.tcp_wmem = 4096 65536 67108864' | sudo tee -a /etc/sysctl.conf",
        "sudo sysctl -p"
      ]
    },
    {
      "type": "shell",
      "inline": [
        "uname -r",
        "wget -O /tmp/awscli2.zip https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip",
        "cd /tmp && sudo unzip /tmp/awscli2.zip",
        "sudo /tmp/aws/install",
        "aws configure set default.s3.max_concurrent_requests 100",
        "aws configure set default.s3.max_queue_size 10000",
        "aws configure set default.s3.multipart_threshold 64MB",
        "aws configure set default.s3.multipart_chunksize 16MB"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "echo ' StrictHostKeyChecking no' | sudo tee -a /etc/ssh/ssh_config",
        "echo ' HostbasedAuthentication no' | sudo tee -a /etc/ssh/ssh_config",
        "echo ' CheckHostIP no' | sudo tee -a /etc/ssh/ssh_config"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "cd /tmp",
        "sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora35/x86_64/cuda-fedora35.repo",
        "sudo wget -O /tmp/NVIDIA-Linux-driver.run 'https://us.download.nvidia.com/tesla/{{user `nvidia_version`}}/NVIDIA-Linux-x86_64-{{user `nvidia_version`}}.run'",
        "sudo sh /tmp/NVIDIA-Linux-driver.run -q -a --ui=none",
        "sudo dnf install {{user `cuda_version`}} -y",
        "sudo curl -O https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive.tar.xz",
        "sudo tar xf fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive.tar.xz -C /tmp",
        "sudo rsync -al /tmp/fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive/ /usr/ --exclude LICENSE",
        "sudo mv /usr/systemd/nvidia-fabricmanager.service /usr/lib/systemd/system",
        "sudo systemctl enable nvidia-fabricmanager",
        "echo 'options nvidia NVreg_EnableGpuFirmware=0' | sudo tee /etc/modprobe.d/nvidia-gsp.conf",
        "echo -e '#!/bin/sh\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64\nexport PATH=$PATH:/usr/local/cuda/bin' | sudo tee /etc/profile.d/cuda.sh",
        "echo -e '#!/bin/bash\nnvidia-smi -pm 1' | sudo tee /etc/rc.local",
        "sudo chmod +x /etc/rc.local"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "sudo dnf install docker -y",
        "curl -s -L https://nvidia.github.io/nvidia-docker/centos8/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo",
        "sudo dnf install -y nvidia-container-toolkit",
        "sudo sed -i 's/^OPTIONS/#&/' /etc/sysconfig/docker",
        "echo -e '{\"default-ulimits\":{\"memlock\":{\"Name\":\"memlock\",\"Soft\":-1,\"Hard\":-1}},\"default-runtime\":\"nvidia\",\"runtimes\":{\"nvidia\":{\"path\":\"nvidia-container-runtime\",\"runtimeArgs\":[]}}}' | sudo tee /etc/docker/daemon.json",
        "sudo systemctl restart docker",
        "sudo usermod -aG docker ec2-user"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "sudo mkdir -p /opt/aws",
        "wget -O /tmp/aws-gpu-boost-clock.sh 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/boost/aws-gpu-boost-clock.sh'",
        "wget -O /tmp/aws-gpu-boost-clock.service 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/boost/aws-gpu-boost-clock.service'",
        "sudo mv /tmp/aws-gpu-boost-clock.sh /opt/aws/ && sudo chmod +x /opt/aws/aws-gpu-boost-clock.sh",
        "sudo mv /tmp/aws-gpu-boost-clock.service /lib/systemd/system",
        "sudo systemctl enable aws-gpu-boost-clock.service && sudo systemctl start aws-gpu-boost-clock.service"
      ]
    },
    {
      "type": "shell",
      "inline_shebang": "/bin/bash -xe",
      "inline": [
        "sudo /usr/local/bin/pip3 install boto3",
        "sudo mkdir -p /opt/aws",
        "sudo wget -O /tmp/cwa.rpm 'https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm'",
        "sudo rpm -iUhv /tmp/cwa.rpm",
        "git clone https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline.git /tmp/aws-efa-nccl-baseami",
        "sudo mv /tmp/aws-efa-nccl-baseami/nvidia-efa-ami_base/cloudwatch /opt/aws/",
        "sudo mv /opt/aws/cloudwatch/aws-hw-monitor.service /lib/systemd/system",
        "echo -e '#!/bin/sh\n' | sudo tee /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/nvidia/aws-hwaccel-event-parser.py &' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/nvidia/accel-to-cw.py /opt/aws/cloudwatch/nvidia/nvidia-exporter >> /dev/null 2>&1 &\n' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/efa/efa-to-cw.py /opt/aws/cloudwatch/efa/efa-exporter >> /dev/null 2>&1 &\n' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "sudo chmod +x /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh",
        "sudo cp /opt/aws/cloudwatch/nvidia/cwa-config.json /opt/aws/amazon-cloudwatch-agent/bin/config.json",
        "sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json -s",
        "sudo systemctl enable aws-hw-monitor.service",
        "sudo systemctl restart amazon-cloudwatch-agent.service"
      ]
    }
  ]
}