{ "variables": { "region": "us-east-1", "flag": "al2-pyxis", "subnet_id": "subnet-baff22e5", "security_groupids": "sg-053996a563511a3c6,sg-050407dfb1c555723", "build_ami": "ami-02354e95b39ca8dec", "efa_pkg": "aws-efa-installer-latest.tar.gz", "intel_mkl_version": "intel-mkl-2020.0-088", "nvidia_version": "515.86.01", "cuda_version": "cuda-toolkit-11-7", "cudnn_version": "libcudnn8", "nccl_version": "v2.12.7-1", "docker_image": "nvcr.io/nvidia/cuda:11.0.3-runtime-ubuntu18.04", "slurm_version" : "22.05.6" }, "builders": [{ "type": "amazon-ebs", "region": "{{user `region`}}", "source_ami": "{{user `build_ami`}}", "run_tags": { "Name": "packer-gpu-processor_{{user `flag`}}" }, "subnet_id": "{{user `subnet_id`}}", "security_group_ids": "{{user `security_groupids`}}", "instance_type":"g4dn.8xlarge", "ssh_username": "ec2-user", "ami_name": "aws_{{user `flag`}}_{{user `nvidia_version`}}-{{timestamp}}", "launch_block_device_mappings":[{ "delete_on_termination": true, "device_name": "/dev/xvda", "volume_size": 200, "throughput": 1000, "iops": 10000, "volume_type": "gp3" }] }], "provisioners": [{ "type": "shell", "expect_disconnect": true, "inline": [ "sudo yum update -y", "sudo amazon-linux-extras install lustre2.10 epel kernel-5.10 -y", "sudo yum install amazon-cloudwatch-agent python3 yum-utils cmake dkms mdadm git htop hwloc iftop kernel-tools rpm-build rpmdevtools numactl parallel pigz python3-distutils wget kernel-devel kernel-headers check check-devel subunit subunit-devel -y", "sudo yum groupinstall 'Development Tools' -y", "curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py && sudo python3 /tmp/get-pip.py", "uname -r", "echo 'blacklist nouveau' | sudo tee /etc/modprobe.d/nvidia-graphics-drivers.conf", "echo 'blacklist lbm-nouveau' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf", "echo 'alias nouveau off' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf", "echo 'alias lbm-nouveau off' | sudo tee -a /etc/modprobe.d/nvidia-graphics-drivers.conf", "sudo shutdown -r now"] }, { "type": "shell", "inline": [ "echo 'net.core.default_qdisc = fq' | sudo tee -a /etc/sysctl.conf", "echo 'net.ipv4.tcp_congestion_control = bbr' | sudo tee -a /etc/sysctl.conf", "echo 'net.ipv4.tcp_timestamps = 0' | sudo tee -a /etc/sysctl.conf", "echo 'net.core.rmem_max = 67108864' | sudo tee -a /etc/sysctl.conf", "echo 'net.core.wmem_max = 67108864' | sudo tee -a /etc/sysctl.conf", "echo 'net.ipv4.tcp_rmem = 4096 87380 67108864' | sudo tee -a /etc/sysctl.conf", "echo 'net.ipv4.tcp_wmem = 4096 65536 67108864' | sudo tee -a /etc/sysctl.conf", "sudo sysctl -p"] }, { "type": "shell", "inline": [ "uname -r", "wget -O /tmp/awscli2.zip https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip", "cd /tmp && sudo unzip /tmp/awscli2.zip", "sudo /tmp/aws/install", "aws configure set default.s3.max_concurrent_requests 100", "aws configure set default.s3.max_queue_size 10000", "aws configure set default.s3.multipart_threshold 64MB", "aws configure set default.s3.multipart_chunksize 16MB"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "echo ' StrictHostKeyChecking no' | sudo tee -a /etc/ssh/ssh_config", "echo ' HostbasedAuthentication no' | sudo tee -a /etc/ssh/ssh_config", "echo ' CheckHostIP no' | sudo tee -a /etc/ssh/ssh_config"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "cd /tmp", "sudo yum install gcc10 kernel-devel kernel-headers -y", "sudo yum-config-manager --add-repo 
https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo", "sudo yum clean all", "sudo sed -i '/^\\[main\\]/a\\exclude=kernel*' /etc/yum.conf", "sudo wget -O /tmp/NVIDIA-Linux-driver.run 'https://us.download.nvidia.com/tesla/{{user `nvidia_version`}}/NVIDIA-Linux-x86_64-{{user `nvidia_version`}}.run'", "sudo CC=gcc10-cc sh /tmp/NVIDIA-Linux-driver.run -q -a --ui=none", "sudo curl -O https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive.tar.xz", "sudo tar xf fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive.tar.xz -C /tmp", "sudo rsync -al /tmp/fabricmanager-linux-x86_64-{{user `nvidia_version`}}-archive/ /usr/ --exclude LICENSE", "sudo mv /usr/systemd/nvidia-fabricmanager.service /usr/lib/systemd/system", "sudo systemctl enable nvidia-fabricmanager", "sudo yum -y install {{user `cuda_version`}} {{user `cudnn_version`}} {{user `cudnn_version`}}-devel", "echo -e 'options nvidia NVreg_EnableGpuFirmware=0' | sudo tee /etc/modprobe.d/nvidia-gsp.conf", "echo -e '#!/bin/sh\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64\nexport PATH=$PATH:/usr/local/cuda/bin' | sudo tee /etc/profile.d/cuda.sh", "sudo chmod +x /etc/rc.local"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "git clone https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy", "cd /tmp/gdrcopy/packages", "CUDA=/usr/local/cuda PATH=$PATH:/sbin ./build-rpm-packages.sh", "sudo rpm -Uvh *.rpm"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "distribution=$(. /etc/os-release;echo $ID$VERSION_ID)", "sudo amazon-linux-extras install docker", "sudo systemctl enable docker", "curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo", "sudo yum install -y nvidia-container-toolkit nvidia-docker2", "sudo sed -i 's/^OPTIONS/#&/' /etc/sysconfig/docker", "echo -e '{\"default-ulimits\":{\"memlock\":{\"Name\":\"memlock\",\"Soft\":-1,\"Hard\":-1}},\"default-runtime\":\"nvidia\",\"runtimes\":{\"nvidia\":{\"path\":\"nvidia-container-runtime\",\"runtimeArgs\":[]}}}' | sudo tee /etc/docker/daemon.json", "sudo systemctl restart docker", "sudo usermod -aG docker ec2-user", "sudo docker pull {{user `docker_image`}}"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "wget -O /tmp/aws-efa-installer.tar.gz https://efa-installer.amazonaws.com/{{user `efa_pkg`}}", "tar -xf /tmp/aws-efa-installer.tar.gz -C /tmp", "cd /tmp/aws-efa-installer", "sudo ./efa_installer.sh -y -g", "echo -e '#!/bin/sh\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib64\nexport PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin' | sudo tee /etc/profile.d/amazon_efa.sh"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "cd /opt", "sudo git clone https://github.com/NVIDIA/nccl.git", "cd nccl", "sudo git checkout {{user `nccl_version`}}", "sudo make -j src.build CUDA_HOME=/usr/local/cuda NVCC_GENCODE='-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80'", "echo -e '#!/bin/sh\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib\nexport NCCL_PROTO=simple' | sudo tee /etc/profile.d/nccl.sh"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "git clone https://github.com/aws/aws-ofi-nccl.git -b aws", "cd aws-ofi-nccl", "export 
PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin", "./autogen.sh", "sudo mkdir -p /opt/aws-ofi-nccl", "./configure --prefix=/opt/aws-ofi-nccl --with-mpi=/opt/amazon/openmpi --with-libfabric=/opt/amazon/efa --with-nccl=/opt/nccl/build --with-cuda=/usr/local/cuda", "make && sudo make install", "echo -e '#!/bin/sh\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/aws-ofi-nccl/lib' | sudo tee /etc/profile.d/aws-ofi-nccl.sh"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "git clone https://github.com/NVIDIA/nccl-tests.git", "cd nccl-tests", "make MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/opt/nccl/build"] }, { "type": "file", "source": "enroot/enroot.conf", "destination": "/tmp/enroot.conf" }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "sudo wget -O /tmp/slurm-devel.rpm https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/slurm/slurm-devel-22.05.6-1.amzn2.x86_64.rpm", "sudo wget -O /tmp/slurm.rpm https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/slurm/slurm-22.05.6-1.amzn2.x86_64.rpm", "cd /tmp && sudo yum localinstall -y /tmp/slurm-devel.rpm /tmp/slurm.rpm", "sudo yum install -y jq squashfs-tools parallel fuse-overlayfs libnvidia-container-tools pigz squashfuse", "export arch=$(uname -m) && sudo -E yum install -y https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot-3.4.0-2.el7.${arch}.rpm", "export arch=$(uname -m) && sudo -E yum install -y https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot+caps-3.4.0-2.el7.${arch}.rpm", "sudo mkdir /scratch && sudo chmod -R 777 /scratch", "git clone https://github.com/NVIDIA/pyxis.git /tmp/pyxis", "cd /tmp/pyxis && sudo make rpm && sudo rpm -ihv *.rpm", "sudo mv /tmp/enroot.conf /etc/enroot/enroot.conf", "sudo mkdir -p /opt/slurm/etc/plugstack.conf.d", "echo -e 'include /opt/slurm/etc/plugstack.conf.d/*' | sudo tee /opt/slurm/etc/plugstack.conf", "sudo ln -s -f /usr/share/pyxis/pyxis.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "sudo mkdir -p /opt/aws", "wget -O /tmp/aws-gpu-boost-clock.sh 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/boost/aws-gpu-boost-clock.sh'", "wget -O /tmp/aws-gpu-boost-clock.service 'https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline/raw/master/nvidia-efa-ami_base/boost/aws-gpu-boost-clock.service'", "sudo mv /tmp/aws-gpu-boost-clock.sh /opt/aws/ && sudo chmod +x /opt/aws/aws-gpu-boost-clock.sh", "sudo mv /tmp/aws-gpu-boost-clock.service /lib/systemd/system", "sudo systemctl enable aws-gpu-boost-clock.service && sudo systemctl start aws-gpu-boost-clock.service"] }, { "type": "shell", "inline_shebang": "/bin/bash -xe", "inline": [ "sudo /usr/local/bin/pip3 install boto3", "sudo mkdir -p /opt/aws", "sudo wget -O /tmp/cwa.rpm 'https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm'", "sudo rpm -iUhv /tmp/cwa.rpm", "git clone https://github.com/aws-samples/aws-efa-nccl-baseami-pipeline.git /tmp/aws-efa-nccl-baseami", "sudo mv /tmp/aws-efa-nccl-baseami/nvidia-efa-ami_base/cloudwatch /opt/aws/", "sudo mv /opt/aws/cloudwatch/aws-hw-monitor.service /lib/systemd/system", "echo -e '#!/bin/sh\n' | sudo tee /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh", "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/nvidia/aws-hwaccel-event-parser.py &' | sudo tee -a 
/opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh", "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/nvidia/accel-to-cw.py /opt/aws/cloudwatch/nvidia/nvidia-exporter >> /dev/null 2>&1 &\n' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh", "echo -e '/usr/bin/python3 /opt/aws/cloudwatch/efa/efa-to-cw.py /opt/aws/cloudwatch/efa/efa-exporter >> /dev/null 2>&1 &\n' | sudo tee -a /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh", "sudo chmod +x /opt/aws/cloudwatch/aws-cloudwatch-wrapper.sh", "sudo cp /opt/aws/cloudwatch/nvidia/cwa-config.json /opt/aws/amazon-cloudwatch-agent/bin/config.json", "sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json -s", "sudo systemctl enable aws-hw-monitor.service", "sudo systemctl restart amazon-cloudwatch-agent.service"] } ] }
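A minimal sketch of validating and building this template with Packer's legacy JSON workflow. The file name nvidia-efa-ml-al2.json and the variable overrides are illustrative, not part of the template; region, subnet, and security group values should point at your own account.

# Check JSON syntax and variable interpolation before launching anything.
packer validate nvidia-efa-ml-al2.json

# Build the AMI, overriding the account-specific variables as needed.
packer build \
  -var 'region=us-east-1' \
  -var 'subnet_id=subnet-xxxxxxxx' \
  -var 'security_groupids=sg-xxxxxxxxxxxxxxxxx' \
  nvidia-efa-ml-al2.json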
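A quick smoke test could be run on an instance launched from the resulting AMI to confirm the driver, EFA, and NCCL pieces installed by the provisioners above. The nccl-tests checkout lands in the build user's working directory during provisioning, so the ~/nccl-tests path below is an assumption and may need adjusting.

# GPU driver visible?
nvidia-smi

# EFA device exposed through libfabric?
/opt/amazon/efa/bin/fi_info -p efa

# Boost-clock and hardware-monitor services registered?
systemctl status aws-gpu-boost-clock.service --no-pager

# Single-node NCCL all-reduce across the local GPUs. The profile.d scripts
# written by the template normally set these paths for login shells already.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl/lib
/opt/amazon/openmpi/bin/mpirun -np 1 \
  ~/nccl-tests/build/all_reduce_perf -b 8 -e 256M -f 2 -g "$(nvidia-smi -L | wc -l)"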
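For the enroot/pyxis pieces, a containerized smoke test might look like the following. This assumes a working Slurm controller and node configuration, which this template installs packages for but does not create; the generated .sqsh file name follows enroot's usual naming and may differ.

# Import the same CUDA image pulled during the build and run nvidia-smi in it.
enroot import docker://nvcr.io#nvidia/cuda:11.0.3-runtime-ubuntu18.04
enroot create --name cuda-test nvidia+cuda+11.0.3-runtime-ubuntu18.04.sqsh
enroot start cuda-test nvidia-smi

# With a running Slurm cluster, pyxis adds --container-image to srun.
srun --container-image=nvcr.io#nvidia/cuda:11.0.3-runtime-ubuntu18.04 nvidia-smi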