# Validation document for a ParallelCluster AMI.
#
# Layout: a single `validate` phase whose steps each run a bash script
# (action: ExecuteBash). Early steps only print a value on stdout; later steps
# consume those values through `{{ validate.<StepName>.outputs.stdout }}`
# references and fail the phase with `exit 1` when a check does not hold.
name: ParallelClusterValidate
description: Validate ParallelCluster AMI
schemaVersion: 1.0

constants:
  # JSON file that the version-lookup steps query with jq.
  - CookbookDefaultFile:
      type: string
      value: /etc/chef/node_attributes.json

phases:
  - name: validate
    steps:
      ### basic ###
      # Prints "<ID>" or "<ID>.<VERSION_ID>" as read from /etc/os-release.
      - name: OperatingSystemRelease
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              FILE=/etc/os-release
              if [ -e ${FILE} ]; then
                . ${FILE}
                echo "${ID}${VERSION_ID:+.${VERSION_ID}}"
              else
                echo "The file '${FILE}' does not exist. Failing build." && exit 1
              fi

      # Get uniformed OS name
      # Maps the release string to one of: alinux2, centos7, ubuntu2004,
      # ubuntu2204, rhel8. Any other release fails the build.
      - name: OperatingSystemName
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              RELEASE='{{ validate.OperatingSystemRelease.outputs.stdout }}'

              if [ `echo "${RELEASE}" | grep -w '^amzn\.2'` ]; then
                OS='alinux2'
              elif [ `echo "${RELEASE}" | grep '^centos\.7'` ]; then
                OS='centos7'
              elif [ `echo "${RELEASE}" | grep '^ubuntu\.20'` ]; then
                OS='ubuntu2004'
              elif [ `echo "${RELEASE}" | grep '^ubuntu\.22'` ]; then
                OS='ubuntu2204'
              elif [ `echo "${RELEASE}" | grep '^rhel\.8'` ]; then
                OS='rhel8'
              else
                echo "Operating System '${RELEASE}' is not supported. Failing build." && exit 1
              fi

              echo ${OS}

      # Get input base AMI Architecture
      # Prints x86_64 or arm64 (for aarch64); anything else fails the build.
      - name: OperatingSystemArchitecture
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              ARCH=$(uname -m)
              case ${ARCH} in
                'x86_64')
                  echo 'x86_64'
                  ;;
                'aarch64')
                  echo 'arm64'
                  ;;
                *)
                  echo "The '${ARCH}' architecture is not supported. Failing build." && exit 1
                  ;;
              esac

      # Get platform name
      # Prints RHEL for alinux*/centos*/rhel* and DEBIAN for ubuntu*.
      - name: PlatformName
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              OS='{{ validate.OperatingSystemName.outputs.stdout }}'

              if [ `echo "${OS}" | grep -E '^(alinux|centos|rhel)'` ]; then
                PLATFORM='RHEL'
              elif [ `echo "${OS}" | grep -E '^ubuntu'` ]; then
                PLATFORM='DEBIAN'
              fi

              echo ${PLATFORM}

      # Get AWS region
      # Requests an instance-metadata token, reads the availability zone and
      # strips its last character to obtain the region.
      - name: AWSRegion
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              IMDS_TOKEN=$(curl --retry 3 --retry-delay 0 -s --fail -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
              AVAIL_ZONE=$(curl --retry 3 --retry-delay 0 -s --fail -H "X-aws-ec2-metadata-token: ${IMDS_TOKEN}" http://169.254.169.254/latest/meta-data/placement/availability-zone)
              AWS_REGION=${AVAIL_ZONE::-1}
              echo ${AWS_REGION}

      ### conditions ###
      # Prints "true" unless the architecture is arm64.
      - name: IntelMPISupported
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              [[ {{ validate.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]] && echo "true" || echo "false"

      # Prints "false" on arm64, "true" otherwise.
      - name: FabricManagerSupported
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              [[ {{ validate.OperatingSystemArchitecture.outputs.stdout }} == 'arm64' ]] && echo "false" || echo "true"

      # Prints "true" on x86_64, or on arm64 when the OS is one of
      # ubuntu2004/ubuntu2204/alinux2/rhel8; "false" otherwise.
      - name: LustreSupported
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              ARCHITECTURE='{{ validate.OperatingSystemArchitecture.outputs.stdout }}'
              OS='{{ validate.OperatingSystemName.outputs.stdout }}'
              if [ ${ARCHITECTURE} == 'arm64' ] && [[ ${OS} =~ ^(ubuntu(20|22)04|alinux2|rhel8)$ ]] || [ ${ARCHITECTURE} == 'x86_64' ]; then
                echo "true"
              else
                echo "false"
              fi

      ### versions ###
      # Each step below reads one attribute from CookbookDefaultFile with jq and
      # prints it with surrounding quotes/whitespace removed (via xargs).
      - name: MungeVersion
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              PATTERN=$(jq '.default.cluster.munge.munge_version' {{ CookbookDefaultFile }})
              VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs)
              echo ${VERSION}

      - name: NvidiaDriverVersion
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              PATTERN=$(jq '.default.cluster.nvidia.driver_version' {{ CookbookDefaultFile }})
              VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs)
              echo ${VERSION}

      - name: CudaVersion
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              PATTERN=$(jq '.default.cluster.nvidia.cuda.version' {{ CookbookDefaultFile }})
              VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs)
              echo ${VERSION}

      # Prints the CUDA samples source directory (relative to /usr/local).
      # The "newer than 11.4" test uses `sort -V` so that versions are compared
      # component-wise; a plain string comparison ranks 11.10 below 11.4.
      - name: CudaSamplesSrcDir
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              cuda_ver="{{ validate.CudaVersion.outputs.stdout }}"
              newest=$(printf '%s\n' '11.4' "${cuda_ver}" | sort -V | tail -n 1)
              if [ "${newest}" != '11.4' ]; then
                PATTERN=$(jq '.default.cluster.nvidia.cuda_samples_version' {{ CookbookDefaultFile }})
                VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs)
                echo cuda-samples-${VERSION}/Samples
              else
                echo cuda-${cuda_ver}/samples
              fi

      # Prints the CUDA samples binary directory (relative to /usr/local).
      # Uses the same version-aware comparison as CudaSamplesSrcDir.
      - name: CudaSamplesBinDir
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              cuda_ver="{{ validate.CudaVersion.outputs.stdout }}"
              newest=$(printf '%s\n' '11.4' "${cuda_ver}" | sort -V | tail -n 1)
              if [ "${newest}" != '11.4' ]; then
                PATTERN=$(jq '.default.cluster.nvidia.cuda_samples_version' {{ CookbookDefaultFile }})
                VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs)
                echo cuda-samples-${VERSION}/bin
              else
                echo cuda-${cuda_ver}/samples/bin
              fi

      ### utils ###
      # Rewrites relative cookbook paths in every inspec.yml to absolute paths
      # under /etc/chef.
      - name: PatchInSpecProfiles
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              sed -Ei "s#path: cookbooks/aws-parallelcluster#path: /etc/chef/cookbooks/aws-parallelcluster#g" /etc/chef/cookbooks/aws-parallelcluster-*/test/inspec.yml
              echo "InSpec profiles patched"

      # Prints the raw value of .cluster.nvidia.enabled from image_dna.json.
      - name: NvidiaEnabled
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              NVIDIA_ENABLED=$(cat /etc/parallelcluster/image_dna.json | jq -r '.cluster.nvidia.enabled')
              echo "${NVIDIA_ENABLED}"

      # Prints "NVIDIA" when lspci lists an NVIDIA device, "false" otherwise.
      # Exactly one line is emitted: the value is substituted into a `[ ... ]`
      # test in NvidiaCudaFabricManager, where multi-line output would split
      # the test across lines.
      - name: HasGPU
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -v
              if lspci | grep -q "NVIDIA"; then
                echo "NVIDIA"
              else
                echo "false"
              fi

      # Fails unless `munge --version` mentions the expected version.
      - name: Munge
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              echo "check munge installed"
              munge --version | grep {{ validate.MungeVersion.outputs.stdout }}
              [[ $? -ne 0 ]] && echo "Check munge version failed" && exit 1
              echo "Munge test passed"

      # When Intel MPI is supported: checks the EFA packages and, on RHEL-like
      # platforms, that an intelmpi module under /opt/intel/mpi/20* is available.
      - name: EFAIntelMPI
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              PLATFORM='{{ validate.PlatformName.outputs.stdout }}'

              if [ {{ validate.IntelMPISupported.outputs.stdout }} == true ]; then
                echo "Checking efa packages installed..."
                if [ ${PLATFORM} == RHEL ]; then
                  rpm -qa | grep libfabric && rpm -qa | grep efa-
                  [[ $? -ne 0 ]] && echo "Check efa rpm failed" && exit 1

                  echo "Checking Intel MPI 20xx installed and module available..."
                  unset MODULEPATH
                  source /etc/profile.d/modules.sh
                  (module avail intelmpi)2>&1 | grep "/opt/intel/mpi/20.*/modulefiles"
                  [[ $? -ne 0 ]] && echo "Check Intel MPI failed" && exit 1
                else
                  dpkg -l | grep libfabric && modinfo efa | grep efa && [ -d /opt/amazon/efa ]
                  [[ $? -ne 0 ]] && echo "Check efa deb failed" && exit 1
                fi
              fi
              echo "EFA test passed"

      # Skipped when the Nvidia recipe is disabled or no GPU is present.
      # Otherwise verifies, in order: driver version, Fabric Manager package
      # version and lock (where supported), nvcc release, and deviceQuery.
      - name: NvidiaCudaFabricManager
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              PLATFORM='{{ validate.PlatformName.outputs.stdout }}'

              if [[ {{ validate.NvidiaEnabled.outputs.stdout }} == 'no' ]]; then
                echo "Nvidia recipe not enabled, skipping." && exit 0
              fi
              if [ {{ validate.HasGPU.outputs.stdout }} == "false" ]; then
                echo "No GPU detected, skipping." && exit 0
              fi

              driver_ver="{{ validate.NvidiaDriverVersion.outputs.stdout }}"
              export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin"

              echo "Testing Nvidia driver version"
              driver_output=$(nvidia-smi | grep -E -o "Driver Version: [0-9.]+")
              [[ "${driver_output}" != "Driver Version: ${driver_ver}" ]] && echo "ERROR Installed version ${driver_output} but expected ${driver_ver}" && exit 1
              echo "Correctly installed Nvidia ${driver_output}"

              if [ {{ validate.FabricManagerSupported.outputs.stdout }} == "true" ]; then
                echo "Testing Nvidia Fabric Manager version"
                nvidia_driver_version=$(modinfo -F version nvidia)
                if [ "${PLATFORM}" == "RHEL" ]; then
                  yum list installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1
                  yum versionlock list | grep "nvidia-fabric.*manager" || exit 1
                else
                  apt list --installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1
                  apt-mark showhold | grep "nvidia-fabric.*manager" || exit 1
                fi
                echo "Fabric Manager match Nvidia driver and version is locked"
              fi

              echo "Testing CUDA installation with nvcc"
              cuda_ver="{{ validate.CudaVersion.outputs.stdout }}"
              export PATH=/usr/local/cuda-${cuda_ver}/bin:${PATH}
              export LD_LIBRARY_PATH=/usr/local/cuda-${cuda_ver}/lib64:${LD_LIBRARY_PATH}
              cuda_output=$(nvcc -V | grep -E -o "release [0-9]+\.[0-9]+")
              [[ "${cuda_output}" != "release ${cuda_ver}" ]] && echo "ERROR Installed version ${cuda_output} but expected ${cuda_ver}" && exit 1
              echo "Correctly installed CUDA ${cuda_output}"

              echo "Testing CUDA with deviceQuery..."
              if [ {{ validate.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]; then
                /usr/local/cuda-${cuda_ver}/extras/demo_suite/deviceQuery | grep -o "Result = PASS"
                [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1
              else
                cd /usr/local/{{ validate.CudaSamplesSrcDir.outputs.stdout }}/1_Utilities/deviceQuery
                make
                /usr/local/{{ validate.CudaSamplesBinDir.outputs.stdout }}/sbsa/linux/release/deviceQuery | grep -o "Result = PASS"
                [[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1
              fi
              echo "CUDA deviceQuery test passed"

      # Checks for the lustre-client rpm, but only on centos7. The `$?` test
      # after the outer `fi` sees the grep status when the rpm check ran and 0
      # when no branch was taken.
      - name: FSxLustre
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              OS='{{ validate.OperatingSystemName.outputs.stdout }}'

              if [ {{ validate.LustreSupported.outputs.stdout }} == true ]; then
                echo "Checking for Lustre client..."
                if [ ${OS} == centos7 ]; then
                  rpm -qa | grep lustre-client
                fi
              fi
              [[ $? -ne 0 ]] && echo "Check for Lustre client failed" && exit 1
              echo "FSx Lustre test passed"

      # Fails when python3 is not on PATH.
      - name: Python
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              echo "Checking python3 installed..."
              which python3
              [[ $? -ne 0 ]] && echo "Python3 is not installed" && exit 1
              echo "Python test passed"

      # On non-DEBIAN platforms, fails when a dpkg command is present.
      - name: DPKG
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              PLATFORM='{{ validate.PlatformName.outputs.stdout }}'
              if [ ${PLATFORM} != DEBIAN ]; then
                echo "Checking dpkg is not installed on non-debian OS..."
                if command -v dpkg &> /dev/null; then
                  echo "ERROR: dpkg found on non-Debian system" && exit 1
                fi
                echo "dpkg test passed"
              fi

      # The remaining steps each run the `install`-tagged InSpec controls of one
      # cookbook from that cookbook's directory and fail on a non-zero exit.
      - name: InSpecValidationsForAwsBatch
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              echo "Performing InSpec validation for AwsBatch on the AMI..."
              cd /etc/chef/cookbooks/aws-parallelcluster-awsbatch
              inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit
              [[ $? -ne 0 ]] && echo "InSpec validation for AwsBatch failed" && exit 1
              echo "InSpec validation for AwsBatch passed"

      - name: InSpecValidationsForPlatform
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              echo "Performing InSpec validation for platform on the AMI..."
              cd /etc/chef/cookbooks/aws-parallelcluster-platform
              inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit
              [[ $? -ne 0 ]] && echo "InSpec validation for platform failed" && exit 1
              echo "InSpec validation for platform passed"

      - name: InSpecValidationsForEnvironment
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              echo "Performing InSpec validation for environment on the AMI..."
              cd /etc/chef/cookbooks/aws-parallelcluster-environment
              inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit
              [[ $? -ne 0 ]] && echo "InSpec validation for environment failed" && exit 1
              echo "InSpec validation for environment passed"

      - name: InSpecValidationsForComputeFleet
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              echo "Performing InSpec validation for compute fleet on the AMI..."
              cd /etc/chef/cookbooks/aws-parallelcluster-computefleet
              inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit
              [[ $? -ne 0 ]] && echo "InSpec validation for compute fleet failed" && exit 1
              echo "InSpec validation for compute fleet passed"

      - name: InSpecValidationsForShared
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              echo "Performing InSpec validation for shared cookbook on the AMI..."
              cd /etc/chef/cookbooks/aws-parallelcluster-shared
              inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit
              [[ $? -ne 0 ]] && echo "InSpec validation for shared cookbook failed" && exit 1
              echo "InSpec validation for shared cookbook passed"

      - name: InSpecValidationsForSlurm
        action: ExecuteBash
        inputs:
          commands:
            - |
              set -vx
              echo "Performing InSpec validation for Slurm on the AMI..."
              cd /etc/chef/cookbooks/aws-parallelcluster-slurm
              inspec exec test --profiles-path . --controls /tag:install/ --no-distinct-exit
              [[ $? -ne 0 ]] && echo "InSpec validation for Slurm failed" && exit 1
              echo "InSpec validation for slurm passed"