# Hugging Face / SageMaker training image built on top of the AWS PyTorch 1.12
# GPU training Deep Learning Container. The build:
#   1. replaces the bundled NCCL with a source build and rebuilds aws-ofi-nccl,
#   2. swaps in custom torch / torch_xla / torchvision / torchaudio wheels,
#   3. reinstalls the SageMaker toolkits, smdistributed.dataparallel and Horovod,
#   4. applies CVE-related package upgrades and runs the OSS compliance script.

# docker image region us-west-2
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.12-gpu-py38-cu113-ubuntu20.04-sagemaker-v1 AS sagemaker

########
### :::Update NCCL:::
### This is a temporary workaround to
### upgrade NCCL
########

# Remove Older NCCL
# The command substitution is intentionally unquoted so that every path found
# under /usr/local containing "nccl" becomes a separate argument to rm.
RUN rm -rvf $(find /usr/local | grep nccl) /usr/local/obj

ENV NCCL_VERSION=2.12.12

# Install Newer NCCL
# Builds tag v${NCCL_VERSION}-1 from source directly into /usr/local and removes
# the checkout in the same layer so the sources do not persist in the image.
RUN cd /tmp \
 && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
 && cd nccl \
 && make -j "$(nproc)" src.build BUILDDIR=/usr/local \
 && rm -rf /tmp/nccl

# Reinstall AWS OFI NCCL
# Rebuilt against the NCCL installed above (--with-nccl=/usr/local) and the
# libfabric / Open MPI trees under /opt/amazon.
ENV BRANCH_OFI=1.4.0-aws
RUN mkdir /tmp/efa-ofi-nccl \
 && cd /tmp/efa-ofi-nccl \
 && git clone https://github.com/aws/aws-ofi-nccl.git -b v${BRANCH_OFI} \
 && cd aws-ofi-nccl \
 && ./autogen.sh \
 && ./configure \
      --with-libfabric=/opt/amazon/efa \
      --with-mpi=/opt/amazon/openmpi \
      --with-cuda=/usr/local/cuda \
      --with-nccl=/usr/local \
      --prefix=/usr/local \
 && make \
 && make install \
 && rm -rf /tmp/efa-ofi-nccl \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

########
### :::Update NCCL:::
########

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

# Version args - overwritten by args specified in buildspec
ARG PYTHON=python3
ARG PT_BUCKET=https://aws-pytorch-unified-cicd-binaries.s3.us-west-2.amazonaws.com/trcomp/r1.12.0_sm/20221202-082049/72513603d1a1d74129e980c4935de5bcd0959691
ARG PT_URL=${PT_BUCKET}/torch-1.12.0%2Bcu113-cp38-cp38-linux_x86_64.whl
ARG PT_XLA_URL=${PT_BUCKET}/torch_xla-1.12-cp38-cp38-linux_x86_64.whl
ARG TORCHVISION_URL=${PT_BUCKET}/torchvision-0.13.0a0%2Bda3794e-cp38-cp38-linux_x86_64.whl
ARG TORCHAUDIO_URL=${PT_BUCKET}/torchaudio-0.12.0%2B2e13884-cp38-cp38-linux_x86_64.whl
ARG SMD_DATA_PARALLEL_URL=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.12.0/cu113-compiler/2022-08-09/smdistributed_dataparallel-1.5.0-cp38-cp38-linux_x86_64.whl

# Install Sagemaker PythonSDK
RUN pip install --no-cache-dir sagemaker

# TODO: Remove upgrade statements once packages are updated in base image
# apt-get is used instead of apt: the apt front end does not promise a stable
# command-line interface for scripted use.
RUN apt-get update \
 && apt-get -y upgrade --only-upgrade systemd openssl cryptsetup \
 && apt-get install -y \
      git-lfs \
      libomp5 \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Install PyTorch
# The wheel is installed straight from its URL with --no-cache-dir, so there is
# no downloaded file to delete afterwards.
RUN pip uninstall -y torch \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${PT_URL}"

# Install PyTorch XLA
RUN pip uninstall -y torch_xla \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${PT_XLA_URL}"

# Install TorchVision
RUN pip uninstall -y torchvision \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${TORCHVISION_URL}"

# Install TorchAudio
RUN pip uninstall -y torchaudio \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${TORCHAUDIO_URL}"

ARG CONDA_PREFIX=/opt/conda

# Upgrade numpy. Per the original note, an earlier install step downgrades numpy
# to a version that is incompatible with the PyTorch wheel.
# NOTE(review): the original comment refers to an "above conda command", but no
# conda command exists in this file - confirm which step causes the downgrade.
RUN pip install --no-cache-dir numpy --upgrade

# Fix library links
# Creates ".so.1" aliases for the MKL shared objects under ${CONDA_PREFIX}/lib.
RUN ln -s ${CONDA_PREFIX}/lib/libmkl_intel_lp64.so ${CONDA_PREFIX}/lib/libmkl_intel_lp64.so.1 \
 && ln -s ${CONDA_PREFIX}/lib/libmkl_intel_thread.so ${CONDA_PREFIX}/lib/libmkl_intel_thread.so.1 \
 && ln -s ${CONDA_PREFIX}/lib/libmkl_core.so ${CONDA_PREFIX}/lib/libmkl_core.so.1 \
 && ln -s ${CONDA_PREFIX}/lib/libmkl_gnu_thread.so ${CONDA_PREFIX}/lib/libmkl_gnu_thread.so.1

# Install SM Distributed DataParallel binary
RUN SMDATAPARALLEL_PT=1 pip install --no-cache-dir "${SMD_DATA_PARALLEL_URL}" --force-reinstall

# Install SageMaker Training Toolkit
RUN pip uninstall -y sagemaker-training \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "sagemaker-training>=4.3.2"

# Install SageMaker PyTorch Training Toolkit
RUN pip uninstall -y sagemaker-pytorch-training \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "sagemaker-pytorch-training>=2.6.1,<3"

# CVE Fixes
RUN pip install --no-cache-dir -U "protobuf<3.20.0"

# The apt package lists are removed in the same layer that downloads them so
# they do not add to the image size.
RUN apt-get update -y \
 && apt-get upgrade -y openssl \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Uninstall Torch Data - not needed for NLP support with PT 1.12
RUN pip uninstall -y torchdata

# Reinstall Horovod
# HOROVOD_VERSION is not set anywhere in this file; presumably it is inherited
# from the base image's environment - verify. The ":?" guard aborts the step
# with a clear message, before the existing Horovod is removed, if it is unset
# or empty.
# The first ldconfig call is given the CUDA stubs directory for the duration of
# the build; the trailing bare ldconfig rebuilds the linker cache without it.
RUN : "${HOROVOD_VERSION:?HOROVOD_VERSION must be set to reinstall Horovod}" \
 && pip uninstall -y horovod \
 && ldconfig /usr/local/cuda-11.3/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11.3 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
 && ldconfig

# Download the OSS compliance bundle, install its test helper into
# /usr/local/bin, run the generator script for ${PYTHON}, then delete the bundle
# in the same layer.
RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*