FROM nvidia/cuda:11.3.1-base-ubuntu20.04 AS base_image

ENV DEBIAN_FRONTEND=noninteractive \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"

RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get autoremove -y \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

FROM base_image

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ARG PYTHON=python3
ARG PYTHON_VERSION=3.8.13
ARG PYTHON_SHORT_VERSION=3.8
ARG CUBLAS_VERSION=11.5.1.109
ARG OPEN_MPI_PATH=/opt/amazon/openmpi
ARG EFA_PATH=/opt/amazon/efa
ARG CUDA_HOME=/usr/local/cuda
ARG MAMBA_VERSION=4.12.0-2

# This arg is required to stop the docker build from waiting for region configuration
# while installing tzdata from Ubuntu 20.
ARG DEBIAN_FRONTEND=noninteractive

# Python won't try to write .pyc or .pyo files on the import of source modules.
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}"
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
ENV PATH=/opt/conda/bin:$PATH
ENV TORCH_CUDA_ARCH_LIST="3.7 5.0 7.0+PTX 8.0"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CUDNN_VERSION=8.2.0.53
ENV NCCL_VERSION=2.10.3
ENV HOROVOD_VERSION=0.25.0
ENV EFA_VERSION=1.15.2
ENV OMPI_VERSION=4.1.1
ENV BRANCH_OFI=1.2.0-aws
# TODO: Add DGL environment variables back when DGL is re-enabled
# ENV DGLBACKEND=pytorch
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
ENV MANUAL_BUILD=0
ENV RDMAV_FORK_SAFE=1
ENV DLC_CONTAINER_TYPE=training

# There is currently no PT binary pipeline for diy/ec2. Temporarily using the inference
# branch (PyTorch + telemetry feature) with different build environments for the training
# and inference DLCs.
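# The wheel locations below are build args, so a different torch, torchvision, or S3-plugin
# wheel can be substituted at build time without editing this file, e.g.
# (hypothetical override): docker build --build-arg PT_TRAINING_URL=<custom wheel URL> .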
ARG PT_TRAINING_URL=https://pytorch-ei-binaries.s3.us-west-2.amazonaws.com/r1.10.2_inference/20220211-013556/528b6ca3747ec951396383f2a70d1354283ad29f/gpu/torch-1.10.2%2Bcu113-cp38-cp38-manylinux1_x86_64.whl
ARG PT_TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/1.10.2/torchvision-0.11.3-cp38-cp38-linux_x86_64.whl
ARG PT_S3_WHL_GPU=https://aws-s3-plugin.s3.us-west-2.amazonaws.com/binaries/0.0.1/1c3e69e/awsio-0.0.1-cp38-cp38-manylinux1_x86_64.whl

## NVIDIA updated and rotated the signing keys used by the apt, dnf/yum, and zypper package managers beginning April 27, 2022.
## This snippet is taken from the NVIDIA forums (https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771)
## to ensure that builds do not fail.
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

RUN apt-get update \
 # TODO: Remove systemd upgrade once it is updated in the base image
 && apt-get -y upgrade --only-upgrade systemd \
 && apt-get install -y --allow-change-held-packages --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    cuda-command-line-tools-11-3 \
    cuda-cudart-11-3 \
    cuda-libraries-11-3 \
    curl \
    emacs \
    git \
    jq \
    libcublas-11-3=${CUBLAS_VERSION}-1 \
    libcublas-dev-11-3=${CUBLAS_VERSION}-1 \
    libcudnn8=${CUDNN_VERSION}-1+cuda11.3 \
    libcufft-dev-11-3 \
    libcurand-dev-11-3 \
    libcusolver-dev-11-3 \
    libcusparse-dev-11-3 \
    libglib2.0-0 \
    libgl1-mesa-glx \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    libibverbs-dev \
    libhwloc-dev \
    libnuma1 \
    libnuma-dev \
    libssl1.1 \
    libtool \
    hwloc \
    openssl \
    python3-dev \
    vim \
    wget \
    unzip \
    zlib1g-dev \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# Build NCCL from source
RUN cd /tmp \
 && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
 && cd nccl \
 && make -j64 src.build BUILDDIR=/usr/local \
 && rm -rf /tmp/nccl

# Install EFA without the bundled AWS Open MPI (Open MPI is built from source below)
RUN mkdir /tmp/efa \
 && cd /tmp/efa \
 && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
 && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
 && cd aws-efa-installer \
 && apt-get update \
 && ./efa_installer.sh -y --skip-kmod -g \
 && rm -rf $OPEN_MPI_PATH \
 && rm -rf /tmp/efa \
 && rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# Install OpenMPI without libfabric support
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
 && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
 && cd openmpi-${OMPI_VERSION} \
 && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH \
 && make -j $(nproc) all \
 && make install \
 && ldconfig \
 && cd / \
 && rm -rf /tmp/openmpi

ENV PATH="$OPEN_MPI_PATH/bin:$PATH"
ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH

# Check whether Open MPI reports CUDA support, then install Mambaforge and base conda packages
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \
 && curl -L -o ~/mambaforge.sh https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-x86_64.sh \
 && chmod +x ~/mambaforge.sh \
 && ~/mambaforge.sh -b -p /opt/conda \
 && rm ~/mambaforge.sh \
 && /opt/conda/bin/conda install -c conda-forge \
    python=$PYTHON_VERSION \
    mkl \
    mkl-include \
    cython \
    typing \
    future \
    "pyopenssl>=17.5.0" \
    # Below 2 are included in the miniconda base, but not mamba, so they need to be installed
    conda-content-trust \
    charset-normalizer \
 # TODO: add dgl back once the ecs test for dgl gpu gets fixed
 # && conda install -y -c dglteam dgl-cuda11.3=0.7.2 \
 && /opt/conda/bin/conda clean -ya

# Conda installs links for both libtinfo.so.6 and libtinfo.so.6.2,
# which causes a "/opt/conda/lib/libtinfo.so.6: no version information available" warning.
# Removing the link for libtinfo.so.6. This change is needed only for ubuntu 20.04-conda,
# and can be reverted once conda fixes the issue.
RUN rm -rf /opt/conda/lib/libtinfo.so.6

RUN /opt/conda/bin/conda config --set ssl_verify False \
 && pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
 && ln -s /opt/conda/bin/pip /usr/local/bin/pip3

RUN conda install -c pytorch magma-cuda113==2.5.2 \
 && conda install -y -c conda-forge \
    h5py \
    requests \
    libgcc \
 && conda clean -ya

WORKDIR /root

# Uninstall torch and torchvision before installing the custom versions from an S3 bucket.
# Install urllib3>=1.25.9 to satisfy awscli/boto3/botocore requirements while fixing CVE-2020-26137.
RUN pip install --no-cache-dir -U \
    "opencv-python>=4.6,<5" \
    numpy==1.22.2 \
    ipython==8.0.1 \
    "pyyaml>=5.4,<5.5" \
    "awscli<2" \
    "urllib3>=1.25.9" \
    boto3 \
    packaging \
    psutil \
    "Pillow>=9.0.0" \
    scipy \
    pybind11 \
    click \
    mpi4py==3.0.3 \
    cmake==3.18.2.post1 \
    torchnet \
    "cryptography>3.2" \
 && pip uninstall -y torch \
 && pip install --no-cache-dir -U ${PT_TRAINING_URL} \
 && pip uninstall -y torchvision \
 && pip install --no-deps --no-cache-dir -U ${PT_TORCHVISION_URL}

# Install NVIDIA Apex
## Pin the apex commit requested by the sm-model-parallel team
RUN git clone https://github.com/NVIDIA/apex \
 && cd apex \
 && git checkout aa756ce \
 && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

# Configure Open MPI and NCCL parameters
RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \
 && echo '#!/bin/bash' > $OPEN_MPI_PATH/bin/mpirun \
 && echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> $OPEN_MPI_PATH/bin/mpirun \
 && chmod a+x $OPEN_MPI_PATH/bin/mpirun \
 && echo "hwloc_base_binding_policy = none" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf \
 && echo "rmaps_base_mapping_policy = slot" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf \
 && echo NCCL_DEBUG=INFO >> /etc/nccl.conf \
 && echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf

# Install the AWS OFI NCCL plug-in
RUN apt-get update && apt-get install -y autoconf
RUN mkdir /tmp/efa-ofi-nccl \
 && cd /tmp/efa-ofi-nccl \
 && git clone https://github.com/aws/aws-ofi-nccl.git -b v${BRANCH_OFI} \
 && cd aws-ofi-nccl \
 && ./autogen.sh \
 && ./configure --with-libfabric=/opt/amazon/efa \
    --with-mpi=/opt/amazon/openmpi \
    --with-cuda=/usr/local/cuda \
    --with-nccl=/usr/local --prefix=/usr/local \
 && make \
 && make install \
 && rm -rf /tmp/efa-ofi-nccl \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# Install OpenSSH for MPI to communicate between containers, and allow OpenSSH to talk
# to containers without asking for confirmation
RUN apt-get update \
 && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends openssh-client openssh-server \
 && mkdir -p /var/run/sshd \
 && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \
 && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean
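# The passwordless-SSH setup above and below is what allows mpirun to launch ranks on peer
# containers without interactive prompts. A minimal multi-node launch sketch, assuming a
# hypothetical hostfile and a user-provided train.py that are not part of this image:
#
#   mpirun -np 16 --hostfile /job/hostfile \
#       -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH \
#       python train.py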
# Configure OpenSSH so that nodes can communicate with each other
RUN mkdir -p /var/run/sshd \
 && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd

RUN rm -rf /root/.ssh/ \
 && mkdir -p /root/.ssh/ \
 && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

# Install Horovod
RUN pip uninstall -y horovod \
 && ldconfig /usr/local/cuda-11.3/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11.3 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
 && ldconfig

# Install PT S3 plugin
RUN pip install --no-cache-dir -U ${PT_S3_WHL_GPU}

RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

WORKDIR /

COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
RUN chmod +x /usr/local/bin/deep_learning_container.py

RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance* \
 && rm -rf /tmp/tmp*

RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.10/license.txt

# Starts framework
CMD ["/bin/bash"]
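# A usage sketch for building and smoke-testing this image. The tag name and the presence
# of the NVIDIA Container Toolkit on the host (for --gpus) are assumptions, not part of
# this Dockerfile; the build context must contain the deep_learning_container.py script
# COPY'd above.
#
#   docker build -t pytorch-training:1.10.2-gpu-py38-cu113 .
#   docker run --rm --gpus all pytorch-training:1.10.2-gpu-py38-cu113 \
#       python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
#   docker run --rm pytorch-training:1.10.2-gpu-py38-cu113 horovodrun --check-build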