# Dockerfile: SageMaker-flavored PyTorch 1.10.2 GPU training image.
# Layers SageMaker-specific tooling (smdebug, SM distributed data/model
# parallel, Horovod, Apex, OSS-compliance artifacts) on top of the EC2 GPU
# base image.
# Expecting base image to be the image built by ./Dockerfile.ec2.gpu
ARG BASE_IMAGE=""

FROM $BASE_IMAGE

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

ARG PYTHON=python3
ARG PYTHON_VERSION=3.8.13
ARG PYTHON_SHORT_VERSION=3.8
ARG RMM_VERSION=0.15.0
# The smdebug pipeline relies for following format to perform string replace and trigger DLC pipeline for validating
# the nightly builds. Therefore, while updating the smdebug version, please ensure that the format is not disturbed.
ARG SMDEBUG_VERSION=1.0.22

# NOTE(review): HOROVOD_VERSION is used in the Horovod RUN step below but is
# never declared in this file — presumably it is provided as an ENV/ARG by the
# base image or an earlier build stage. Confirm; otherwise the expansion is
# empty and "pip install horovod==" fails.

# Entry point module resolved by the sagemaker-pytorch-training toolkit.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main

# swap the pytorch training wheel with additional smdebug and smmdp features
# (pinned pre-built wheels; URLs encode exact build date/commit — do not edit)
ARG PT_TRAINING_URL=https://aws-pytorch-unified-cicd-binaries.s3.us-west-2.amazonaws.com/r1.10.2_sm/20220912-052006/8463a9e51a7938004fece4983b367a2cfa876237/torch-1.10.2%2Bcu113-cp38-cp38-linux_x86_64.whl
ARG SMD_MODEL_PARALLEL_URL=https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/pytorch-1.10.2/build-artifacts/2022-09-17-02-28/smdistributed_modelparallel-1.11.2-cp38-cp38-linux_x86_64.whl
ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.10.2/cu113/2022-09-15/smdistributed_dataparallel-1.4.3-cp38-cp38-linux_x86_64.whl

# Install scikit-learn and pandas
RUN conda install -y -c conda-forge \
    scikit-learn \
    pandas

WORKDIR /

# Install libboost from source. This package is needed for smdataparallel functionality [for networking asynchronous IO].
# NOTE(review): the "|| true" on the b2 step deliberately ignores the build's
# exit status (b2 can return non-zero on partial-target warnings), so a real
# boost build failure would go unnoticed here — intentional, but verify.
RUN wget https://sourceforge.net/projects/boost/files/boost/1.73.0/boost_1_73_0.tar.gz/download -O boost_1_73_0.tar.gz \
 && tar -xzf boost_1_73_0.tar.gz \
 && cd boost_1_73_0 \
 && ./bootstrap.sh \
 && ./b2 threading=multi --prefix=/opt/conda -j 64 cxxflags=-fPIC cflags=-fPIC install || true \
 && cd .. \
 && rm -rf boost_1_73_0.tar.gz \
 && rm -rf boost_1_73_0 \
 && cd /opt/conda/include/boost

WORKDIR /opt/pytorch

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh

WORKDIR /root

# SageMaker Python-side tooling. The smdebug pin is commented out here because
# smdebug is built from source in the next step (Docker strips comment lines
# inside RUN continuations, so the "# smdebug==..." line is inert).
RUN pip install --no-cache-dir -U \
    # smdebug==${SMDEBUG_VERSION} \
    smclarify \
    "sagemaker>=2,<3" \
    sagemaker-experiments==0.* \
    "sagemaker-pytorch-training>=2.5.1,<3"

# Install smdebug from source
RUN cd /tmp \
 && git clone -b ${SMDEBUG_VERSION} https://github.com/awslabs/sagemaker-debugger \
 && cd sagemaker-debugger \
 && python setup.py install \
 && rm -rf /tmp/*

# Install extra packages
# numba 0.54 only works with numpy>=1.20. See https://github.com/numba/numba/issues/7339
# After the extras, the stock torch/torchvision are swapped for the pinned
# SageMaker torch wheel (PT_TRAINING_URL) and a matching no-deps torchvision.
RUN pip install --no-cache-dir -U \
    "bokeh>=2.3,<3" \
    "imageio>=2.9,<3" \
    "opencv-python>=4.6,<5" \
    "plotly>=5.1,<6" \
    "seaborn>=0.11,<1" \
    "numba<0.54" \
    "shap>=0.39,<1" \
 && pip uninstall -y torch \
 && pip install --no-cache-dir -U ${PT_TRAINING_URL} \
 && pip uninstall -y torchvision \
 && pip install --no-deps --no-cache-dir -U torchvision==0.11.3

# Install Horovod
# CUDA stubs are put on the loader path only for the duration of the build
# (first ldconfig), then the cache is reset afterwards (second ldconfig).
RUN pip uninstall -y horovod \
 && ldconfig /usr/local/cuda-11.3/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11.3 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
 && ldconfig

# Install RAPIDSMemoryManager.
# Requires cmake>=3.14.
RUN wget -nv https://github.com/rapidsai/rmm/archive/v${RMM_VERSION}.tar.gz \
 && tar -xvf v${RMM_VERSION}.tar.gz \
 && cd rmm-${RMM_VERSION} \
 && INSTALL_PREFIX=/usr/local ./build.sh librmm \
 && cd .. \
 && rm -rf v${RMM_VERSION}.tar* \
 && rm -rf rmm-${RMM_VERSION}

# Install Nvidia Apex
## Pin apex commit requested by sm-model-parallel team
RUN rm -rf apex && \
    git clone https://github.com/NVIDIA/apex && \
    cd apex && \
    git checkout aa756ce && \
    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

# Install SM Distributed Modelparallel binary
RUN pip install --no-cache-dir -U ${SMD_MODEL_PARALLEL_URL}

# Install SM Distributed DataParallel binary
RUN SMDATAPARALLEL_PT=1 pip install --no-cache-dir ${SMDATAPARALLEL_BINARY}

# Expose smddp's bundled native libraries to the dynamic loader at runtime.
ENV LD_LIBRARY_PATH="/opt/conda/lib/python${PYTHON_SHORT_VERSION}/site-packages/smdistributed/dataparallel/lib:$LD_LIBRARY_PATH"

WORKDIR /

RUN chmod +x /usr/local/bin/start_with_right_hostname.sh

# Fetch, run, and clean up the OSS license-compliance tooling; only the
# testOSSCompliance binary is kept in the final image.
RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance* \
 && rm -rf /tmp/tmp*

# "bash -m" enables job control; start_with_right_hostname.sh is resolved via
# PATH (it was installed to /usr/local/bin above) and execs the container cmd.
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]