# SageMaker PyTorch training image built on the AWS Neuron SDK (torch-neuronx).
# Build context must provide: changehostname.c, start_with_right_hostname.sh
# and deep_learning_container.py.
FROM ubuntu:20.04

LABEL maintainer="Amazon AI" \
      dlc_major_version="1"

ARG PYTHON=python3.8
ARG PYTHON_VERSION=3.8.16

# Neuron SDK components version numbers
ARG NEURONX_RUNTIME_LIB_VERSION=2.13.*
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.13.*
ARG NEURONX_TOOLS_VERSION=2.10.*
ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.7.*
ARG NEURONX_CC_VERSION=2.6.*
# NOTE: the Open MPI download URL below hard-codes the v4.0 series directory,
# so only 4.0.x values work here without also editing that URL.
ARG OPEN_MPI_VERSION=4.0.7
ARG PIP=pip3

# This arg is required to stop docker build waiting for region configuration
# while installing tzdata on Ubuntu 20. It is an ARG (not ENV) so it does not
# leak into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files on the
# import of source modules.
# PYTHONUNBUFFERED: force stdin, stdout and stderr to be totally unbuffered.
# Good for logging.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Library search path for Neuron, EFA, the EFA-provided Open MPI and the
# source-built Python (configured with --enable-shared into /usr/local).
# The ${VAR:+...} form keeps any value inherited from the base image but avoids
# a leading empty entry when the variable is unset; an empty entry would make
# the dynamic loader search the current working directory.
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/opt/aws/neuron/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/opt/amazon/openmpi/lib64:/usr/local/lib"
ENV PATH="/opt/aws/neuron/bin/:${PATH}"
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
ENV DGLBACKEND=pytorch

# OS-level build tools, libraries and utilities. Update, install and cleanup
# happen in one layer so the apt lists never persist in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        emacs \
        git \
        gnupg2 \
        gpg-agent \
        jq \
        libbz2-dev \
        libc6-dev \
        libcap-dev \
        libffi-dev \
        libgdbm-dev \
        libgl1-mesa-glx \
        libglib2.0-0 \
        libopencv-dev \
        libsm6 \
        libsqlite3-dev \
        libssl-dev \
        libxext6 \
        libxrender-dev \
        openjdk-11-jdk \
        openjdk-8-jdk \
        openjdk-8-jdk-headless \
        openjdk-8-jre \
        openssl \
        software-properties-common \
        tk-dev \
        unzip \
        vim \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Register the Neuron apt repository and its signing key. The key is written to
# a file first (instead of piping wget into apt-key) so that a failed download
# fails the build; /bin/sh without pipefail would otherwise mask it.
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list \
    && wget -qO /tmp/neuron-apt-key.pub https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \
    && apt-key add /tmp/neuron-apt-key.pub \
    && rm -f /tmp/neuron-apt-key.pub

# Neuron tools, collectives and runtime library. The version specs contain
# '*' wildcards meant for apt, so they are quoted to keep the shell from
# glob-expanding them.
RUN apt-get update \
    && apt-get install -y \
        "aws-neuronx-tools=${NEURONX_TOOLS_VERSION}" \
        "aws-neuronx-collectives=${NEURONX_COLLECTIVES_LIB_VERSION}" \
        "aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION}" \
    && rm -rf /var/lib/apt/lists/* \
    && rm -rf /tmp/tmp* \
    && apt-get clean

# Build Open MPI from source into /home/.openmpi; sources are removed in the
# same layer.
RUN wget "https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-${OPEN_MPI_VERSION}.tar.gz" \
    && tar -xzf "openmpi-${OPEN_MPI_VERSION}.tar.gz" \
    && cd "openmpi-${OPEN_MPI_VERSION}" \
    && ./configure --prefix=/home/.openmpi \
    && make -j "$(nproc)" all \
    && make install \
    && cd .. \
    && rm -rf "openmpi-${OPEN_MPI_VERSION}" "openmpi-${OPEN_MPI_VERSION}.tar.gz"

# Install Python from source, expose it as `python` / `pip`, then upgrade the
# packaging tools. Source tree and tarball are removed in the same layer.
RUN wget -q "https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz" \
    && tar -xzf "Python-${PYTHON_VERSION}.tgz" \
    && cd "Python-${PYTHON_VERSION}" \
    && ./configure --enable-shared --prefix=/usr/local \
    && make -j "$(nproc)" \
    && make install \
    && cd .. \
    && rm -rf "Python-${PYTHON_VERSION}" "Python-${PYTHON_VERSION}.tgz" \
    && ln -s /usr/local/bin/pip3 /usr/bin/pip \
    && ln -s "/usr/local/bin/${PYTHON}" /usr/local/bin/python \
    && ${PIP} --no-cache-dir install --upgrade \
        pip \
        setuptools

WORKDIR /

# The ENV variables declared below are changed in the previous section.
# Grouping these ENV variables in the first section causes ompi_info to fail.
# This is only observed in CPU containers.
ENV PATH="${PATH}:/home/.openmpi/bin" \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/home/.openmpi/lib/"

# Sanity check: the build fails here if ompi_info cannot run or does not report
# the mpi_built_with_cuda_support value line.
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh

# Base Python packages for SageMaker training
RUN ${PIP} install --no-cache-dir -U \
        "pyyaml>=5.4,<5.5" \
        "bokeh>=2.3,<3" \
        "opencv-python>=4.6,<5" \
        "awscli<2" \
        scipy \
        click \
        "cryptography>3.2" \
        "sagemaker>=2,<3" \
        "sagemaker-pytorch-training<3" \
        psutil==5.6.7 \
        dataset \
        transformers \
        Pillow

# Make the CA bundle available at the /etc/pki path as well
RUN mkdir -p /etc/pki/tls/certs \
    && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

# Neuron framework and compiler from the Neuron pip repository. The wildcard
# version specs are quoted so the shell passes them to pip untouched.
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
    && ${PIP} install --force-reinstall "torch-neuronx==${NEURONX_FRAMEWORK_VERSION}" --extra-index-url https://pip.repos.neuron.amazonaws.com \
    && ${PIP} install --force-reinstall "neuronx-cc==${NEURONX_CC_VERSION}" --extra-index-url https://pip.repos.neuron.amazonaws.com

# attrs, neuronx-cc required: >=19.2.0, sagemaker 2.103.0 <22,>=20.3.0
# protobuf neuronx-cc<4 , sagemaker training <3.20,>=3.9.2
# awscli 1.25.47 has requirement docutils<0.17,>=0.10
# etcd for kubernetes installation
# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
RUN ${PIP} install --no-cache-dir -U \
        "attrs>=20.3.0,<22.0.0" \
        "protobuf>=3.9.2,<3.20" \
        "docutils>=0.10,<0.17" \
        "rsa<4.8,>=3.1.2" \
        "python-etcd" \
        "urllib3<1.27"

# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import).
# bokeh / opencv-python / pyyaml repeat earlier pins on purpose: this step runs
# after the --force-reinstall of the Neuron packages and re-applies the ranges.
RUN ${PIP} install --no-cache-dir -U \
        "bokeh>=2.3,<3" \
        "imageio>=2.9,<3" \
        "opencv-python>=4.3,<5" \
        "plotly>=5.1,<6" \
        "seaborn>=0.11,<1" \
        "numba<0.54" \
        "shap>=0.39,<1" \
        "numpy<=1.20.0,>=1.13.3" \
        "pyyaml>=5.4,<5.5"

# EFA installer does apt-get, so apt-get update must run before it. Update,
# install and cleanup share one layer: a cleanup in a later layer would not
# shrink the image, and a standalone `apt-get update` layer can go stale in the
# build cache. The downloaded installer, key and signature are removed too.
RUN apt-get update \
    && cd "$HOME" \
    && curl -fO https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
    && wget https://efa-installer.amazonaws.com/aws-efa-installer.key \
    && gpg --import aws-efa-installer.key \
    && cat aws-efa-installer.key | gpg --fingerprint \
    && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig \
    && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
    && tar -xf aws-efa-installer-latest.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && cd "$HOME" \
    && rm -rf \
        aws-efa-installer \
        aws-efa-installer-latest.tar.gz \
        aws-efa-installer-latest.tar.gz.sig \
        aws-efa-installer.key \
    && rm -rf /var/lib/apt/lists/* \
    && rm -rf /tmp/tmp* \
    && apt-get clean

# Install some common packages used by training scripts.
# torchvision is needed for MLP. Since it depends on torch, and torch-neuronx /
# torch is already installed, install it with --no-deps.
RUN ${PIP} install --no-cache-dir --no-deps -U \
        "torchvision==0.14.*"

# Needed for running bert training scripts
RUN ${PIP} install --no-cache-dir -U \
        graphviz \
        tensorboard==2.6 \
        accelerate \
        "sentencepiece!=0.1.92" \
        h5py \
        requests

COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
    && chmod +x /usr/local/bin/deep_learning_container.py

# Fetch and run the OSS compliance tooling, keeping only the
# testOSSCompliance helper. curl -f makes an HTTP error fail the build instead
# of saving the error body as the archive.
RUN HOME_DIR=/root \
    && curl -f -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
    && chmod +x /usr/local/bin/testOSSCompliance \
    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
    && rm -rf ${HOME_DIR}/oss_compliance* \
    && rm -rf /tmp/tmp*

# curl -f: fail the build rather than storing an HTTP error page as the license
RUN curl -f -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt

# Starts framework. `bash -m` enables job control (monitor mode); the script is
# not in the working directory (/), so bash resolves it through PATH
# (/usr/local/bin).
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]