# AWS Neuron PyTorch training image (torch-neuronx 1.13) on Ubuntu 20.04.
# Build context must provide: changehostname.c, start_with_right_hostname.sh,
# deep_learning_container.py.
FROM ubuntu:20.04

LABEL maintainer="Amazon AI" \
      dlc_major_version="1"

# Interpreter / installer names and the source versions built below.
ARG PYTHON=python3.8
ARG PYTHON_VERSION=3.8.10
ARG OPEN_MPI_VERSION=4.0.1
ARG PIP=pip3

# Stops the build from waiting for region configuration while installing
# tzdata on Ubuntu 20. Kept as ARG so it does not leak into the runtime env.
ARG DEBIAN_FRONTEND=noninteractive

# PYTHONDONTWRITEBYTECODE: Python won't write .pyc/.pyo files on import.
# PYTHONUNBUFFERED: stdin/stdout/stderr are unbuffered, which is good for logging.
# LD_LIBRARY_PATH keeps the pre-existing value as its first entry, followed by
# the Neuron, EFA, EFA-bundled Open MPI and /usr/local library directories.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/opt/amazon/openmpi/lib64:/usr/local/lib" \
    PATH="/opt/aws/neuron/bin/:$PATH" \
    SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main \
    DGLBACKEND=pytorch

# OS packages: build toolchain, JDKs, editors and the -dev headers needed to
# compile CPython from source further down.
# TODO: Remove systemd upgrade once it is updated in base image
RUN apt-get update \
 && apt-get -y upgrade --only-upgrade systemd \
 && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    emacs \
    git \
    jq \
    libopencv-dev \
    openjdk-8-jdk-headless \
    openjdk-8-jdk \
    openjdk-8-jre \
    libglib2.0-0 \
    libgl1-mesa-glx \
    libsm6 \
    libxext6 \
    libxrender-dev \
    openjdk-11-jdk \
    software-properties-common \
    wget \
    unzip \
    vim \
    zlib1g-dev \
    openssl \
    libssl-dev \
    libsqlite3-dev \
    libgdbm-dev \
    libc6-dev \
    libbz2-dev \
    tk-dev \
    libffi-dev \
    libcap-dev \
    gnupg2 \
    gpg-agent \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# Register the Neuron apt repository and install the Neuron runtime packages.
# Repo setup, key import and install form one logical step, so they share a layer.
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list \
 && wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - \
 && apt-get update \
 && apt-get install -y \
    aws-neuronx-tools \
    aws-neuronx-collectives \
    aws-neuronx-runtime-lib \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean

# Build Open MPI from source into /home/.openmpi; sources are removed in the
# same layer so they never reach the image.
RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \
 && tar -xzf openmpi-$OPEN_MPI_VERSION.tar.gz \
 && cd openmpi-$OPEN_MPI_VERSION \
 && ./configure --prefix=/home/.openmpi \
 && make -j "$(nproc)" all install \
 && cd .. \
 && rm openmpi-$OPEN_MPI_VERSION.tar.gz \
 && rm -rf openmpi-$OPEN_MPI_VERSION

# Build CPython from source into /usr/local, expose `pip` and `python`
# aliases, then upgrade pip and setuptools.
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
 && tar -xzf Python-$PYTHON_VERSION.tgz \
 && cd Python-$PYTHON_VERSION \
 && ./configure --enable-shared --prefix=/usr/local \
 && make -j "$(nproc)" \
 && make install \
 && cd .. \
 && rm -rf Python-$PYTHON_VERSION* \
 && ln -s /usr/local/bin/pip3 /usr/bin/pip \
 && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
 && ${PIP} --no-cache-dir install --upgrade \
    pip \
    setuptools

WORKDIR /

# These ENV variables are extended here, after the Open MPI build, on purpose:
# grouping them with the first ENV section causes ompi_info to fail.
# This is only observed in CPU containers.
ENV PATH="$PATH:/home/.openmpi/bin" \
    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"

# Build-time sanity check: the step fails unless ompi_info reports the
# mpi_built_with_cuda_support value line.
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh

RUN ${PIP} install --no-cache-dir -U \
    "pyyaml>=5.4,<5.5" \
    "bokeh>=2.3,<3" \
    "opencv-python>=4.6,<5" \
    "awscli<2" \
    scipy \
    click \
    "cryptography>3.2" \
    "sagemaker>=2,<3" \
    "sagemaker-pytorch-training<3" \
    psutil==5.6.7 \
    dataset \
    transformers \
    Pillow

# Mirror the CA bundle to the /etc/pki path as well.
RUN mkdir -p /etc/pki/tls/certs \
 && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

# Neuron PyTorch stack from the Neuron pip repository. --no-cache-dir keeps
# the downloaded wheels out of the image layer.
RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
 && ${PIP} install --no-cache-dir --force-reinstall torch-neuronx==1.13.0.1.5.0 --extra-index-url https://pip.repos.neuron.amazonaws.com \
 && ${PIP} install --no-cache-dir --force-reinstall neuronx-cc==2.4.0.21 --extra-index-url https://pip.repos.neuron.amazonaws.com

# Re-pin packages whose constraints conflict between the stacks above:
#   attrs:    neuronx-cc requires >=19.2.0; sagemaker 2.103.0 requires <22,>=20.3.0
#   protobuf: neuronx-cc requires <4; sagemaker training requires <3.20,>=3.9.2
#   docutils: awscli 1.25.47 requires <0.17,>=0.10
#   rsa:      awscli 1.27.33 requires <4.8,>=3.1.2 (rsa 4.9 would otherwise be installed)
#   python-etcd: etcd for kubernetes installation
RUN ${PIP} install --no-cache-dir -U \
    'attrs>=20.3.0,<22.0.0' \
    'protobuf>=2.9.2,<3.20' \
    'docutils>=0.10,<0.17' \
    'rsa<4.8,>=3.1.2' \
    "python-etcd"

# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
RUN ${PIP} install --no-cache-dir -U \
    "bokeh>=2.3,<3" \
    "imageio>=2.9,<3" \
    "opencv-python>=4.3,<5" \
    "plotly>=5.1,<6" \
    "seaborn>=0.11,<1" \
    "numba<0.54" \
    "shap>=0.39,<1" \
    "numpy<=1.20.0,>=1.13.3" \
    "pyyaml>=5.4,<5.5"

# EFA installer. It runs apt-get itself, so the package index is refreshed in
# the SAME layer; the downloaded archive, signature, key, extracted installer
# and the apt lists are also removed in this layer so they do not persist.
RUN apt-get update \
 && cd $HOME \
 && curl -fO https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
 && wget https://efa-installer.amazonaws.com/aws-efa-installer.key \
 && gpg --import aws-efa-installer.key \
 && cat aws-efa-installer.key | gpg --fingerprint \
 && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig \
 && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
 && tar -xf aws-efa-installer-latest.tar.gz \
 && cd aws-efa-installer \
 && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
 && cd $HOME \
 && rm -rf aws-efa-installer \
    aws-efa-installer-latest.tar.gz \
    aws-efa-installer-latest.tar.gz.sig \
    aws-efa-installer.key \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean

# Install some common packages used by training scripts.
# torchvision is needed for MLP. It depends on torch, and torch / torch-neuronx
# is already installed, so install it with --no-deps.
RUN ${PIP} install --no-cache-dir --no-deps -U \
    torchvision==0.14.1

# Needed for running bert training scripts
RUN ${PIP} install --no-cache-dir -U \
    graphviz \
    tensorboard==2.6 \
    accelerate \
    "sentencepiece!=0.1.92" \
    h5py \
    requests

COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
 && chmod +x /usr/local/bin/deep_learning_container.py

# Download and run the OSS compliance tooling, then delete it in the same
# layer. curl -f makes an HTTP error fail the build instead of saving the
# error body as the zip.
RUN HOME_DIR=/root \
 && curl -f -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance* \
 && rm -rf /tmp/tmp*

# curl -f: without it an HTTP error page would be written to /license.txt
# and the build would still succeed.
RUN curl -f -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt

# Starts framework
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]