# Image for distributed TensorFlow training with Horovod over Open MPI + NCCL.
# Default command runs the ResNet-50 synthetic-data benchmark from
# aws-samples/deep-learning-models under mpirun.
FROM nvidia/cuda:10.0-devel-ubuntu16.04

# TensorFlow v1.13 is coupled to CUDA10.
ENV TENSORFLOW_VERSION=1.13.1 \
    CUDNN_VERSION=7.4.2.24-1+cuda10.0 \
    NCCL_VERSION=2.4.2-1+cuda10.0

# Python 2.7 or 3.5 is supported by Ubuntu Xenial out of the box.
# Override with: docker build --build-arg python=2.7 .
ARG python=3.5
ENV PYTHON_VERSION=${python}

# OS packages, cuDNN/NCCL pinned to the versions above, and the Python runtime.
# The apt lists are removed in the same layer so they do not bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        libcudnn7=${CUDNN_VERSION} \
        libjpeg-dev \
        libnccl-dev=${NCCL_VERSION} \
        libnccl2=${NCCL_VERSION} \
        libpng-dev \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-dev \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Make the selected interpreter the default `python`.
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python

# Bootstrap pip. The unversioned get-pip.py only supports currently maintained
# Python releases, so fetch the script published for this interpreter version.
# -f makes curl fail the build on an HTTP error instead of saving an error page.
RUN curl -fsSLO https://bootstrap.pypa.io/pip/${PYTHON_VERSION}/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

# Install TensorFlow, Keras
# NOTE(review): keras and h5py are unpinned; pin them for reproducible builds.
RUN pip install --no-cache-dir tensorflow-gpu==${TENSORFLOW_VERSION} keras h5py

# Install Open MPI 4.0.0 (built from source; build tree removed in the same layer)
# NOTE(review): the tarball is not checksum-verified; consider adding a sha256 check.
RUN mkdir /tmp/openmpi && \
    cd /tmp/openmpi && \
    wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz && \
    tar zxf openmpi-4.0.0.tar.gz && \
    cd openmpi-4.0.0 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j $(nproc) all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi

# Install Horovod, temporarily using CUDA stubs
# /usr/local/cuda links to /usr/local/cuda-10.0
# The trailing ldconfig drops the stub directory from the linker cache again.
# NOTE(review): horovod is unpinned; pin a release known to work with TF 1.13.
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
    HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod && \
    ldconfig

# Create a wrapper for OpenMPI to allow running as root by default.
# The wrapper exec's the real binary so signals sent to the wrapper's PID
# (e.g. from `docker stop`) are delivered to mpirun itself.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    printf '%s\n' \
        '#!/bin/bash' \
        'exec mpirun.real --allow-run-as-root "$@"' \
        > /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun

# Configure OpenMPI to run good defaults:
#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN printf '%s\n' \
        "hwloc_base_binding_policy = none" \
        "rmaps_base_mapping_policy = slot" \
        "btl_tcp_if_exclude = lo,docker0" \
        >> /usr/local/etc/openmpi-mca-params.conf

# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf

# Install OpenSSH for MPI to communicate between containers.
# `apt-get update` is repeated here so this layer never installs from a stale
# (or already removed) package index.
RUN apt-get update && apt-get install -y --no-install-recommends \
        openssh-client \
        openssh-server \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir -p /var/run/sshd

# Allow OpenSSH to talk to containers without asking for confirmation:
# drop any existing StrictHostKeyChecking line, then append "no".
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

# Fetch the training scripts.
# NOTE(review): this clones the default branch head; pin a commit for reproducibility.
RUN mkdir /code && git clone https://github.com/aws-samples/deep-learning-models.git /code

WORKDIR "/code"

# Default: ResNet-50 benchmark on synthetic data. Exec form avoids an
# intermediate `/bin/sh -c`, so the mpirun wrapper is the container's main process.
CMD ["mpirun", \
     "python", "models/resnet/tensorflow/train_imagenet_resnet_hvd.py", \
     "--batch_size=256", \
     "--model=resnet50", \
     "--num_batches=1000", \
     "--fp16", \
     "--lr_decay_mode=poly", \
     "--synthetic"]