FROM nvidia/cuda:latest ENV USER root # ------------------------------------------------------------------------------------- # install needed software - # openssh # mpi # awscli # supervisor # ------------------------------------------------------------------------------------- RUN apt update RUN apt upgrade -y RUN DEBIAN_FRONTEND=noninteractive apt install -y iproute2 openssh-server openssh-client python python-pip python3 python3-dev python3-pip build-essential gfortran wget curl libfftw3-dev git libcudnn7 libcudnn7-dev wget libjemalloc-dev pkg-config zip unzip RUN pip2 install supervisor awscli ENV DEBIAN_FRONTEND noninteractive ENV NOTVISIBLE "in users profile" ##################################################### ## SSH SETUP RUN mkdir -p /var/run/sshd RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd RUN echo "export VISIBLE=now" >> /etc/profile RUN echo "${USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers ENV SSHDIR /root/.ssh RUN mkdir -p ${SSHDIR} RUN touch ${SSHDIR}/sshd_config RUN ssh-keygen -t rsa -f ${SSHDIR}/ssh_host_rsa_key -N '' RUN cp ${SSHDIR}/ssh_host_rsa_key.pub ${SSHDIR}/authorized_keys RUN cp ${SSHDIR}/ssh_host_rsa_key ${SSHDIR}/id_rsa RUN echo " IdentityFile ${SSHDIR}/id_rsa" >> /etc/ssh/ssh_config RUN echo "Host *" >> /etc/ssh/ssh_config && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config RUN chmod -R 600 ${SSHDIR}/* && \ chown -R ${USER}:${USER} ${SSHDIR}/ # check if ssh agent is running or not, if not, run RUN eval `ssh-agent -s` && ssh-add ${SSHDIR}/id_rsa ################################################## ## S3 OPTIMIZATION RUN aws configure set default.s3.max_concurrent_requests 30 RUN aws configure set default.s3.max_queue_size 10000 RUN aws configure set default.s3.multipart_threshold 64MB RUN aws configure set default.s3.multipart_chunksize 16MB RUN aws configure set default.s3.max_bandwidth 4096MB/s RUN aws configure set default.s3.addressing_style path ################################################## ## CUDA MPI RUN wget -O /tmp/openmpi.tar.gz https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz && \ tar -xvf /tmp/openmpi.tar.gz -C /tmp RUN cd /tmp/openmpi* && ./configure --prefix=/opt/openmpi --with-cuda --enable-mpirun-prefix-by-default && \ make -j $(nproc) && make install RUN echo "export PATH=$PATH:/opt/openmpi/bin" >> /etc/bash.bashrc RUN echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/openmpi/lib:/usr/local/cuda/include:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" >> /etc/bash.bashrc ################################################### ## TENSORFLOW INSTALL RUN pip3 install numpy six wheel mock RUN pip3 install keras_applications==1.0.6 --no-deps RUN pip3 install keras_preprocessing==1.0.5 --no-deps RUN mkdir -p /usr/lib/x86_64-linux-gnu/nccl/lib RUN mkdir -p /usr/lib/x86_64-linux-gnu/nccl/include RUN cp /usr/lib/x86_64-linux-gnu/libnccl* /usr/lib/x86_64-linux-gnu/nccl/ RUN cp /usr/include/nccl.h /usr/lib/x86_64-linux-gnu/nccl/include RUN wget -O /tmp/bazel.sh "https://github.com/bazelbuild/bazel/releases/download/0.18.0/bazel-0.18.0-installer-linux-x86_64.sh" RUN chmod +x /tmp/bazel.sh RUN bash -c "/tmp/bazel.sh" RUN git clone https://github.com/tensorflow/tensorflow /root/tensorflow ADD conf/tensorflow_build.sh /root/ RUN chmod +x /root/tensorflow_build.sh RUN /root/tensorflow_build.sh ADD conf/horovod_build.sh /root/ RUN chmod +x /root/horovod_build.sh RUN /root/horovod_build.sh ################################################### ## IMAGENET DATASET RUN git clone https://github.com/aws-samples/deep-learning-models.git /root/deep-learning-models ################################################### ## supervisor container startup ADD conf/supervisord/supervisord.conf /etc/supervisor/supervisord.conf ADD supervised-scripts/mpi-run.sh supervised-scripts/mpi-run.sh RUN chmod 755 supervised-scripts/mpi-run.sh EXPOSE 22 RUN export PATH="$PATH:/opt/openmpi/bin" ADD batch-runtime-scripts/entry-point.sh batch-runtime-scripts/entry-point.sh RUN chmod 0755 batch-runtime-scripts/entry-point.sh CMD /batch-runtime-scripts/entry-point.sh