# https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# Refer to the page above to find and pull the latest PyTorch image.

# Base image: AWS Deep Learning Containers PyTorch training image, pulled from
# the ECR registry in region us-west-2. The tag selects PyTorch 1.11.0, GPU,
# Python 3.8 (py38), CUDA 11.3 (cu113), Ubuntu 20.04, SageMaker variant.
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.11.0-gpu-py38-cu113-ubuntu20.04-sagemaker


########
###   :::Update NCCL:::
###   This is a temporary workaround to
###   upgrade NCCL
########

# Remove the older NCCL: delete every path under /usr/local that contains
# "nccl", together with /usr/local/obj. -v lists each removed path in the build log.
RUN rm -rvf $(find /usr/local | grep nccl) /usr/local/obj

# NCCL release to build; the upstream git tag checked out is v<NCCL_VERSION>-1.
ENV NCCL_VERSION=2.12.12

# Install Newer NCCL
# Build from source with the output directory set to /usr/local.
# Only the tagged commit is needed, so a shallow clone (--depth 1) avoids
# downloading the full repository history. The source tree is deleted in the
# same layer so it never ends up in the image.
RUN git clone --depth 1 -b v${NCCL_VERSION}-1 https://github.com/NVIDIA/nccl.git /tmp/nccl \
 && make -C /tmp/nccl -j "$(nproc)" src.build BUILDDIR=/usr/local \
 && rm -rf /tmp/nccl

# Reinstall AWS OFI NCCL
# The plugin is rebuilt from the v<BRANCH_OFI> tag and configured against
# libfabric in /opt/amazon/efa, Open MPI in /opt/amazon/openmpi, CUDA in
# /usr/local/cuda and the NCCL installed under /usr/local.
ENV BRANCH_OFI=1.4.0-aws

RUN mkdir /tmp/efa-ofi-nccl \
 && cd /tmp/efa-ofi-nccl \
 && git clone https://github.com/aws/aws-ofi-nccl.git -b v${BRANCH_OFI} \
 && cd aws-ofi-nccl \
 && ./autogen.sh \
 && ./configure \
      --with-libfabric=/opt/amazon/efa \
      --with-mpi=/opt/amazon/openmpi \
      --with-cuda=/usr/local/cuda \
      --with-nccl=/usr/local \
      --prefix=/usr/local \
 && make \
 && make install \
 && rm -rf /tmp/efa-ofi-nccl \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

########
###   :::End of NCCL update:::
########

# Image metadata: maintainer and DLC major version.
LABEL maintainer="Amazon AI" \
      dlc_major_version="1"

# Version args - overwritten by args specified in buildspec
# TRANSFORMERS_VERSION: transformers release; used for the pip install by
#   version and to build HF_TRANSFORMERS_URL below.
ARG TRANSFORMERS_VERSION=4.21.1
# DATASETS_VERSION: datasets release installed with pip.
ARG DATASETS_VERSION=1.18.4
# PYTHON: interpreter name invoked by later RUN steps.
ARG PYTHON=python3

# PT_BUCKET: base URL of the S3 bucket that hosts the prebuilt wheels.
ARG PT_BUCKET=https://sm-training-comp-pytorch-binaries.s3.us-west-2.amazonaws.com
# PT_BINARY_PATH: prefix inside the bucket for one specific set of PyTorch-family wheels.
ARG PT_BINARY_PATH=${PT_BUCKET}/bdd09cad-152b-42bd-993c-16e49d7af027/20220819-180620/54da28ad4c33c2f259969f74364149ecc58672e0
# Wheel URLs (cp38, linux_x86_64) for torch, torch_xla, torchvision and torchaudio.
# "%2B" is the URL-encoded "+" of the local version suffix.
ARG PT_URL=${PT_BINARY_PATH}/torch-1.11.0%2Bcu113-cp38-cp38-linux_x86_64.whl
ARG PT_XLA_URL=${PT_BINARY_PATH}/torch_xla-1.11.0-cp38-cp38-linux_x86_64.whl
ARG TORCHVISION_URL=${PT_BINARY_PATH}/torchvision-0.12.0a0%2B9b5a3fe-cp38-cp38-linux_x86_64.whl
ARG TORCHAUDIO_URL=${PT_BINARY_PATH}/torchaudio-0.11.0%2B6297e97-cp38-cp38-linux_x86_64.whl
# HF_BINARY_PATH: prefix inside the same bucket for the transformers wheel.
ARG HF_BINARY_PATH=${PT_BUCKET}/3464575b-920a-48e6-b964-4d4c5fe2acae/20220816-021125/7780d990d11d9552e913cae06c7315fa8545648b
# HF_TRANSFORMERS_URL: pure-Python transformers wheel matching TRANSFORMERS_VERSION.
ARG HF_TRANSFORMERS_URL=${HF_BINARY_PATH}/transformers-${TRANSFORMERS_VERSION}-py3-none-any.whl


# Upgrade the sagemaker package to the newest available release.
RUN pip install --no-cache-dir --upgrade sagemaker


# Install Hugging Face libraries and their dependencies.
# Each requirement is quoted: the extras list "[...]" would otherwise be a shell
# glob pattern and could be rewritten if a matching file existed in the working
# directory. No whitespace may follow the line-continuation backslashes.
RUN pip install -U --no-cache-dir \
    "transformers[sklearn,sentencepiece,audio,vision]==${TRANSFORMERS_VERSION}" \
    "datasets==${DATASETS_VERSION}"
	
# Upgrade selected OS packages and install git-lfs and libomp5.
# apt-get is used throughout: the `apt` front end warns that its CLI is not
# stable for scripts. The package index is fetched and removed in the same layer.
# TODO: Remove upgrade statements once packages are updated in base image
RUN apt-get update \
 && apt-get -y install --only-upgrade systemd openssl cryptsetup \
 && apt-get -y install git-lfs libomp5 \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*


# Install PyTorch
# Replace the installed torch with the wheel at PT_URL. --no-deps leaves the
# already-installed dependencies untouched. pip downloads the wheel itself and
# --no-cache-dir keeps no copy, so there is no local file to delete afterwards
# (the former `rm -rf ${PT_URL}` targeted the URL string and removed nothing).
RUN pip uninstall -y torch \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${PT_URL}"

# Install PyTorch XLA
# Replace any installed torch_xla with the wheel at PT_XLA_URL, without
# resolving dependencies. The wheel is fetched by pip and not cached, so no
# cleanup is required (an `rm -rf` on the URL string would remove nothing).
RUN pip uninstall -y torch_xla \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${PT_XLA_URL}"

# Install TorchVision
# Replace any installed torchvision with the wheel at TORCHVISION_URL, without
# resolving dependencies. The wheel is fetched by pip and not cached, so no
# cleanup is required (an `rm -rf` on the URL string would remove nothing).
RUN pip uninstall -y torchvision \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${TORCHVISION_URL}"

# Install TorchAudio
# Replace any installed torchaudio with the wheel at TORCHAUDIO_URL, without
# resolving dependencies. The wheel is fetched by pip and not cached, so no
# cleanup is required (an `rm -rf` on the URL string would remove nothing).
RUN pip uninstall -y torchaudio \
 && pip install --no-deps --no-cache-dir -U --force-reinstall "${TORCHAUDIO_URL}"

# Install Transformers
# Replace the transformers installed by version with the wheel at
# HF_TRANSFORMERS_URL (no dependency resolution), then delete the "examples"
# directory that sits next to the package's __init__.py, if present.
# The wheel is fetched by pip and not cached, so there is no downloaded file to
# remove. The computed path is quoted so it reaches rm as a single argument.
RUN pip uninstall -y transformers \
 && pip install --no-deps --no-cache-dir --force-reinstall -U "${HF_TRANSFORMERS_URL}" \
 && rm -rf "$(${PYTHON} -c "import transformers; print(transformers.__file__.replace('__init__.py', 'examples'))")"

# Install SageMaker Training Toolkit
# Drop whatever version is present, then reinstall sagemaker-training (>=4.2.6)
# on its own, without pulling in or changing its dependencies.
RUN pip uninstall --yes sagemaker-training \
 && pip install --no-deps --no-cache-dir --upgrade --force-reinstall "sagemaker-training>=4.2.6"

# Install SageMaker PyTorch Training Toolkit
# Drop whatever version is present, then reinstall sagemaker-pytorch-training
# (2.6.1 or newer, below 3) without pulling in or changing its dependencies.
RUN pip uninstall --yes sagemaker-pytorch-training \
 && pip install --no-deps --no-cache-dir --upgrade --force-reinstall "sagemaker-pytorch-training>=2.6.1,<3"


# Root directory of the conda installation; later steps use it to locate the
# conda executable and the MKL libraries under ${CONDA_PREFIX}/lib.
ARG CONDA_PREFIX=/opt/conda

# Remove the micromamba folder at its default installation path (/root/micromamba).
RUN rm -rf /root/micromamba/ 

# Install common training dependencies
# librosa is installed from the conda-forge channel. The conda caches (index,
# downloaded tarballs, unused packages) are cleaned in the same layer, because
# files deleted in a later layer would still occupy space in the image.
RUN ${CONDA_PREFIX}/bin/conda install -y -c conda-forge librosa \
 && ${CONDA_PREFIX}/bin/conda clean --all --yes
# Common training dependencies installed with pip. No versions are pinned, so
# pip selects the releases at build time.
RUN pip install --no-cache-dir \
    sacrebleu fugashi ipadic nltk \
    rouge-score soundfile sentence-transformers

# The conda install above downgrades numpy to a version that is incompatible
# with the PyTorch wheel, so numpy is pinned back to 1.22.2 here.
# --no-cache-dir matches the other pip steps and keeps the download cache out
# of this layer.
RUN pip install --no-cache-dir -U numpy==1.22.2

# Fixing identified CVEs
# Constrain protobuf, importlib-metadata and wheel to the listed version ranges.
# --no-cache-dir matches the other pip steps and keeps the download cache out
# of this layer.
RUN pip install --no-cache-dir -U \
    "protobuf<3.20.0" \
    "importlib-metadata<5.0,>=1.4.0" \
    "wheel>=0.38.0"

# Fix library links
# For each listed MKL library, create a "<name>.so.1" symlink pointing at
# "<name>.so" in ${CONDA_PREFIX}/lib. `set -e` stops at the first failing ln.
RUN set -e; \
    for lib in libmkl_intel_lp64 libmkl_intel_thread libmkl_core libmkl_gnu_thread; do \
        ln -s ${CONDA_PREFIX}/lib/${lib}.so ${CONDA_PREFIX}/lib/${lib}.so.1; \
    done

# Install Horovod
# The CUDA 11.3 stubs directory is added to the linker cache for the duration
# of the build, Horovod is installed with NCCL allreduce and PyTorch support,
# and the final plain ldconfig rebuilds the cache without the stubs directory.
ENV HOROVOD_VERSION=0.24.3
RUN pip uninstall --yes horovod \
 && ldconfig /usr/local/cuda-11.3/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL \
    HOROVOD_CUDA_HOME=/usr/local/cuda-11.3 \
    HOROVOD_WITH_PYTORCH=1 \
    pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
 && ldconfig

# Remove /root/.cache; this is required for security verification.
# Best-effort: `|| true` keeps the build going if the removal fails. (A single
# `|` would pipe rm's output into `true` rather than act as a fallback.)
RUN rm -rf /root/.cache || true

# OSS compliance step: download and unpack oss_compliance.zip into /root,
# install the testOSSCompliance script into /usr/local/bin, run
# generate_oss_compliance.sh with the home directory and Python interpreter as
# arguments, then delete the downloaded archive and the unpacked directory.
# curl -f makes an HTTP error fail this step directly instead of saving the
# error page as the zip; -L follows redirects; -sS hides the progress meter but
# still prints errors.
RUN HOME_DIR=/root \
 && curl -fsSL -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance*