# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# Container definition for Layout+language model training & inference on SageMaker

# The parent image is chosen by the caller at build time (--build-arg BASE_IMAGE=...); no default
# value is declared here.
ARG BASE_IMAGE
FROM $BASE_IMAGE

# Core dependencies:
# - Pin PyTorch to prevent pip accidentally re-installing/upgrading it via detectron
# - Pin setuptools per https://github.com/pytorch/pytorch/issues/69894#issuecomment-1080635462
# - Pin protobuf < 3.21 due to an error like https://stackoverflow.com/q/72441758 as of 2023-02
#   (which seems to originate from somewhere in SM DDP package when unconstrained install results
#   in downloading protobuf@4.x)
# Implementation notes:
# - The installed PyTorch version is read from `pip show torch` and handed back to pip as an exact
#   pin, so resolving the other requirements cannot replace it.
# - The build stops with a clear message if no torch version can be detected, rather than passing
#   pip a blank `torch==` pin.
# - --no-cache-dir keeps pip's download/wheel cache out of the image layer.
RUN pip install --no-cache-dir "amazon-textract-response-parser>=0.1,<0.2" "Pillow>=8,<9" \
    && PT_VER="$(pip show torch | sed -n 's/^Version: //p')" \
    && { [ -n "$PT_VER" ] || { echo "Could not detect installed torch version" >&2 ; exit 1 ; } ; } \
    && pip install --no-cache-dir \
        git+https://github.com/facebookresearch/detectron2.git \
        setuptools==59.5.0 \
        "torch==$PT_VER" \
        "torchvision>=0.11.3,<0.15" \
        "datasets>=2.4,<3" \
        "protobuf<3.21" \
        "transformers>=4.25,<4.27"

# Could also consider installing detectron2 via pre-built Linux wheel, depending on the PyTorch and
# CUDA versions of your base container:
# https://github.com/aws/deep-learning-containers/tree/master/huggingface/pytorch
# https://detectron2.readthedocs.io/en/latest/tutorials/install.html
#
# For example:
#     && pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html


# Additional dependencies:
# - pytesseract shouldn't be necessary after Transformers v4.18 (because we don't use Tesseract
#   OCR), but older versions have a bug: https://github.com/huggingface/transformers/issues/16845
# - datasets 1.18 and torchvision 0.11 are installed in the HF training container but missing from
#   the inference container, and we need them for inference. Upgraded datasets to use some new
#   logging controls and debug multi-worker .map() pre-processing. (Both are installed with the
#   core dependencies above.)
# torch is re-pinned to the version already installed so that this step cannot replace it; the
# build stops if that version cannot be detected. --no-cache-dir keeps pip's cache out of the layer.
RUN PT_VER="$(pip show torch | sed -n 's/^Version: //p')" \
    && { [ -n "$PT_VER" ] || { echo "Could not detect installed torch version" >&2 ; exit 1 ; } ; } \
    && pip install --no-cache-dir pytesseract "torch==$PT_VER"


# If you'd like to enable this container as a Custom Image for notebook kernels, for debugging in
# SageMaker Studio, build it with INCLUDE_NOTEBOOK_KERNEL=1 arg to include IPython kernel and also
# some other PDF processing + OCR utilities:
ARG INCLUDE_NOTEBOOK_KERNEL
# The notebook extras below are installed only when INCLUDE_NOTEBOOK_KERNEL is non-empty:
# - poppler + tesseract come from conda-forge; conda's package cache is cleaned in the same layer
#   so it does not add to the image size.
# - torch is re-pinned to the version already installed so pip cannot replace it, and the build
#   stops if that version cannot be detected.
# - NOTE(review): the TESSDATA_PREFIX export only lives for this RUN's shell (it is visible to the
#   `ipykernel install` command that follows, but is not stored in the image) - confirm whether it
#   was meant to be available at runtime.
RUN if test -z "$INCLUDE_NOTEBOOK_KERNEL" ; \
    then \
        echo Skipping notebook kernel dependencies \
    ; else \
        conda install -y -c conda-forge poppler tesseract && \
        conda clean --all --yes && \
        PT_VER="$(pip show torch | sed -n 's/^Version: //p')" && \
        { [ -n "$PT_VER" ] || \
            { echo "Could not detect installed torch version" >&2 ; exit 1 ; } ; } && \
        pip install --no-cache-dir easyocr ipykernel "ipywidgets>=7,<8" pdf2image pytesseract \
            sagemaker "torch==$PT_VER" && \
        export TESSDATA_PREFIX='/opt/conda/share/tessdata' && \
        python -m ipykernel install --sys-prefix \
    ; fi

# We would like to disable SMDEBUG when running as a notebook kernel, because it can cause some
# unwanted side-effects. At the time of writing, Dockerfile syntax has no fully conditional ENV
# statement, so the value is derived in two steps that must stay in this order.
#
# Step 1: if INCLUDE_NOTEBOOK_KERNEL was given a non-empty value (e.g. --build-arg
# INCLUDE_NOTEBOOK_KERNEL=1), set USE_SMDEBUG to 'false'; otherwise set it to the empty string.
ENV USE_SMDEBUG=${INCLUDE_NOTEBOOK_KERNEL:+false}
# Step 2: an empty value ('') will cause problems in SM Training, so replace an empty USE_SMDEBUG
# with 'true' (which should be the default per the link below). A 'false' from step 1 is kept.
# https://github.com/awslabs/sagemaker-debugger/blob/56fabe531692403e77ce9b5879d55211adec238e/smdebug/core/config_validator.py#L21
ENV USE_SMDEBUG=${USE_SMDEBUG:-true}

# See below guidance for adding an image built with INCLUDE_NOTEBOOK_KERNEL to SMStudio:
# https://docs.aws.amazon.com/sagemaker/latest/dg/studio-byoi.html
# https://github.com/aws-samples/sagemaker-studio-custom-image-samples
#
# An image config something like the following should work:
# {
#     "KernelSpecs": [
#         {
#             "Name": "python3",
#             "DisplayName": "Textract Transformers"
#         }
#     ],
#     "FileSystemConfig": {
#         "MountPath": "/root/data",
#         "DefaultUid": 0,
#         "DefaultGid": 0
#     }
# }