# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#### Container image with document/image processing (and optionally OCR) tools added.

# Parent image to extend. No default is declared here, so the build must supply it,
# e.g. `docker build --build-arg BASE_IMAGE=<image-uri> .`
ARG BASE_IMAGE
FROM $BASE_IMAGE

# Common/base doc & image processing tools:
# - poppler (from conda-forge) is installed alongside the pdf2image Python package
# - Pillow is constrained to the 8.x series
# Package manager caches are dropped in the same layer that creates them, so the downloaded
# archives do not end up stored in the image.
RUN conda install -c conda-forge poppler -y \
  && conda clean --all --yes \
  && pip install --no-cache-dir amazon-textract-response-parser pdf2image "Pillow>=8,<9"

# Optional OCR engine: Tesseract+PyTesseract
# Installed only when the INCLUDE_OCR_TESSERACT build argument is set to a non-empty value.
# conda tesseract already includes Leptonica dependency and multi-language tessdata files by default
# (but didn't set the required TESSDATA_PREFIX variable at time of writing)
ARG INCLUDE_OCR_TESSERACT
RUN if test -z "$INCLUDE_OCR_TESSERACT" ; \
    then \
        echo Skipping OCR engine Tesseract \
    ; else \
        conda install -y -c conda-forge tesseract && \
        conda clean --all --yes && \
        pip install --no-cache-dir pytesseract \
    ; fi

# TESSDATA_PREFIX has to be set with ENV: a shell `export` inside a RUN instruction only lasts for
# that single build step and is not present in the resulting image or running containers.
# ENV cannot be made conditional on a build argument, so the variable is defined even when the
# Tesseract install above was skipped.
# NOTE(review): the path assumes the base image's conda prefix is /opt/conda - confirm.
ENV TESSDATA_PREFIX=/opt/conda/share/tessdata