#!/bin/bash
# Universal entrypoint script for containerized tooling for use with AWS Batch
# that handles data staging of predefined inputs and outputs.
#
# Environment Variables
#
#   JOB_WORKFLOW_NAME
#     Optional
#     Name of the parent workflow for this job. Used with
#     JOB_WORKFLOW_EXECUTION_ID to generate a unique prefix for workflow
#     outputs.
#
#   JOB_WORKFLOW_EXECUTION_ID
#     Optional
#     Unique identifier for the current workflow run. Used with
#     JOB_WORKFLOW_NAME to generate a unique prefix for workflow outputs.
#
#   JOB_AWS_CLI_PATH
#     Required if staging data from S3
#     Default: /opt/aws-cli/bin
#     Path to add to the PATH environment variable so that the AWS CLI can be
#     located. Use this if bindmounting the AWS CLI from the host and it is
#     packaged in a self-contained way (e.g. not needing OS/distribution
#     specific shared libraries). The AWS CLI installed with `conda` is
#     sufficiently self-contained. Using a standard python virtualenv does
#     not work.
#
#   JOB_VERBOSE
#     Optional
#     Default: null
#     Set to any non-empty value to enable shell command tracing (`set -x`)
#     for debugging.
#
#   JOB_DATA_ISOLATION
#     Optional
#     Default: null
#     Set to 1 if the container will need to use an isolated data space -
#     e.g. it will operate in a volume mounted from the host for scratch.
#
#   JOB_INPUTS
#     Optional
#     Default: null
#     A space delimited list of http(s) urls or s3 object urls - e.g.:
#       https://somedomain.com/path s3://{prefix1}/{key_pattern1} [s3://{prefix2}/{key_pattern2} [...]]
#     for files that the job will use as inputs
#
#   JOB_INPUT_PREFIX
#     Optional
#     Default: null
#     A common prefix that entries in JOB_INPUTS may reference (e.g.
#     ${JOB_INPUT_PREFIX}/file). Evaluated with `envsubst` before use.
#
#   JOB_OUTPUTS
#     Optional
#     Default: null
#     A space delimited list of files - e.g.:
#       file1 [file2 [...]]
#     that the job generates that will be retained - i.e. transferred back
#     to S3
#
#   JOB_OUTPUT_PREFIX
#     Required if JOB_OUTPUTS need to be stored on S3
#     Default: null
#     S3 location (e.g. s3://bucket/prefix) where job outputs will be stored
#
#   JOB_INPUT_S3_COPY_METHOD
#     Optional
#     Default: s3cp
#     If copying files from an S3 bucket, choose the method for the copy:
#       s3cp:   use `aws s3 cp --no-progress --recursive --exclude "*"
#               --include {key_pattern} {prefix} .` for each s3 input in
#               JOB_INPUTS
#       s3sync: use `aws s3 sync {input} .` for each s3 input in JOB_INPUTS
#
#   JOB_OUTPUT_S3_COPY_METHOD
#     Optional
#     Default: s3cp
#     If copying files to an S3 bucket, choose the method for the copy:
#       s3cp:   use `aws s3 cp --no-progress {file} {prefix}/{file}` for each
#               file in JOB_OUTPUTS
#       s3sync: use `aws s3 sync {path} {prefix}/` to sync a local path to
#               the JOB_OUTPUT_PREFIX location
#
#   WORKING_DIRECTORY
#     Optional
#     Default: The container working directory
#     Set an alternative working directory, useful when the host machine has
#     an external mount using FSx, EFS, etc.
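#
# Example
#   As a quick orientation, assuming this script is the image's entrypoint, a
#   hypothetical job (bucket names, key patterns, script name, and the tool
#   command below are placeholder values, not defaults) could be configured as:
#
#     JOB_INPUTS="s3://my-bucket/inputs/sample1.* https://example.com/reference.txt" \
#     JOB_OUTPUTS="result.txt" \
#     JOB_OUTPUT_PREFIX="s3://my-bucket/outputs" \
#     ./entrypoint.sh mytool --input sample1.bam --output result.txt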
set -e  # exit on error

if [[ $JOB_VERBOSE ]]; then
    set -x  # enable command echo
fi

# If WORKING_DIRECTORY is set and valid, change directory to the WORKING_DIRECTORY
if [[ $WORKING_DIRECTORY && -d "$WORKING_DIRECTORY" ]]; then
    cd "$WORKING_DIRECTORY"
fi

WORKING_FOLDER=`pwd`

DEFAULT_AWS_CLI_PATH=/opt/aws-cli/bin
AWS_CLI_PATH=${JOB_AWS_CLI_PATH:-$DEFAULT_AWS_CLI_PATH}

PATH=$PATH:$AWS_CLI_PATH

# ensure that JOB_INPUT_PREFIX is fully evaluated if present
if [[ $JOB_INPUT_PREFIX ]]; then
    JOB_INPUT_PREFIX=`echo $JOB_INPUT_PREFIX | envsubst`
fi

if [[ $JOB_DATA_ISOLATION && $JOB_DATA_ISOLATION == 1 ]]; then
    ## AWS Batch places multiple jobs on an instance.
    ## To avoid file path clobbering when using a host mounted scratch space,
    ## use the Job ID and Job Attempt to create a unique path.
    if [[ $AWS_BATCH_JOB_ID ]]; then
        GUID="$AWS_BATCH_JOB_ID/$AWS_BATCH_JOB_ATTEMPT"
    else
        GUID=`date | md5sum | cut -d " " -f 1`
    fi

    mkdir -p "$GUID"
    cd "$GUID"
fi

function stage_in() (
    # Loops over a space delimited list of inputs (patterns allowed), e.g.:
    #     s3://{prefix1}/{key_pattern1} [s3://{prefix2}/{key_pattern2} [...]]
    # and uses the AWS CLI to download s3 objects. http(s) urls are fetched
    # with wget; anything else is assumed to be a local path.
    # The `noglob` option is needed so that patterns are not expanded against
    # the local filesystem. Because the function body is a subshell, this
    # setting is local to the function.
    set -o noglob

    for item in "$@"; do
        item=`echo $item | envsubst`
        if [[ $item =~ ^s3:// ]]; then
            if [[ $JOB_INPUT_S3_COPY_METHOD && $JOB_INPUT_S3_COPY_METHOD == 's3sync' ]]; then
                echo "[input][s3sync] remote: $item ==> $WORKING_FOLDER/"
                aws s3 sync $item .
            else
                local item_key=`basename $item`
                local item_prefix=`dirname $item`

                echo "[input][s3cp] remote: $item ==> $WORKING_FOLDER/$item_key"
                aws s3 cp \
                    --no-progress \
                    --recursive \
                    --exclude "*" \
                    --include "${item_key}" \
                    ${item_prefix} .
            fi
        elif [[ $item =~ ^https?:// ]]; then
            echo "[input][url] $item ==> $WORKING_FOLDER/"
            wget $item
        else
            echo "[input] local: $item"
        fi
    done
)

function stage_out() (
    # Loops over a space delimited list of output files:
    #     file1 [file2 [...]]
    # that the job generated and uses the AWS CLI to upload them.
    for item in "$@"; do
        if [[ ! -f $item && ! -d $item ]]; then
            # If an expected output is not found it is generally considered an
            # error. To suppress this error when using glob expansion you can
            # set the `nullglob` option (`shopt -s nullglob`).
            echo "[output] ERROR: $item does not exist" 1>&2
            exit 1
        else
            if [[ $JOB_OUTPUT_PREFIX && $JOB_OUTPUT_PREFIX =~ ^s3:// ]]; then
                local item_key=`basename $item`
                local output_prefix=$JOB_OUTPUT_PREFIX

                if [[ $JOB_WORKFLOW_NAME && $JOB_WORKFLOW_EXECUTION_ID ]]; then
                    output_prefix=$output_prefix/$JOB_WORKFLOW_NAME/$JOB_WORKFLOW_EXECUTION_ID
                fi

                if [[ $JOB_OUTPUT_S3_COPY_METHOD && $JOB_OUTPUT_S3_COPY_METHOD == 's3sync' ]]; then
                    echo "[output][s3sync] remote: $WORKING_FOLDER/$item ==> $output_prefix/"
                    aws s3 sync $item $output_prefix/
                else
                    echo "[output][s3cp] remote: $WORKING_FOLDER/$item ==> $output_prefix/${item_key}"
                    aws s3 cp \
                        --no-progress \
                        ./$item $output_prefix/${item_key}
                fi
            elif [[ $JOB_OUTPUT_PREFIX && ! $JOB_OUTPUT_PREFIX =~ ^s3:// ]]; then
                echo "[output] ERROR: unsupported remote output destination $JOB_OUTPUT_PREFIX" 1>&2
            else
                echo "[output] local: $WORKING_FOLDER/$item"
            fi
        fi
    done
)

# The command to run is specified in the JobSubmission container overrides,
# which gives the user flexibility to specify tooling options as needed.
#
# Note that AWS Batch has an implicit 8KB limit on the amount of data allowed
# in container overrides, which includes environment variable data.
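#
# As a sketch of what that looks like from the submitting side (the job name,
# queue, job definition, and values below are placeholders, not part of this
# script):
#
#   aws batch submit-job \
#       --job-name my-job \
#       --job-queue my-queue \
#       --job-definition my-job-def \
#       --container-overrides '{
#           "command": ["mytool", "--input", "sample1.bam", "--output", "result.txt"],
#           "environment": [
#               {"name": "JOB_INPUTS", "value": "s3://my-bucket/inputs/sample1.*"},
#               {"name": "JOB_OUTPUTS", "value": "result.txt"},
#               {"name": "JOB_OUTPUT_PREFIX", "value": "s3://my-bucket/outputs"}
#           ]
#       }'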
# fully evaluate any environment variable references in the supplied command
COMMAND=`echo "$*" | envsubst`

printenv

# note: JOB_INPUTS and JOB_OUTPUTS are intentionally unquoted so the space
# delimited lists split into individual arguments
stage_in $JOB_INPUTS

echo "[command]: $COMMAND"
bash -c "$COMMAND"

stage_out $JOB_OUTPUTS
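
# Local test sketch (hypothetical image name and host paths; assumes the AWS
# CLI is bindmounted from the host at /opt/aws-cli, matching
# DEFAULT_AWS_CLI_PATH above, and that AWS credentials are supplied
# separately, e.g. via additional -e flags or an instance role):
#
#   docker run --rm \
#       -v /opt/aws-cli:/opt/aws-cli:ro \
#       -e JOB_AWS_CLI_PATH=/opt/aws-cli/bin \
#       -e JOB_INPUTS="s3://my-bucket/inputs/sample1.*" \
#       -e JOB_OUTPUTS="result.txt" \
#       -e JOB_OUTPUT_PREFIX="s3://my-bucket/outputs" \
#       my-tool-image mytool --input sample1.bam --output result.txt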