version 1.0

## Copyright Broad Institute, 2019
##
## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
## When executed the workflow scatters the HaplotypeCaller tool over a sample
## using an intervals list file. The output file produced will be a
## single gvcf file which can be used by the joint-discovery workflow.
##
## Requirements/expectations :
## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
## - Set of variant calling intervals lists for the scatter, provided in a file
##
## Outputs :
## - One GVCF file and its index
##
## Cromwell version support
## - Successfully tested on v53
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the dockers
## for detailed licensing information pertaining to the included programs.

# WORKFLOW DEFINITION
workflow HaplotypeCallerGvcf_GATK4 {
    input {
        File input_bam
        File input_bam_index
        File ref_dict
        File ref_fasta
        File ref_fasta_index
        File scattered_calling_intervals_list

        Boolean make_gvcf = true
        Boolean make_bamout = false
        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.2.0.0"
        String gatk_path = "/gatk/gatk"
        String gitc_docker = "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.7-1603303710"
        String samtools_path = "samtools"
    }

    Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)

    #is the input a cram file?
    Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"

    String sample_basename = if is_cram then  basename(input_bam, ".cram") else basename(input_bam, ".bam")
    String vcf_basename = sample_basename
    String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz"
    String output_filename = vcf_basename + output_suffix

    # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller.
    # If we take the number we are scattering by and reduce by 20 we will have enough disk space
    # to account for the fact that the data is quite uneven across the shards.
    Int potential_hc_divisor = length(scattered_calling_intervals) - 20
    Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1

    if ( is_cram ) {
        call CramToBamTask {
            input:
                input_cram = input_bam,
                sample_name = sample_basename,
                ref_dict = ref_dict,
                ref_fasta = ref_fasta,
                ref_fasta_index = ref_fasta_index,
                docker = gitc_docker,
                samtools_path = samtools_path
        }
    }

    # Call variants in parallel over grouped calling intervals
    scatter (interval_file in scattered_calling_intervals) {

        # Generate GVCF by interval
        call HaplotypeCaller {
            input:
                input_bam = select_first([CramToBamTask.output_bam, input_bam]),
                input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
                interval_list = interval_file,
                output_filename = output_filename,
                ref_dict = ref_dict,
                ref_fasta = ref_fasta,
                ref_fasta_index = ref_fasta_index,
                hc_scatter = hc_divisor,
                make_gvcf = make_gvcf,
                make_bamout = make_bamout,
                docker = gatk_docker,
                gatk_path = gatk_path
        }
    }

    # Merge per-interval GVCFs
    call MergeGVCFs {
        input:
            input_vcfs = HaplotypeCaller.output_vcf,
            input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
            output_filename = output_filename,
            docker = gatk_docker,
            gatk_path = gatk_path
    }

    # Outputs that will be retained when execution is complete
    output {
        File output_vcf = MergeGVCFs.output_vcf
        File output_vcf_index = MergeGVCFs.output_vcf_index
    }
}

# TASK DEFINITIONS

task CramToBamTask {
    input {
        # Command parameters
        File ref_fasta
        File ref_fasta_index
        File ref_dict
        File input_cram
        String sample_name

        # Runtime parameters
        String docker
        Int? machine_mem_gb
        String samtools_path
        Int? retry_attempts
    }

    command {
        set -e
        set -o pipefail

        ~{samtools_path} view -h -T ~{ref_fasta} ~{input_cram} |
        ~{samtools_path} view -b -o ~{sample_name}.bam -
        ~{samtools_path} index -b ~{sample_name}.bam
        mv ~{sample_name}.bam.bai ~{sample_name}.bai
    }
    runtime {
        docker: docker
        memory: select_first([machine_mem_gb, 15]) + " GB"
        awsBatchRetryAttempts: select_first([retry_attempts, 3])
    }
    output {
        File output_bam = "~{sample_name}.bam"
        File output_bai = "~{sample_name}.bai"
    }
}

# HaplotypeCaller per-sample in GVCF mode
task HaplotypeCaller {
    input {
        # Command parameters
        File input_bam
        File input_bam_index
        File interval_list
        String output_filename
        File ref_dict
        File ref_fasta
        File ref_fasta_index
        Float? contamination
        Boolean make_gvcf
        Boolean make_bamout
        Int hc_scatter

        String? gcs_project_for_requester_pays

        String gatk_path
        String? java_options

        # Runtime parameters
        String docker
        Int? mem_gb
        Int? retry_attempts
    }

    String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])

    Int machine_mem_gb = select_first([mem_gb, 7])
    Int command_mem_gb = machine_mem_gb - 1

    String vcf_basename = if make_gvcf then  basename(output_filename, ".gvcf") else basename(output_filename, ".vcf")
    String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else ""

    parameter_meta {
        input_bam: {
                       description: "a bam file",
                       localization_optional: true
                   }
        input_bam_index: {
                             description: "an index file for the bam input",
                             localization_optional: true
                         }
    }
    command {
        set -e

        ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G ~{java_opt}" \
        HaplotypeCaller \
        -R ~{ref_fasta} \
        -I ~{input_bam} \
        -L ~{interval_list} \
        -O ~{output_filename} \
        -contamination ~{default="0" contamination} \
        -G StandardAnnotation -G StandardHCAnnotation ~{true="-G AS_StandardAnnotation" false="" make_gvcf} \
        -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \
        ~{true="-ERC GVCF" false="" make_gvcf} \
        ~{if defined(gcs_project_for_requester_pays) then "--gcs-project-for-requester-pays ~{gcs_project_for_requester_pays}" else ""} \
        ~{bamout_arg}

        # Cromwell doesn't like optional task outputs, so we have to touch this file.
        touch ~{vcf_basename}.bamout.bam
    }
    runtime {
        docker: docker
        memory: machine_mem_gb + " GB"
        awsBatchRetryAttempts: select_first([retry_attempts, 3])
    }
    output {
        File output_vcf = "~{output_filename}"
        File output_vcf_index = "~{output_filename}.tbi"
        File bamout = "~{vcf_basename}.bamout.bam"
    }
}
# Merge GVCFs generated per-interval for the same sample
task MergeGVCFs {
    input {
        # Command parameters
        Array[File] input_vcfs
        Array[File] input_vcfs_indexes
        String output_filename

        String gatk_path

        # Runtime parameters
        String docker
        Int? mem_gb
        Int? retry_attempts
    }
    Int machine_mem_gb = select_first([mem_gb, 3])
    Int command_mem_gb = machine_mem_gb - 1

    command {
        set -e

        ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G"  \
        MergeVcfs \
        --INPUT ~{sep=' --INPUT ' input_vcfs} \
        --OUTPUT ~{output_filename}
    }
    runtime {
        docker: docker
        memory: machine_mem_gb + " GB"
        awsBatchRetryAttempts: select_first([retry_attempts, 3])
    }
    output {
        File output_vcf = "~{output_filename}"
        File output_vcf_index = "~{output_filename}.tbi"
    }
}