version 1.0 ## Copyright Broad Institute, 2019 ## ## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool ## from GATK4 in GVCF mode on a single sample according to GATK Best Practices. ## When executed the workflow scatters the HaplotypeCaller tool over a sample ## using an intervals list file. The output file produced will be a ## single gvcf file. ## ## Requirements/expectations : ## - One analysis-ready BAM file for a single sample (as identified in RG:SM) ## - Set of variant calling intervals lists for the scatter, provided in a file ## ## Outputs : ## - One GVCF file and its index ## ## ## LICENSING : ## This script is released under the WDL source code license (BSD-3) (see LICENSE in ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may ## be subject to different licenses. Users are responsible for checking that they are ## authorized to run all programs before running this script. Please see the dockers ## for detailed licensing information pertaining to the included programs. # WORKFLOW DEFINITION workflow HaplotypeCallerGvcf_GATK4 { input { File input_bam File input_bam_index File ref_fasta File scattered_calling_intervals_archive File ref_fasta_index File ref_dict String gatk_docker String gotc_docker } #File ref_fasta="s3://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" #String ref_base_ext = basename(ref_fasta) #String ref_base = basename(ref_fasta,".fasta") #String ref_base_path = sub(ref_fasta,ref_base_ext,"") #File ref_fasta_index= ref_base_path + ref_base_ext + ".fai" #File ref_dict = ref_base_path + ref_base + ".dict" #File ref_fasta_index="s3://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai" #File scattered_calling_intervals_archive="s3://omics-test-input-bucket/workflow/intervals.tar.gz" #File ref_dict="s3://broad-references/hg38/v0/Homo_sapiens_assembly38.dict" Boolean make_gvcf = false Boolean make_bamout = false #String gatk_docker = "022521056385.dkr.ecr.us-east-1.amazonaws.com/gatk:4.1.9.0" String gatk_path = "/gatk/gatk" String sample_basename = basename(input_bam, ".bam") String vcf_basename = sample_basename String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" String output_filename = vcf_basename + output_suffix #Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) # Call variants in parallel over grouped calling intervals call UnpackIntervals { input: archive = scattered_calling_intervals_archive, docker = gotc_docker } scatter (interval_file in UnpackIntervals.interval_files) { # Generate GVCF by interval call HaplotypeCaller { input: input_bam = input_bam, input_bam_index = input_bam_index, interval_list = interval_file, output_filename = output_filename, ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, make_gvcf = make_gvcf, make_bamout = make_bamout, docker = gatk_docker, gatk_path = gatk_path } } # Merge per-interval GVCFs call MergeGVCFs { input: input_vcfs = HaplotypeCaller.output_vcf, input_vcfs_indexes = HaplotypeCaller.output_vcf_index, output_filename = output_filename, docker = gatk_docker, gatk_path = gatk_path } # Outputs that will be retained when execution is complete output { File output_vcf = MergeGVCFs.output_vcf File output_vcf_index = MergeGVCFs.output_vcf_index } } # TASK DEFINITIONS task UnpackIntervals { input { File archive String docker } String basestem_input = basename(archive, ".tar") command { echo "Unpack Intervals" >&2 tar xvf ~{archive} --directory ./ } runtime { docker: docker cpu: 2 memory: "2 GiB" } output { Array[File] interval_files = glob("${basestem_input}/*") } } # HaplotypeCaller per-sample in GVCF mode task HaplotypeCaller { input { # Command parameters File input_bam File input_bam_index File interval_list String output_filename File ref_dict File ref_fasta File ref_fasta_index Float? contamination Boolean make_gvcf Boolean make_bamout String gatk_path String? java_options # Runtime parameters String docker Int? mem_gb } String java_opt = select_first([java_options, ""]) Int machine_mem_gb = select_first([mem_gb, 16]) Int command_mem_gb = machine_mem_gb - 2 String vcf_basename = if make_gvcf then basename(output_filename, ".gvcf") else basename(output_filename, ".vcf") String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else "" parameter_meta { input_bam: { description: "a bam file" } input_bam_index: { description: "an index file for the bam input" } } command { echo HaplotypeCaller >&2 set -euxo pipefail ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G ~{java_opt}" \ HaplotypeCaller \ -R ~{ref_fasta} \ -I ~{input_bam} \ -L ~{interval_list} \ -O ~{output_filename} \ -contamination ~{default="0" contamination} \ -G StandardAnnotation -G StandardHCAnnotation ~{true="-G AS_StandardAnnotation" false="" make_gvcf} \ -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ ~{true="-ERC GVCF" false="" make_gvcf} \ ~{bamout_arg} touch ~{vcf_basename}.bamout.bam } runtime { docker: docker memory: machine_mem_gb + " GiB" cpu: 4 } output { File output_vcf = "~{output_filename}" File output_vcf_index = "~{output_filename}.tbi" File bamout = "~{vcf_basename}.bamout.bam" } } # Merge GVCFs generated per-interval for the same sample task MergeGVCFs { input { # Command parameters Array[File] input_vcfs Array[File] input_vcfs_indexes String output_filename String gatk_path # Runtime parameters String docker Int? mem_gb } Int machine_mem_gb = select_first([mem_gb, 8]) Int command_mem_gb = machine_mem_gb - 2 command { echo MergeGVCFs set -euxo pipefail ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G" \ MergeVcfs \ --INPUT ~{sep=' --INPUT ' input_vcfs} \ --OUTPUT ~{output_filename} } runtime { docker: docker memory: machine_mem_gb + " GB" cpu: 2 } output { File output_vcf = "~{output_filename}" File output_vcf_index = "~{output_filename}.tbi" } }