version 1.1

## Copyright Broad Institute, 2017
##
## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample,
## and performs additional filtering.
##
## Main requirements/expectations :
## - One analysis-ready BAM file (and its index) for each sample
##
## Description of inputs:
##
## ** Workflow options **
## intervals: genomic intervals (will be used for scatter)
## scatter_count: number of parallel jobs to generate when scattering over intervals
## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional)
## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional)
## run_orientation_bias_mixture_model_filter: (optional) if true, filter orientation bias sites with the read orientation artifact mixture model.
##
## ** Primary inputs **
## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary
## tumor_reads, tumor_reads_index: BAM and index for the tumor sample
## normal_reads, normal_reads_index: BAM and index for the normal sample
##
## ** Primary resources ** (optional but strongly recommended)
## pon, pon_idx: optional panel of normals (and its index) in VCF format containing probable technical artifacts (false positives)
## gnomad, gnomad_idx: optional database of known germline variants (and its index) (see http://gnomad.broadinstitute.org/downloads)
## variants_for_contamination, variants_for_contamination_idx: VCF of common variants (and its index) with allele frequencies for calculating contamination
##
## ** Secondary resources ** (for optional tasks)
## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator.
##
## Outputs :
## - One VCF file and its index with primary filtering applied; secondary filtering if requested; a bamout.bam
##   file of reassembled reads if requested
##
## Cromwell version support
## - Successfully tested on v34
##
## LICENSING :
## This script is released under the GATK source code license (Apache 2.0) (see LICENSE in
## https://github.com/broadinstitute/gatk). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information
## pertaining to the included programs.

# Shared bundle of runtime settings handed to most tasks below.
# machine_mem / command_mem are in MiB: the workflow builds them as
# small_task_mem * 1024 and (small_task_mem * 1024) - 512, and task runtime
# sections interpolate machine_mem into a "~{...} MiB" memory string.
struct Runtime {
    String gatk_docker
    Int cpu
    Int machine_mem    # total task memory, MiB
    Int command_mem    # JVM -Xmx budget, MiB (machine_mem minus headroom)
}

workflow Mutect2 {
    input {
        # basic inputs
        File? intervals
        File tumor_reads
        File tumor_reads_index
        File normal_reads
        File normal_reads_index

        # optional but usually recommended resources
        File? pon
        File? pon_idx
        File? gnomad
        File? gnomad_idx
        File? variants_for_contamination
        File? variants_for_contamination_idx

        # extra arguments
        String? m2_extra_args
        String? m2_extra_filtering_args
        String? getpileupsummaries_extra_args
        String? split_intervals_extra_args

        # additional modes and outputs
        File? realignment_index_bundle
        String? realignment_extra_args
        Boolean run_orientation_bias_mixture_model_filter = false
        Boolean make_bamout = false
        Boolean compress_vcfs = false
        File? gga_vcf
        File? gga_vcf_idx
        # NOTE(review): make_m3_training_dataset only gates the Concatenate call below;
        # it is never forwarded to task M2 (only make_m3_test_dataset is) — confirm
        # this matches the intended Mutect3 data-collection behavior.
        Boolean make_m3_training_dataset = false
        Boolean make_m3_test_dataset = false
        File? m3_training_dataset_truth_vcf
        File? m3_training_dataset_truth_vcf_idx

        # runtime
        String ecr_registry
        String aws_region
        Int scatter_count = 50
        Int small_task_cpu = 2
        Int small_task_mem = 4                       # GiB; converted to MiB below
        Int learn_read_orientation_mem = 8192        # MiB
        Int filter_alignment_artifacts_mem = 9216    # MiB
    }

    # Reference files are resolved from the regional AWS HealthOmics mirror of the
    # Broad hg38 reference bundle; the GATK image comes from the caller's ECR registry.
    String src_bucket_name = "omics-" + aws_region
    File ref_fasta = "s3://" + src_bucket_name + "/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
    File ref_fai = "s3://" + src_bucket_name + "/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai"
    File ref_dict = "s3://" + src_bucket_name + "/broad-references/hg38/v0/Homo_sapiens_assembly38.dict"
    String gatk_docker = ecr_registry + "/ecr-public/aws-genomics/broadinstitute/gatk:4.2.6.1-corretto-11"

    # 512 MiB is held back from the JVM heap for non-heap overhead.
    Runtime standard_runtime = Runtime {"gatk_docker": gatk_docker, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1024, "command_mem": (small_task_mem * 1024) - 512}

    # Split the calling territory into scatter_count shards for parallel Mutect2 runs.
    call SplitIntervals {
        input:
            intervals = intervals,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            scatter_count = scatter_count,
            split_intervals_extra_args = split_intervals_extra_args,
            runtime_params = standard_runtime
    }

    # One Mutect2 call (plus optional pileup summaries) per interval shard.
    scatter (subintervals in SplitIntervals.interval_files ) {
        call M2 {
            input:
                intervals = subintervals,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                tumor_reads = tumor_reads,
                tumor_reads_index = tumor_reads_index,
                normal_reads = normal_reads,
                normal_reads_index = normal_reads_index,
                pon = pon,
                pon_idx = pon_idx,
                gnomad = gnomad,
                gnomad_idx = gnomad_idx,
                m2_extra_args = m2_extra_args,
                getpileupsummaries_extra_args = getpileupsummaries_extra_args,
                variants_for_contamination = variants_for_contamination,
                variants_for_contamination_idx = variants_for_contamination_idx,
                make_bamout = make_bamout,
                run_ob_filter = run_orientation_bias_mixture_model_filter,
                compress_vcfs = compress_vcfs,
                gga_vcf = gga_vcf,
                gga_vcf_idx = gga_vcf_idx,
                make_m3_test_dataset = make_m3_test_dataset,
                m3_training_dataset_truth_vcf = m3_training_dataset_truth_vcf,
                m3_training_dataset_truth_vcf_idx = m3_training_dataset_truth_vcf_idx,
                gatk_docker = gatk_docker
        }
    }

    # Learn read-orientation artifact priors from the per-shard F1R2 counts.
    if (run_orientation_bias_mixture_model_filter) {
        call LearnReadOrientationModel {
            input:
                f1r2_tar_gz = M2.f1r2_counts,
                runtime_params = standard_runtime,
                mem = learn_read_orientation_mem
        }
    }

    # Gather per-shard unfiltered VCFs into one.
    call MergeVCFs {
        input:
            input_vcfs = M2.unfiltered_vcf,
            input_vcf_indices = M2.unfiltered_vcf_idx,
            compress_vcfs = compress_vcfs,
            runtime_params = standard_runtime
    }

    if (make_bamout) {
        call MergeBamOuts {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                bam_outs = M2.output_bamOut,
                runtime_params = standard_runtime,
        }
    }

    call MergeStats {
        input:
            stats = M2.stats,
            runtime_params = standard_runtime
    }

    # Contamination estimation runs only when a common-variant resource was supplied.
    if (defined(variants_for_contamination)) {
        call MergePileupSummaries as MergeTumorPileups {
            input:
                input_tables = flatten(M2.tumor_pileups),
                output_name = "tumor-pileups",
                ref_dict = ref_dict,
                runtime_params = standard_runtime
        }

        if (defined(normal_reads)){
            call MergePileupSummaries as MergeNormalPileups {
                input:
                    input_tables = flatten(M2.normal_pileups),
                    output_name = "normal-pileups",
                    ref_dict = ref_dict,
                    runtime_params = standard_runtime
            }
        }

        call CalculateContamination {
            input:
                tumor_pileups = MergeTumorPileups.merged_table,
                normal_pileups = MergeNormalPileups.merged_table,
                runtime_params = standard_runtime
        }
    }

    # Concatenate the per-shard Mutect3 dataset fragments into one text file.
    if (make_m3_training_dataset || make_m3_test_dataset) {
        call Concatenate {
            input:
                input_files = M2.m3_dataset,
                gatk_docker = gatk_docker
        }
    }

    # Primary filtering; optional inputs (contamination, segments, artifact priors)
    # are simply omitted from the FilterMutectCalls command line when absent.
    call Filter {
        input:
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            unfiltered_vcf = MergeVCFs.merged_vcf,
            unfiltered_vcf_idx = MergeVCFs.merged_vcf_idx,
            compress_vcfs = compress_vcfs,
            mutect_stats = MergeStats.merged_stats,
            contamination_table = CalculateContamination.contamination_table,
            maf_segments = CalculateContamination.maf_segments,
            artifact_priors_tar_gz = LearnReadOrientationModel.artifact_prior_table,
            m2_extra_filtering_args = m2_extra_filtering_args,
            runtime_params = standard_runtime
    }

    # Secondary (realignment-based) filtering runs iff an index bundle was provided.
    if (defined(realignment_index_bundle)) {
        call FilterAlignmentArtifacts {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                reads = tumor_reads,
                reads_index = tumor_reads_index,
                realignment_index_bundle = select_first([realignment_index_bundle]),
                realignment_extra_args = realignment_extra_args,
                compress_vcfs = compress_vcfs,
                input_vcf = Filter.filtered_vcf,
                input_vcf_idx = Filter.filtered_vcf_idx,
                runtime_params = standard_runtime,
                mem_mibs = filter_alignment_artifacts_mem
        }
    }

    output {
        # Prefer the realignment-filtered VCF when that optional step ran.
        File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf])
        File filtered_vcf_idx = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx])
        File filtering_stats = Filter.filtering_stats
        File mutect_stats = MergeStats.merged_stats
        File? contamination_table = CalculateContamination.contamination_table
        File? bamout = MergeBamOuts.merged_bam_out
        File? bamout_index = MergeBamOuts.merged_bam_out_index
        File? maf_segments = CalculateContamination.maf_segments
        File? read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table
        File? m3_dataset = Concatenate.concatenated
    }
}

# Splits the calling intervals (or the whole reference if none given) into
# scatter_count interval_list shards for parallel processing.
task SplitIntervals {
    input {
        File? intervals
        File ref_fasta
        File ref_fai
        File ref_dict
        Int scatter_count
        String? split_intervals_extra_args

        # runtime
        Runtime runtime_params
    }

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command {
        echo SplitIntervals
        set -ex
        mkdir interval-files
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" SplitIntervals \
            -R ~{ref_fasta} \
            ~{"-L " + intervals} \
            -scatter ~{scatter_count} \
            -O interval-files \
            ~{split_intervals_extra_args}
        # copy to the working directory so the glob() below can find the shards
        cp interval-files/*.interval_list .
        ~{disk_usage_cmd}
    }

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{runtime_params.machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        Array[File] interval_files = glob("*.interval_list")
    }
}

# Runs Mutect2 on one interval shard, plus GetPileupSummaries for contamination
# estimation when a common-variant resource is supplied. Sample names are read
# from the BAM headers via GetSampleName.
task M2 {
    input {
        File? intervals
        File ref_fasta
        File ref_fai
        File ref_dict
        File tumor_reads
        File tumor_reads_index
        File normal_reads
        File normal_reads_index
        File? pon
        File? pon_idx
        File? gnomad
        File? gnomad_idx
        String? m2_extra_args
        String? getpileupsummaries_extra_args
        Boolean? make_bamout
        Boolean? run_ob_filter
        Boolean compress_vcfs
        File? gga_vcf
        File? gga_vcf_idx
        File? variants_for_contamination
        File? variants_for_contamination_idx
        Boolean make_m3_test_dataset = false
        File? m3_training_dataset_truth_vcf
        File? m3_training_dataset_truth_vcf_idx

        # runtime
        String gatk_docker
        Int mem = 16384    # MiB
        Int? cpu
    }

    String output_vcf = "output" + if compress_vcfs then ".vcf.gz" else ".vcf"
    String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx"
    String output_stats = output_vcf + ".stats"

    # NOTE(review): mem is already in MiB here (runtime below uses "~{mem} MiB");
    # 512 MiB is reserved outside the JVM heap.
    Int command_mem = mem - 512

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command <<<
        echo M2
        set -x

        # We need to create these files regardless, even if they stay empty,
        # so the declared outputs always exist.
        touch bamout.bam
        touch f1r2.tar.gz
        touch dataset.txt
        echo "" > normal_name.txt

        /gatk/gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_reads} -O tumor_name.txt
        /gatk/gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_reads} -O normal_name.txt

        # NOTE(review): make_m3_test_dataset enables --mutect3-training-mode below —
        # confirm that pairing is intentional.
        /gatk/gatk --java-options "-Xmx~{command_mem}m" Mutect2 \
            -R ~{ref_fasta} \
            ~{"-I " + tumor_reads} --tumor "$(cat tumor_name.txt)" \
            ~{"-I " + normal_reads} --normal "$(cat normal_name.txt)" \
            ~{"--germline-resource " + gnomad} \
            ~{"-pon " + pon} \
            ~{"-L " + intervals} \
            ~{"--alleles " + gga_vcf} \
            -O "~{output_vcf}" \
            ~{if select_first([make_bamout, false]) then '--bam-output bamout.bam' else '' } \
            ~{if select_first([run_ob_filter, false]) then '--f1r2-tar-gz f1r2.tar.gz' else ''} \
            ~{if make_m3_test_dataset then '--mutect3-dataset dataset.txt --mutect3-training-mode' else ''} \
            ~{"--mutect3-training-truth " + m3_training_dataset_truth_vcf} \
            ~{m2_extra_args}
        m2_exit_code=$?

        ### GetPileupSummaries
        # If the variants for contamination and the intervals for this scatter don't intersect, GetPileupSummaries
        # throws an error. However, there is nothing wrong with an empty intersection for our purposes; it simply doesn't
        # contribute to the merged pileup summaries that we create downstream. We implement this with array outputs.
        # If the tool errors, no table is created and the glob yields an empty array.
        set +e

        if [[ -n "~{variants_for_contamination}" ]]; then
            /gatk/gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{tumor_reads} ~{"--interval-set-rule INTERSECTION -L " + intervals} \
                -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O tumor-pileups.table ~{getpileupsummaries_extra_args}

            if [[ -n "~{normal_reads}" ]]; then
                /gatk/gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{normal_reads} ~{"--interval-set-rule INTERSECTION -L " + intervals} \
                    -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O normal-pileups.table ~{getpileupsummaries_extra_args}
            fi
        fi
        ~{disk_usage_cmd}
        # the script only fails if Mutect2 itself fails
        exit $m2_exit_code
    >>>

    runtime {
        container: gatk_docker
        memory: "~{mem} MiB"
        cpu: select_first([cpu, 4])
    }

    output {
        File unfiltered_vcf = "~{output_vcf}"
        File unfiltered_vcf_idx = "~{output_vcf_idx}"
        File output_bamOut = "bamout.bam"
        String tumor_sample = read_string("tumor_name.txt")
        String normal_sample = read_string("normal_name.txt")
        File stats = "~{output_stats}"
        File f1r2_counts = "f1r2.tar.gz"
        # Array outputs: empty when GetPileupSummaries did not run or errored (see above).
        Array[File] tumor_pileups = glob("*tumor-pileups.table")
        Array[File] normal_pileups = glob("*normal-pileups.table")
        File m3_dataset = "dataset.txt"
    }
}

# Merges per-shard VCFs into a single indexed VCF.
task MergeVCFs {
    input {
        Array[File] input_vcfs
        Array[File] input_vcf_indices
        Boolean compress_vcfs
        Runtime runtime_params
    }

    String output_vcf = if compress_vcfs then "merged.vcf.gz" else "merged.vcf"
    String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx"
    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    # using MergeVcfs instead of GatherVcfs so we can create indices
    command <<<
        set -ex
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" MergeVcfs -I ~{sep(' -I ', input_vcfs)} -O ~{output_vcf}
        ~{disk_usage_cmd}
    >>>

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{runtime_params.machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        File merged_vcf = "~{output_vcf}"
        File merged_vcf_idx = "~{output_vcf_idx}"
    }
}

# Gathers, coordinate-sorts, and indexes the per-shard bamout files.
task MergeBamOuts {
    input {
        File ref_fasta
        File ref_fai
        File ref_dict
        Array[File]+ bam_outs
        Runtime runtime_params
    }

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command <<<
        # This command block assumes that there is at least one file in bam_outs.
        # Do not call this task if len(bam_outs) == 0
        set -ex
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" GatherBamFiles \
            -I ~{sep(" -I ", bam_outs)} -O unsorted.out.bam -R ~{ref_fasta}

        # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence
        # overlapping bamouts
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" SortSam -I unsorted.out.bam \
            -O bamout.bam --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" BuildBamIndex -I bamout.bam -VALIDATION_STRINGENCY LENIENT
        ~{disk_usage_cmd}
    >>>

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{runtime_params.machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        File merged_bam_out = "bamout.bam"
        File merged_bam_out_index = "bamout.bai"
    }
}

# Merges per-shard Mutect2 stats files (needed by FilterMutectCalls).
task MergeStats {
    input {
        Array[File]+ stats
        Runtime runtime_params
    }

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command {
        set -ex
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" MergeMutectStats \
            -stats ~{sep(" -stats ", stats)} -O merged.stats
        ~{disk_usage_cmd}
    }

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{runtime_params.machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        File merged_stats = "merged.stats"
    }
}

# Concatenates per-shard pileup summary tables in dictionary order.
task MergePileupSummaries {
    input {
        Array[File] input_tables
        String output_name
        File ref_dict
        Runtime runtime_params
    }

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command {
        set -ex
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" GatherPileupSummaries \
            --sequence-dictionary ~{ref_dict} \
            -I ~{sep(" -I ", input_tables)} \
            -O ~{output_name}.tsv
        ~{disk_usage_cmd}
    }

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{runtime_params.machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        File merged_table = "~{output_name}.tsv"
    }
}

# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018
task LearnReadOrientationModel {
    input {
        Array[File] f1r2_tar_gz
        Runtime runtime_params
        Int? mem    # override memory (MiB)
    }

    Int machine_mem = select_first([mem, runtime_params.machine_mem])
    Int command_mem = machine_mem - 1000

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command {
        set -ex
        /gatk/gatk --java-options "-Xmx~{command_mem}m" LearnReadOrientationModel \
            -I ~{sep(" -I ", f1r2_tar_gz)} \
            -O "artifact-priors.tar.gz"
        ~{disk_usage_cmd}
    }

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        File artifact_prior_table = "artifact-priors.tar.gz"
    }
}

# Estimates cross-sample contamination (and tumor segmentation) from pileup summaries.
task CalculateContamination {
    input {
        File tumor_pileups
        File? normal_pileups
        Runtime runtime_params
    }

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command {
        set -ex
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" CalculateContamination -I ~{tumor_pileups} \
            -O contamination.table --tumor-segmentation segments.table ~{"-matched " + normal_pileups}
        ~{disk_usage_cmd}
    }

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{runtime_params.machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        File contamination_table = "contamination.table"
        File maf_segments = "segments.table"
    }
}

# Runs FilterMutectCalls on the merged VCF; all resource inputs are optional
# and omitted from the command line when absent.
task Filter {
    input {
        File ref_fasta
        File ref_fai
        File ref_dict
        File unfiltered_vcf
        File unfiltered_vcf_idx
        Boolean compress_vcfs
        File? mutect_stats
        File? artifact_priors_tar_gz
        File? contamination_table
        File? maf_segments
        String? m2_extra_filtering_args
        Runtime runtime_params
    }

    String output_vcf = if compress_vcfs then "filtered.vcf.gz" else "filtered.vcf"
    String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx"
    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command {
        set -ex
        /gatk/gatk --java-options "-Xmx~{runtime_params.command_mem}m" FilterMutectCalls -V ~{unfiltered_vcf} \
            -R ~{ref_fasta} \
            -O ~{output_vcf} \
            ~{"--contamination-table " + contamination_table} \
            ~{"--tumor-segmentation " + maf_segments} \
            ~{"--ob-priors " + artifact_priors_tar_gz} \
            ~{"-stats " + mutect_stats} \
            --filtering-stats filtering.stats \
            ~{m2_extra_filtering_args}
        ~{disk_usage_cmd}
    }

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{runtime_params.machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        File filtered_vcf = "~{output_vcf}"
        File filtered_vcf_idx = "~{output_vcf_idx}"
        File filtering_stats = "filtering.stats"
    }
}

# Optional secondary filter: realigns candidate-supporting reads against a
# BWA-MEM index image and filters likely alignment artifacts.
task FilterAlignmentArtifacts {
    input {
        File ref_fasta
        File ref_fai
        File ref_dict
        File input_vcf
        File input_vcf_idx
        File reads
        File reads_index
        Boolean compress_vcfs
        File realignment_index_bundle
        String? realignment_extra_args
        Runtime runtime_params
        Int mem_mibs
    }

    String output_vcf = if compress_vcfs then "filtered.vcf.gz" else "filtered.vcf"
    String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx"

    Int machine_mem = mem_mibs
    Int command_mem = machine_mem - 500    # MiB headroom outside the JVM heap

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command {
        set -ex
        /gatk/gatk --java-options "-Xmx~{command_mem}m" FilterAlignmentArtifacts \
            -R ~{ref_fasta} \
            -V ~{input_vcf} \
            -I ~{reads} \
            --bwa-mem-index-image ~{realignment_index_bundle} \
            ~{realignment_extra_args} \
            -O ~{output_vcf}
        ~{disk_usage_cmd}
    }

    runtime {
        container: runtime_params.gatk_docker
        memory: "~{machine_mem} MiB"
        cpu: runtime_params.cpu
    }

    output {
        File filtered_vcf = "~{output_vcf}"
        File filtered_vcf_idx = "~{output_vcf_idx}"
    }
}

# Plain concatenation of the per-shard Mutect3 dataset text fragments.
task Concatenate {
    input {
        Array[File] input_files
        Int mem_mibs = 7168
        String gatk_docker
    }

    String disk_usage_cmd = "echo storage remaining: $(df -Ph . | awk 'NR==2 {print $4}')"

    command {
        set -ex
        cat ~{sep(' ', input_files)} > output.txt
        ~{disk_usage_cmd}
    }

    runtime {
        container: gatk_docker
        memory: "~{mem_mibs} MiB"
        cpu: 2
    }

    output {
        File concatenated = "output.txt"
    }
}