From a438df8eaa3634f9fb80b17c474e260c43c9f098 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 27 Feb 2024 19:57:20 -0500 Subject: [PATCH 01/92] wip add beagle imputation stuff --- .../CreateImputationRefPanelBeagle.wdl | 45 +++ .../imputation_beagle/ImputationBeagle.wdl | 366 ++++++++++++++++++ tasks/broad/ImputationTasks.wdl | 76 ++++ 3 files changed, 487 insertions(+) create mode 100644 pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl create mode 100644 pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl new file mode 100644 index 0000000000..06eeede76b --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl @@ -0,0 +1,45 @@ +version 1.0 + +# This script is under review. It is not actively tested or maintained at this time. +workflow CreateImputationRefPanelBeagle { + input { + Array[File] ref_vcf + Int disk_size + } + + scatter (idx in range(length(ref_vcf))) { + call BuildBref3 { + input: + vcf = ref_vcf[idx], + disk_size = disk_size + } + } + + output { + Array[File] out_bref3 = BuildBref3.out_bref3 + } +} + +task BuildBref3 { + input { + File vcf + Int disk_size + } + + String name = basename(vcf, ".vcf.gz") + + command <<< + java -jar bref3.22Jul22.46e.jar ~{vcf} > ~{name}.bref3 + >>> + + runtime { + docker: "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development:0.0.1-22Jul22.46e-wip-temp-20240227" + memory: "256 GB" + cpu: 4 + disks: "local-disk " + disk_size + " HDD" + } + + output { + File out_bref3 = "~{name}.bref3" + } +} \ No newline at end of file diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl new file mode 100644 index 0000000000..6126b1f5d4 --- /dev/null +++ 
b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -0,0 +1,366 @@ +version 1.0 + +import "../../../../structs/imputation/ImputationStructs.wdl" as structs +import "../../../../tasks/broad/ImputationTasks.wdl" as tasks +import "../../../../tasks/broad/Utilities.wdl" as utils + +workflow ImputationBeagle { + + String pipeline_version = "0.0.1" + + input { + Int chunkLength = 25000000 + Int chunkOverlaps = 5000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects + + # You can either input a multisample VCF or an array of single sample VCFs + # The pipeline will just merge the single sample VCFs into one multisample VCF + # and then impute the multisample VCF + # If you want to run a single sample VCF, set the multi_sample_vcf input to the + # single sample VCF + File? multi_sample_vcf + File? multi_sample_vcf_index + Array[File]? single_sample_vcfs + Array[File]? single_sample_vcf_indices + + Boolean perform_extra_qc_steps = false # these are optional additional extra QC steps from Amit's group that should only be + # run for large sample sets, especially a diverse set of samples (it's further limiting called at sites to 95% and by HWE) + Float? optional_qc_max_missing + Float? 
optional_qc_hwe + File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths + Array[String] contigs + String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs + File genetic_maps_eagle + String output_callset_name # the output callset name + Boolean split_output_to_single_sample = false + Int merge_ssvcf_mem_mb = 3000 # the memory allocation for MergeSingleSampleVcfs (in mb) + + Float frac_above_maf_5_percent_well_imputed_threshold = 0.9 # require fraction of maf > 0.05 sites well imputed to be greater than this to pass + Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass + + # file extensions used to find reference panel files + String vcf_suffix = ".vcf.gz" + String vcf_index_suffix = ".vcf.gz.tbi" + String bcf_suffix = ".bcf" + String bcf_index_suffix = ".bcf.csi" + String m3vcf_suffix = ".cleaned.m3vcf.gz" + } + + if (defined(single_sample_vcfs) && defined(multi_sample_vcf)) { + call utils.ErrorWithMessage as ErrorMessageDoubleInput{ + input: + message = "single_sample_vcfs and multi_sample_vcf cannot both be defined as input" + } + } + + if (!defined(single_sample_vcfs) && !defined(multi_sample_vcf)) { + call utils.ErrorWithMessage as ErrorMessageNoInput { + input: + message = "One (and only one) of single_sample_vcfs and multi_sample_vcf must be defined as input" + } + } + + if (defined(single_sample_vcfs)) { + call tasks.MergeSingleSampleVcfs { + input: + input_vcfs = select_first([single_sample_vcfs]), + input_vcf_indices = select_first([single_sample_vcf_indices]), + output_vcf_basename = "merged_input_samples", + memory_mb = merge_ssvcf_mem_mb + } + } + + File vcf_to_impute = select_first([multi_sample_vcf, MergeSingleSampleVcfs.output_vcf]) + File vcf_index_to_impute = select_first([multi_sample_vcf_index, MergeSingleSampleVcfs.output_vcf_index]) + + call tasks.CountSamples { + input: + vcf = 
vcf_to_impute, + } + + Float chunkLengthFloat = chunkLength + + scatter (contig in contigs) { + + String reference_filename = reference_panel_path + "ALL.chr" + contig + ".phase3_integrated.20130502.genotypes.cleaned" + + ReferencePanelContig referencePanelContig = { + "vcf": reference_filename + vcf_suffix, + "vcf_index": reference_filename + vcf_index_suffix, + "bcf": reference_filename + bcf_suffix, + "bcf_index": reference_filename + bcf_index_suffix, + "m3vcf": reference_filename + m3vcf_suffix, + "contig": contig + } + + call tasks.CalculateChromosomeLength { + input: + ref_dict = ref_dict, + chrom = referencePanelContig.contig + } + + Int num_chunks = ceil(CalculateChromosomeLength.chrom_length / chunkLengthFloat) + + scatter (i in range(num_chunks)) { + String chunk_contig = referencePanelContig.contig + Int start = (i * chunkLength) + 1 + Int startWithOverlaps = if (start - chunkOverlaps < 1) then 1 else start - chunkOverlaps + Int end = if (CalculateChromosomeLength.chrom_length < ((i + 1) * chunkLength)) then CalculateChromosomeLength.chrom_length else ((i + 1) * chunkLength) + Int endWithOverlaps = if (CalculateChromosomeLength.chrom_length < end + chunkOverlaps) then CalculateChromosomeLength.chrom_length else end + chunkOverlaps + + call tasks.GenerateChunk { + input: + vcf = vcf_to_impute, + vcf_index = vcf_index_to_impute, + start = startWithOverlaps, + end = endWithOverlaps, + chrom = referencePanelContig.contig, + basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i + } + + if (perform_extra_qc_steps) { + call tasks.OptionalQCSites { + input: + input_vcf = GenerateChunk.output_vcf, + input_vcf_index = GenerateChunk.output_vcf_index, + output_vcf_basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i, + optional_qc_max_missing = optional_qc_max_missing, + optional_qc_hwe = optional_qc_hwe + } + } + + call tasks.CountVariantsInChunks { + input: + vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), + 
vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]), + panel_vcf = referencePanelContig.vcf, + panel_vcf_index = referencePanelContig.vcf_index + } + call tasks.CheckChunksBeagle { + input: + vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), + vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]), + panel_vcf = referencePanelContig.vcf, + panel_vcf_index = referencePanelContig.vcf_index, + var_in_original = CountVariantsInChunks.var_in_original, + var_in_reference = CountVariantsInChunks.var_in_reference + } + + call tasks.SubsetVcfToRegion { + input: + vcf = vcf_to_impute, + vcf_index = vcf_index_to_impute, + output_basename = "input_samples_subset_to_chunk", + contig = referencePanelContig.contig, + start = start, + end = end + } + + call tasks.SetIDs as SetIdsVcfToImpute { + input: + vcf = SubsetVcfToRegion.output_vcf, + output_basename = "input_samples_with_variant_ids" + } + + call tasks.ExtractIDs as ExtractIdsVcfToImpute { + input: + vcf = SetIdsVcfToImpute.output_vcf, + output_basename = "imputed_sites" + } + + if (CheckChunksBeagle.valid) { + call tasks.PhaseAndImputeBeagle { + input: + dataset_vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), + ref_panel_bref3 = referencePanelContig.bcf, + chrom = referencePanelContig.contig, + genetic_map_file = genetic_maps_eagle, + start = startWithOverlaps, + end = endWithOverlaps + } + + call tasks.UpdateHeader { + input: + vcf = PhaseAndImputeBeagle.vcf, + vcf_index = PhaseAndImputeBeagle.vcf_index, + ref_dict = ref_dict, + basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed" + } + + call tasks.SeparateMultiallelics { + input: + original_vcf = UpdateHeader.output_vcf, + original_vcf_index = UpdateHeader.output_vcf_index, + output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed" + } + + call tasks.RemoveSymbolicAlleles { + input: + 
original_vcf = SeparateMultiallelics.output_vcf, + original_vcf_index = SeparateMultiallelics.output_vcf_index, + output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed" + } + + call tasks.SetIDs { + input: + vcf = RemoveSymbolicAlleles.output_vcf, + output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed" + } + + call tasks.ExtractIDs { + input: + vcf = SetIDs.output_vcf, + output_basename = "imputed_sites" + } + } + call tasks.FindSitesUniqueToFileTwoOnly { + input: + file1 = select_first([ExtractIDs.ids, write_lines([])]), + file2 = ExtractIdsVcfToImpute.ids + } + + call tasks.SelectVariantsByIds { + input: + vcf = SetIdsVcfToImpute.output_vcf, + ids = FindSitesUniqueToFileTwoOnly.missing_sites, + basename = "imputed_sites_to_recover" + } + + call tasks.RemoveAnnotations { + input: + vcf = SelectVariantsByIds.output_vcf, + basename = "imputed_sites_to_recover_annotations_removed" + } + + call tasks.InterleaveVariants { + input: + vcfs = select_all([RemoveAnnotations.output_vcf, SetIDs.output_vcf]), + basename = output_callset_name + } + } + # Array[File] aggregatedImputationMetrics = select_all(AggregateImputationQCMetrics.aggregated_metrics) + Array[File] chromosome_vcfs = select_all(InterleaveVariants.output_vcf) + } + + Array[String] phased_vcfs = flatten(chromosome_vcfs) + + call tasks.GetMissingContigList { + input: + ref_dict = ref_dict, + included_contigs = write_lines(contigs) + } + + scatter (missing_contig in GetMissingContigList.missing_contigs) { + call tasks.CalculateChromosomeLength as CalculateMissingChromosomeLength { + input: + ref_dict = ref_dict, + chrom = missing_contig + } + + Int num_chunks_missing_contig = ceil(CalculateMissingChromosomeLength.chrom_length / chunkLengthFloat) + + scatter (i_missing_contig in range(num_chunks_missing_contig)) { + Int start_missing_contig = (i_missing_contig * chunkLength) + 1 + Int end_missing_contig = if (CalculateMissingChromosomeLength.chrom_length < 
((i_missing_contig + 1) * chunkLength)) then CalculateMissingChromosomeLength.chrom_length else ((i_missing_contig + 1) * chunkLength) + + call tasks.SubsetVcfToRegion as SubsetVcfToRegionMissingContig{ + input: + vcf = vcf_to_impute, + vcf_index = vcf_index_to_impute, + output_basename = "input_samples_subset_to_chunk", + contig = missing_contig, + start = start_missing_contig, + end = end_missing_contig, + exclude_filtered = true + } + + call tasks.SetIDs as SetIDsMissingContigs { + input: + vcf = SubsetVcfToRegionMissingContig.output_vcf, + output_basename = "unimputed_contigs_" + missing_contig +"_"+ i_missing_contig + "_with_ids" + } + + call tasks.RemoveAnnotations as RemoveAnnotationsMissingContigs { + input: + vcf = SetIDsMissingContigs.output_vcf, + basename = "unimputed_contigs_" + missing_contig +"_"+ i_missing_contig + "_annotations_removed" + } + } + } + + Array[String] missing_remove_annotation_vcfs = flatten(RemoveAnnotationsMissingContigs.output_vcf) + + scatter(missing_remove_annotation_vcf in missing_remove_annotation_vcfs){ + call tasks.ReplaceHeader { + input: + vcf_to_replace_header = missing_remove_annotation_vcf, + vcf_with_new_header = phased_vcfs[0] + } + } + + Array[String] missing_contig_vcfs = ReplaceHeader.output_vcf + Array[String] unsorted_vcfs = flatten([phased_vcfs, missing_contig_vcfs]) + + call tasks.GatherVcfs { + input: + input_vcfs = unsorted_vcfs, + output_vcf_basename = output_callset_name + } + +# call tasks.MergeImputationQCMetrics { +# input: +# metrics = flatten(aggregatedImputationMetrics), +# basename = output_callset_name +# } + +# if (MergeImputationQCMetrics.frac_above_maf_5_percent_well_imputed < frac_above_maf_5_percent_well_imputed_threshold) { +# call utils.ErrorWithMessage as FailQCWellImputedFrac { +# input: +# message = "Well imputed fraction was " + MergeImputationQCMetrics.frac_above_maf_5_percent_well_imputed + ", QC failure threshold was set at " + frac_above_maf_5_percent_well_imputed_threshold +# } +# } 
+ + call tasks.StoreChunksInfo { + input: + chroms = flatten(chunk_contig), + starts = flatten(start), + ends = flatten(end), + vars_in_array = flatten(CountVariantsInChunks.var_in_original), + vars_in_panel = flatten(CountVariantsInChunks.var_in_reference), + valids = flatten(CheckChunksBeagle.valid), + basename = output_callset_name + } + + Int n_failed_chunks_int = read_int(StoreChunksInfo.n_failed_chunks) + + if (n_failed_chunks_int >= chunks_fail_threshold) { + call utils.ErrorWithMessage as FailQCNChunks { + input: + message = n_failed_chunks_int + " chunks failed imputation, QC threshold was set to " + chunks_fail_threshold + } + } + + if (split_output_to_single_sample) { + call tasks.SplitMultiSampleVcf { + input: + multiSampleVcf = GatherVcfs.output_vcf, + nSamples = CountSamples.nSamples + } + } + + + output { + Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs + Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices + File imputed_multisample_vcf = GatherVcfs.output_vcf + File imputed_multisample_vcf_index = GatherVcfs.output_vcf_index + # File aggregated_imputation_metrics = MergeImputationQCMetrics.aggregated_metrics + File chunks_info = StoreChunksInfo.chunks_info + File failed_chunks = StoreChunksInfo.failed_chunks + File n_failed_chunks = StoreChunksInfo.n_failed_chunks + } + + meta { + allowNestedInputs: true + } + +} diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 793ae119b2..a653b28bd3 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -261,6 +261,82 @@ task Minimac4 { } } +task CheckChunksBeagle { + input { + File vcf + File vcf_index + File panel_vcf + File panel_vcf_index + Int var_in_original + Int var_in_reference + + Int disk_size_gb = ceil(2*size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB")) + String bcftools_docker = 
"us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" + Int cpu = 1 + Int memory_mb = 4000 + } + command <<< + set -e -o pipefail + + if [ $(( ~{var_in_reference} * 2 - ~{var_in_original})) -gt 0 ] && [ ~{var_in_reference} -gt 3 ]; then + echo true > valid_file.txt + else + echo false > valid_file.txt + fi + >>> + output { + Boolean valid = read_boolean("valid_file.txt") + } + runtime { + docker: bcftools_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } +} + +task PhaseAndImputeBeagle { + input { + File dataset_vcf + File ref_panel_bref3 + File genetic_map_file + String chrom # not needed if ref file has been chunked and you are using the entire chunk + Int start # not needed if ref file has been chunked and you are using the entire chunk + Int end # not needed if ref file has been chunked and you are using the entire chunk + + String beagle_docker = "us.gcr.io/broad-gotc-dev/imputation-beagle:0.0.1-22Jul22.46e-wip-temp-20240227" + Int cpu = 8 # This parameter can be higher or lower + Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed + Int xmx_mb = 29000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter + Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 50 # value may need to be adjusted + } + command <<< + set -e -o pipefail + + java -ea -jar -Xmx~{xmx_mb}m \ + bref3.22Jul22.46e.jar \ + gt=~{dataset_vcf} \ + ref=~{ref_panel_bref3} \ + map=~{genetic_map_file} \ + out=imputed_~{chrom} \ # rename output file to "phased_{chrom}" if phasing without imputing + chrom=~{chrom}:~{start}-~{end} \ # not needed if ref and targ files have been chunked and you are using the entire chunk + impute=true \ # set impute=false if you wish to phase without imputing ungenotyped markers + nthreads=~{cpu} + + bcftools index -t imputed_~{chrom}.vcf.gz + >>> + output { + File vcf = 
"imputed_~{chrom}.vcf.gz" + File vcf_index = "imputed_~{chrom}.vcf.gz.tbi" + } + runtime { + docker: beagle_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } +} + task GatherVcfs { input { Array[File] input_vcfs From 737599f9c83dadc58b58623d3cac493c8d93a532 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 27 Feb 2024 19:59:09 -0500 Subject: [PATCH 02/92] add 2 wdls to dockstore.yml --- .dockstore.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.dockstore.yml b/.dockstore.yml index 9ab1966238..08eff44e17 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -83,6 +83,14 @@ workflows: subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation/Imputation.wdl + - name: ImputationBeagle + subclass: WDL + primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl + + - name: CreateImputationRefPanelBeagle + subclass: WDL + primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl + - name: RNAWithUMIsPipeline subclass: WDL primaryDescriptorPath: /pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl From 134150da4cadf6118da415eb65514be616ef9402 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 27 Feb 2024 20:31:29 -0500 Subject: [PATCH 03/92] fix docker gar url --- .../arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl | 2 +- tasks/broad/ImputationTasks.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl index 06eeede76b..64026e5e25 100644 --- a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl @@ -33,7 +33,7 @@ task BuildBref3 { >>> runtime { - docker: 
"us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development:0.0.1-22Jul22.46e-wip-temp-20240227" + docker: "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-22Jul22.46e-wip-temp-20240227" memory: "256 GB" cpu: 4 disks: "local-disk " + disk_size + " HDD" diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index a653b28bd3..3b79e04d53 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -304,7 +304,7 @@ task PhaseAndImputeBeagle { Int start # not needed if ref file has been chunked and you are using the entire chunk Int end # not needed if ref file has been chunked and you are using the entire chunk - String beagle_docker = "us.gcr.io/broad-gotc-dev/imputation-beagle:0.0.1-22Jul22.46e-wip-temp-20240227" + String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-22Jul22.46e-wip-temp-20240227" Int cpu = 8 # This parameter can be higher or lower Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed Int xmx_mb = 29000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter From a307f9398e2d4d902383ce0f22f7c1293baed307 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 27 Feb 2024 20:36:49 -0500 Subject: [PATCH 04/92] use the right path for jars --- .../arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl index 64026e5e25..c1a14190c4 100644 --- a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl @@ -29,7 +29,7 @@ task BuildBref3 { String name = 
basename(vcf, ".vcf.gz") command <<< - java -jar bref3.22Jul22.46e.jar ~{vcf} > ~{name}.bref3 + java -jar /usr/gitc/bref3.22Jul22.46e.jar ~{vcf} > ~{name}.bref3 >>> runtime { From 658d5b6c1012b8ab941e0a5066a5e1676e0d2976 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 28 Feb 2024 12:07:57 -0500 Subject: [PATCH 05/92] wip on imputation wdl --- .../imputation_beagle/ImputationBeagle.wdl | 29 +++++++++++-------- tasks/broad/ImputationTasks.wdl | 2 +- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 6126b1f5d4..db9a940fb5 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -1,6 +1,5 @@ version 1.0 -import "../../../../structs/imputation/ImputationStructs.wdl" as structs import "../../../../tasks/broad/ImputationTasks.wdl" as tasks import "../../../../tasks/broad/Utilities.wdl" as utils @@ -28,8 +27,8 @@ workflow ImputationBeagle { Float? 
optional_qc_hwe File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths Array[String] contigs - String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs - File genetic_maps_eagle + String reference_panel_path = "gs://morgan-imputation-development/1000G-ref-panel/hg19/" # path to the bucket where the reference panel files are stored for all contigs + String genetic_maps_path = "gs://morgan-imputation-development/plink-genetic-maps/GRCh37/" # path to the bucket where genetic maps are stored for all contigs String output_callset_name # the output callset name Boolean split_output_to_single_sample = false Int merge_ssvcf_mem_mb = 3000 # the memory allocation for MergeSingleSampleVcfs (in mb) @@ -40,9 +39,7 @@ workflow ImputationBeagle { # file extensions used to find reference panel files String vcf_suffix = ".vcf.gz" String vcf_index_suffix = ".vcf.gz.tbi" - String bcf_suffix = ".bcf" - String bcf_index_suffix = ".bcf.csi" - String m3vcf_suffix = ".cleaned.m3vcf.gz" + String bref3_suffix = ".bref3" } if (defined(single_sample_vcfs) && defined(multi_sample_vcf)) { @@ -82,14 +79,14 @@ workflow ImputationBeagle { scatter (contig in contigs) { String reference_filename = reference_panel_path + "ALL.chr" + contig + ".phase3_integrated.20130502.genotypes.cleaned" + String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh37.map" ReferencePanelContig referencePanelContig = { "vcf": reference_filename + vcf_suffix, "vcf_index": reference_filename + vcf_index_suffix, - "bcf": reference_filename + bcf_suffix, - "bcf_index": reference_filename + bcf_index_suffix, - "m3vcf": reference_filename + m3vcf_suffix, - "contig": contig + "bref3": reference_filename + bref3_suffix, + "contig": contig, + "genetic_map": genetic_map_filename } call tasks.CalculateChromosomeLength { @@ -171,9 +168,9 @@ workflow ImputationBeagle { call 
tasks.PhaseAndImputeBeagle { input: dataset_vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), - ref_panel_bref3 = referencePanelContig.bcf, + ref_panel_bref3 = referencePanelContig.bref3, chrom = referencePanelContig.contig, - genetic_map_file = genetic_maps_eagle, + genetic_map_file = referencePanelContig.genetic_map, start = startWithOverlaps, end = endWithOverlaps } @@ -364,3 +361,11 @@ workflow ImputationBeagle { } } + +struct ReferencePanelContig { + File vcf + File vcf_index + File bref3 + String contig + File genetic_map +} \ No newline at end of file diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 3b79e04d53..59ac505107 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -314,7 +314,7 @@ task PhaseAndImputeBeagle { set -e -o pipefail java -ea -jar -Xmx~{xmx_mb}m \ - bref3.22Jul22.46e.jar \ + /usr/gitc/bref3.22Jul22.46e.jar \ gt=~{dataset_vcf} \ ref=~{ref_panel_bref3} \ map=~{genetic_map_file} \ From 8e479e043da5ab72599ef823af0087547af93aae Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 28 Feb 2024 13:15:56 -0500 Subject: [PATCH 06/92] oops use correct jar --- tasks/broad/ImputationTasks.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 59ac505107..665772afe5 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -314,7 +314,7 @@ task PhaseAndImputeBeagle { set -e -o pipefail java -ea -jar -Xmx~{xmx_mb}m \ - /usr/gitc/bref3.22Jul22.46e.jar \ + /usr/gitc/beagle.22Jul22.46e.jar \ gt=~{dataset_vcf} \ ref=~{ref_panel_bref3} \ map=~{genetic_map_file} \ From 10ab3314f887452012f182a87cc59eeccd458cb4 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 28 Feb 2024 13:28:26 -0500 Subject: [PATCH 07/92] missing equals --- tasks/broad/ImputationTasks.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 665772afe5..9b3cb45675 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -313,7 +313,7 @@ task PhaseAndImputeBeagle { command <<< set -e -o pipefail - java -ea -jar -Xmx~{xmx_mb}m \ + java -ea -jar -Xmx=~{xmx_mb}m \ /usr/gitc/beagle.22Jul22.46e.jar \ gt=~{dataset_vcf} \ ref=~{ref_panel_bref3} \ From 1e1bebbbbcfdb4bcabea178f0de534dd34206f5a Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 28 Feb 2024 13:43:25 -0500 Subject: [PATCH 08/92] fix java call again --- tasks/broad/ImputationTasks.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 9b3cb45675..44268c7682 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -313,8 +313,8 @@ task PhaseAndImputeBeagle { command <<< set -e -o pipefail - java -ea -jar -Xmx=~{xmx_mb}m \ - /usr/gitc/beagle.22Jul22.46e.jar \ + java -ea -Xmx~{xmx_mb}m \ + -jar /usr/gitc/beagle.22Jul22.46e.jar \ gt=~{dataset_vcf} \ ref=~{ref_panel_bref3} \ map=~{genetic_map_file} \ From 6b4d26c084264d9e96da26fb648cce612d386919 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 29 Feb 2024 16:11:10 -0500 Subject: [PATCH 09/92] fix java call --- .../arrays/imputation_beagle/ImputationBeagle.wdl | 14 ++++++++------ tasks/broad/ImputationTasks.wdl | 13 ++++++++++--- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index db9a940fb5..9446ee2af0 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -103,6 +103,7 @@ workflow ImputationBeagle { Int startWithOverlaps = if (start - chunkOverlaps < 1) then 1 else start - chunkOverlaps Int end = if (CalculateChromosomeLength.chrom_length < 
((i + 1) * chunkLength)) then CalculateChromosomeLength.chrom_length else ((i + 1) * chunkLength) Int endWithOverlaps = if (CalculateChromosomeLength.chrom_length < end + chunkOverlaps) then CalculateChromosomeLength.chrom_length else end + chunkOverlaps + String chunk_basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i call tasks.GenerateChunk { input: @@ -111,7 +112,7 @@ workflow ImputationBeagle { start = startWithOverlaps, end = endWithOverlaps, chrom = referencePanelContig.contig, - basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i + basename = chunk_basename } if (perform_extra_qc_steps) { @@ -119,7 +120,7 @@ workflow ImputationBeagle { input: input_vcf = GenerateChunk.output_vcf, input_vcf_index = GenerateChunk.output_vcf_index, - output_vcf_basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i, + output_vcf_basename = chunk_basename, optional_qc_max_missing = optional_qc_max_missing, optional_qc_hwe = optional_qc_hwe } @@ -170,6 +171,7 @@ workflow ImputationBeagle { dataset_vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), ref_panel_bref3 = referencePanelContig.bref3, chrom = referencePanelContig.contig, + basename = chunk_basename, genetic_map_file = referencePanelContig.genetic_map, start = startWithOverlaps, end = endWithOverlaps @@ -180,27 +182,27 @@ workflow ImputationBeagle { vcf = PhaseAndImputeBeagle.vcf, vcf_index = PhaseAndImputeBeagle.vcf_index, ref_dict = ref_dict, - basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i +"_imputed" + basename = chunk_basename + "_imputed" } call tasks.SeparateMultiallelics { input: original_vcf = UpdateHeader.output_vcf, original_vcf_index = UpdateHeader.output_vcf_index, - output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed" + output_basename = chunk_basename + "_imputed" } call tasks.RemoveSymbolicAlleles { input: original_vcf = SeparateMultiallelics.output_vcf, original_vcf_index = 
SeparateMultiallelics.output_vcf_index, - output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed" + output_basename = chunk_basename + "_imputed" } call tasks.SetIDs { input: vcf = RemoveSymbolicAlleles.output_vcf, - output_basename = "chrom" + referencePanelContig.contig + "_chunk_" + i +"_imputed" + output_basename = chunk_basename + "_imputed" } call tasks.ExtractIDs { diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 44268c7682..77c9ed9506 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -300,6 +300,7 @@ task PhaseAndImputeBeagle { File dataset_vcf File ref_panel_bref3 File genetic_map_file + String basename String chrom # not needed if ref file has been chunked and you are using the entire chunk Int start # not needed if ref file has been chunked and you are using the entire chunk Int end # not needed if ref file has been chunked and you are using the entire chunk @@ -310,6 +311,7 @@ task PhaseAndImputeBeagle { Int xmx_mb = 29000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 50 # value may need to be adjusted } + command <<< set -e -o pipefail @@ -318,11 +320,16 @@ task PhaseAndImputeBeagle { gt=~{dataset_vcf} \ ref=~{ref_panel_bref3} \ map=~{genetic_map_file} \ - out=imputed_~{chrom} \ # rename output file to "phased_{chrom}" if phasing without imputing - chrom=~{chrom}:~{start}-~{end} \ # not needed if ref and targ files have been chunked and you are using the entire chunk - impute=true \ # set impute=false if you wish to phase without imputing ungenotyped markers + out=imputed_~{basename} \ + chrom=~{chrom}:~{start}-~{end} \ + impute=true \ nthreads=~{cpu} + # notes: + # rename output file to "phased_{chrom}" if phasing without imputing + # `chrom` not needed if ref and targ files have been chunked and you are using the entire chunk + # set impute=false if 
you wish to phase without imputing ungenotyped markers + bcftools index -t imputed_~{chrom}.vcf.gz >>> output { From d96c9bb5e9f9889c67a4c2d8425fc29a0ebcc91d Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 29 Feb 2024 16:37:08 -0500 Subject: [PATCH 10/92] oops match file names --- tasks/broad/ImputationTasks.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 77c9ed9506..70cbad106e 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -326,15 +326,15 @@ task PhaseAndImputeBeagle { nthreads=~{cpu} # notes: - # rename output file to "phased_{chrom}" if phasing without imputing + # rename output file to "phased_{basename}" if phasing without imputing # `chrom` not needed if ref and targ files have been chunked and you are using the entire chunk # set impute=false if you wish to phase without imputing ungenotyped markers - bcftools index -t imputed_~{chrom}.vcf.gz + bcftools index -t imputed_~{basename}.vcf.gz >>> output { - File vcf = "imputed_~{chrom}.vcf.gz" - File vcf_index = "imputed_~{chrom}.vcf.gz.tbi" + File vcf = "imputed_~{basename}.vcf.gz" + File vcf_index = "imputed_~{basename}.vcf.gz.tbi" } runtime { docker: beagle_docker From 6a2a5869f372b6856c4c26fdfec6e677e3db56b2 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 1 Mar 2024 16:44:56 -0500 Subject: [PATCH 11/92] update beagle jar to 01Mar24.d36 --- tasks/broad/ImputationTasks.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 70cbad106e..ba12fd0be8 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -305,7 +305,7 @@ task PhaseAndImputeBeagle { Int start # not needed if ref file has been chunked and you are using the entire chunk Int end # not needed if ref file has been chunked and you are using the entire chunk - String beagle_docker = 
"us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-22Jul22.46e-wip-temp-20240227" + String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" Int cpu = 8 # This parameter can be higher or lower Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed Int xmx_mb = 29000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter @@ -316,7 +316,7 @@ task PhaseAndImputeBeagle { set -e -o pipefail java -ea -Xmx~{xmx_mb}m \ - -jar /usr/gitc/beagle.22Jul22.46e.jar \ + -jar /usr/gitc/beagle.01Mar24.d36.jar \ gt=~{dataset_vcf} \ ref=~{ref_panel_bref3} \ map=~{genetic_map_file} \ From 71b49c444a3f2e9ba1c571eb8b5784ce275e1fdb Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 1 Mar 2024 19:24:11 -0500 Subject: [PATCH 12/92] debug GatherVcfs --- tasks/broad/ImputationTasks.wdl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index ba12fd0be8..4bb6023f24 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -335,6 +335,7 @@ task PhaseAndImputeBeagle { output { File vcf = "imputed_~{basename}.vcf.gz" File vcf_index = "imputed_~{basename}.vcf.gz.tbi" + File log = "imputed_~{basename}.log" } runtime { docker: beagle_docker @@ -366,8 +367,10 @@ task GatherVcfs { --REORDER_INPUT_BY_FIRST_VARIANT \ -O ~{output_vcf_basename}.vcf.gz - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - IndexFeatureFile -I ~{output_vcf_basename}.vcf.gz + # gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + # IndexFeatureFile -I ~{output_vcf_basename}.vcf.gz + + ls >>> runtime { @@ -378,7 +381,7 @@ task GatherVcfs { } output { File output_vcf = "~{output_vcf_basename}.vcf.gz" - File output_vcf_index = 
"~{output_vcf_basename}.vcf.gz.tbi" + # File output_vcf_index = "~{output_vcf_basename}.vcf.gz.tbi" } } From 406e83e3b750752027debf33c8569bddd6a3b718 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 1 Mar 2024 19:25:52 -0500 Subject: [PATCH 13/92] debug GatherVcfs 2 --- pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 9446ee2af0..2c95f1d59b 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -351,7 +351,7 @@ workflow ImputationBeagle { Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices File imputed_multisample_vcf = GatherVcfs.output_vcf - File imputed_multisample_vcf_index = GatherVcfs.output_vcf_index + # File imputed_multisample_vcf_index = GatherVcfs.output_vcf_index # File aggregated_imputation_metrics = MergeImputationQCMetrics.aggregated_metrics File chunks_info = StoreChunksInfo.chunks_info File failed_chunks = StoreChunksInfo.failed_chunks From 0f049ba19cd4e7caf21ea6c6a8b73c639491982b Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 1 Mar 2024 19:30:05 -0500 Subject: [PATCH 14/92] try to resolve missing file issue --- pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl | 2 +- tasks/broad/ImputationTasks.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 2c95f1d59b..fcf30e7437 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -351,7 +351,7 @@ workflow 
ImputationBeagle { Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices File imputed_multisample_vcf = GatherVcfs.output_vcf - # File imputed_multisample_vcf_index = GatherVcfs.output_vcf_index + File imputed_multisample_vcf_index = select_first([GatherVcfs.output_vcf_index]) # File aggregated_imputation_metrics = MergeImputationQCMetrics.aggregated_metrics File chunks_info = StoreChunksInfo.chunks_info File failed_chunks = StoreChunksInfo.failed_chunks diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 4bb6023f24..a623abcad8 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -381,7 +381,7 @@ task GatherVcfs { } output { File output_vcf = "~{output_vcf_basename}.vcf.gz" - # File output_vcf_index = "~{output_vcf_basename}.vcf.gz.tbi" + File? output_vcf_index = "~{output_vcf_basename}.vcf.gz.tbi" } } From 4c424280edbf17b59b9657b2d9d2f4b3dd3cc41f Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 4 Mar 2024 14:03:30 -0500 Subject: [PATCH 15/92] don't impute over padding --- .../broad/arrays/imputation_beagle/ImputationBeagle.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index fcf30e7437..0600b8af61 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -173,8 +173,8 @@ workflow ImputationBeagle { chrom = referencePanelContig.contig, basename = chunk_basename, genetic_map_file = referencePanelContig.genetic_map, - start = startWithOverlaps, - end = endWithOverlaps + start = start, # was startWithOverlaps, same with end + end = end } call tasks.UpdateHeader { @@ -302,7 +302,7 @@ workflow ImputationBeagle { call tasks.GatherVcfs { 
input: input_vcfs = unsorted_vcfs, - output_vcf_basename = output_callset_name + output_vcf_basename = output_callset_name + ".imputed" } # call tasks.MergeImputationQCMetrics { From f92f2e68f6e91e672d8e43a267bcafaf1e43cb7c Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 4 Mar 2024 14:21:48 -0500 Subject: [PATCH 16/92] make the index again --- tasks/broad/ImputationTasks.wdl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index a623abcad8..13fe497963 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -367,11 +367,8 @@ task GatherVcfs { --REORDER_INPUT_BY_FIRST_VARIANT \ -O ~{output_vcf_basename}.vcf.gz - # gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - # IndexFeatureFile -I ~{output_vcf_basename}.vcf.gz - - ls - + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + IndexFeatureFile -I ~{output_vcf_basename}.vcf.gz >>> runtime { docker: gatk_docker @@ -381,7 +378,7 @@ task GatherVcfs { } output { File output_vcf = "~{output_vcf_basename}.vcf.gz" - File? output_vcf_index = "~{output_vcf_basename}.vcf.gz.tbi" + File output_vcf_index = "~{output_vcf_basename}.vcf.gz.tbi" } } From 3d7162c5dce724ec97105fb3dd60ac299493903e Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 8 Mar 2024 10:28:01 -0500 Subject: [PATCH 17/92] supply vcf_index input to SelectVariantsByIds --- .../broad/arrays/imputation_beagle/ImputationBeagle.wdl | 5 +++-- tasks/broad/ImputationTasks.wdl | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 0600b8af61..1ed405b643 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -27,8 +27,8 @@ workflow ImputationBeagle { Float? 
optional_qc_hwe File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths Array[String] contigs - String reference_panel_path = "gs://morgan-imputation-development/1000G-ref-panel/hg19/" # path to the bucket where the reference panel files are stored for all contigs - String genetic_maps_path = "gs://morgan-imputation-development/plink-genetic-maps/GRCh37/" # path to the bucket where genetic maps are stored for all contigs + String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs + String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs String output_callset_name # the output callset name Boolean split_output_to_single_sample = false Int merge_ssvcf_mem_mb = 3000 # the memory allocation for MergeSingleSampleVcfs (in mb) @@ -220,6 +220,7 @@ workflow ImputationBeagle { call tasks.SelectVariantsByIds { input: vcf = SetIdsVcfToImpute.output_vcf, + vcf_index = SetIdsVcfToImpute.output_vcf_index, ids = FindSitesUniqueToFileTwoOnly.missing_sites, basename = "imputed_sites_to_recover" } diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 13fe497963..5686141dbc 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -834,6 +834,7 @@ task ExtractIDs { task SelectVariantsByIds { input { File vcf + File vcf_index File ids String basename @@ -847,6 +848,10 @@ task SelectVariantsByIds { description: "vcf", localization_optional: true } + vcf_index: { + description: "vcf", + localization_optional: true + } } Int command_mem = memory_mb - 1000 Int max_heap = memory_mb - 500 From 9f6624e7f5a9f249d7933939f44e54c9a8619d38 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 15 Mar 2024 11:26:20 -0400 Subject: [PATCH 18/92] update Imputation wdl too --- pipelines/broad/arrays/imputation/Imputation.wdl | 1 + 1 file changed, 1 insertion(+) diff --git 
a/pipelines/broad/arrays/imputation/Imputation.wdl b/pipelines/broad/arrays/imputation/Imputation.wdl index 4a44ba4ac5..058f0066de 100644 --- a/pipelines/broad/arrays/imputation/Imputation.wdl +++ b/pipelines/broad/arrays/imputation/Imputation.wdl @@ -242,6 +242,7 @@ workflow Imputation { call tasks.SelectVariantsByIds { input: vcf = SetIdsVcfToImpute.output_vcf, + vcf_index = SetIdsVcfToImpute.output_vcf_index, ids = FindSitesUniqueToFileTwoOnly.missing_sites, basename = "imputed_sites_to_recover" } From ea6ab6d9d132ca554fbfba087b10536d942c9a5d Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 15 Mar 2024 13:51:59 -0400 Subject: [PATCH 19/92] newlines --- pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl | 2 +- tasks/broad/ImputationTasks.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 1ed405b643..7efc227e1b 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -371,4 +371,4 @@ struct ReferencePanelContig { File bref3 String contig File genetic_map -} \ No newline at end of file +} diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 5686141dbc..c36c83e2d5 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -989,4 +989,4 @@ task SplitMultiSampleVcf { Array[File] single_sample_vcfs = glob("out_dir/*.vcf.gz") Array[File] single_sample_vcf_indices = glob("out_dir/*.vcf.gz.tbi") } -} \ No newline at end of file +} From 49c7987c589f1b12f62b481af023028cd192fb07 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 19 Mar 2024 15:09:21 -0400 Subject: [PATCH 20/92] update for hg38 --- .../broad/arrays/imputation_beagle/ImputationBeagle.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 7efc227e1b..6b27dbf4ff 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -77,9 +77,9 @@ workflow ImputationBeagle { Float chunkLengthFloat = chunkLength scatter (contig in contigs) { - - String reference_filename = reference_panel_path + "ALL.chr" + contig + ".phase3_integrated.20130502.genotypes.cleaned" - String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh37.map" + # these are specific to hg38 + String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged.chr" + contig + ".merged.AN_added.bcf.ac2." + String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh38.map" ReferencePanelContig referencePanelContig = { "vcf": reference_filename + vcf_suffix, From c3d1e8177637679be1aa3ffb04e8a2df68370720 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 19 Mar 2024 15:12:06 -0400 Subject: [PATCH 21/92] Revert "update for hg38" This reverts commit 3757137a02a7d92686a5664596261a0fb91e2624. --- .../broad/arrays/imputation_beagle/ImputationBeagle.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 6b27dbf4ff..7efc227e1b 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -77,9 +77,9 @@ workflow ImputationBeagle { Float chunkLengthFloat = chunkLength scatter (contig in contigs) { - # these are specific to hg38 - String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged.chr" + contig + ".merged.AN_added.bcf.ac2." 
- String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh38.map" + + String reference_filename = reference_panel_path + "ALL.chr" + contig + ".phase3_integrated.20130502.genotypes.cleaned" + String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh37.map" ReferencePanelContig referencePanelContig = { "vcf": reference_filename + vcf_suffix, From 13fe9c24d201d638bcab8574297f82ffa4c25447 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 19 Mar 2024 15:09:21 -0400 Subject: [PATCH 22/92] update for hg38 --- .../broad/arrays/imputation_beagle/ImputationBeagle.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 7efc227e1b..6b27dbf4ff 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -77,9 +77,9 @@ workflow ImputationBeagle { Float chunkLengthFloat = chunkLength scatter (contig in contigs) { - - String reference_filename = reference_panel_path + "ALL.chr" + contig + ".phase3_integrated.20130502.genotypes.cleaned" - String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh37.map" + # these are specific to hg38 + String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged.chr" + contig + ".merged.AN_added.bcf.ac2." 
+ String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh38.map" ReferencePanelContig referencePanelContig = { "vcf": reference_filename + vcf_suffix, From 5d239feca5ff19fe5fe3158be494b89ffca0fecf Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 20 Mar 2024 15:23:30 -0400 Subject: [PATCH 23/92] liftover wdl --- .../arrays/imputation_beagle/LiftoverVcfs.wdl | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl new file mode 100644 index 0000000000..de7fffcfb7 --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -0,0 +1,98 @@ +version 1.0 + +# Liftover VCFs from hg19 to hg38 +workflow LiftoverVcfs { + + String pipeline_version = "1.0.0" + + input { + File vcf_path + File vcf_index_path + + File liftover_chain + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" # docker: "us.gcr.io/broad-gatk/gatk:4.2.6.1" + Int min_disk_size = 100 + + File hg38_reference_fasta + File hg38_reference_fasta_index + File hg38_reference_dict + + Int preemptible_tries = 3 + } + + String vcf_basename = basename(vcf_path) + + # Lift over the array to hg38. 
+ call LiftOverArrays { + input: + input_vcf = vcf_path, + input_vcf_index = vcf_index_path, + liftover_chain = liftover_chain, + reference_fasta = hg38_reference_fasta, + reference_dict = hg38_reference_dict, + output_basename = vcf_basename, + docker = docker, + preemptible_tries = preemptible_tries, + min_disk_size = min_disk_size + } + + output { + File hg38_vcf = LiftOverArrays.lifted_over_vcf + File hg38_vcf_index = LiftOverArrays.lifted_over_vcf_index + } +} + +task LiftOverArrays { + input { + File input_vcf + File input_vcf_index + File liftover_chain + File reference_fasta + File reference_dict + String output_basename + String docker + Int preemptible_tries + Int min_disk_size + } + + Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 + Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size + + command <<< + set -euo pipefail + + # assuming mem unit is GB, take 2 fewer GB than the max available memory + java_max_mem_size=$(($(printf "%.0f\n" ${MEM_SIZE}) - 2)) + java_Xmx_str="-Xmx${java_max_mem_size}g" + echo "VM mem size is ${MEM_SIZE} ${MEM_UNIT}; will use java flag ${java_Xmx_str} for max memory" + + java -Xms4g ${java_Xmx_str} -jar /usr/picard/picard.jar LiftoverVcf \ + INPUT=~{input_vcf} \ + OUTPUT=~{output_basename}.liftedover.vcf \ + CHAIN=~{liftover_chain} \ + REJECT=~{output_basename}.rejected_variants.vcf \ + REFERENCE_SEQUENCE=~{reference_fasta} \ + MAX_RECORDS_IN_RAM=100000 + + # compress vcf - this creates a file with .gz suffix + bgzip ~{output_basename} + + # generate new index - this creates a file with .tbi suffix + tabix ~{output_basename}.gz + >>> + + runtime { + docker: docker + memory: "7 GiB" + cpu: "1" + disks: "local-disk ~{disk_size} HDD" + maxRetries: 3 + preemptible: preemptible_tries + } + + output { + File lifted_over_vcf = "~{output_basename}.liftedover.vcf" + File lifted_over_vcf_index = 
"~{output_basename}.liftedover.vcf.idx" + } +} From d13ab27c0b85157287e127a2c4191dc57726b599 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 20 Mar 2024 15:42:47 -0400 Subject: [PATCH 24/92] remove GCP-specific vm commands --- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index de7fffcfb7..d3bd298779 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -59,15 +59,11 @@ task LiftOverArrays { Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size + command <<< set -euo pipefail - # assuming mem unit is GB, take 2 fewer GB than the max available memory - java_max_mem_size=$(($(printf "%.0f\n" ${MEM_SIZE}) - 2)) - java_Xmx_str="-Xmx${java_max_mem_size}g" - echo "VM mem size is ${MEM_SIZE} ${MEM_UNIT}; will use java flag ${java_Xmx_str} for max memory" - - java -Xms4g ${java_Xmx_str} -jar /usr/picard/picard.jar LiftoverVcf \ + java -Xms4g -Xmx6500m -jar /usr/picard/picard.jar LiftoverVcf \ INPUT=~{input_vcf} \ OUTPUT=~{output_basename}.liftedover.vcf \ CHAIN=~{liftover_chain} \ From 55fe32028fe864c37e2f821cb9d82763c8ee4739 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 21 Mar 2024 11:34:56 -0400 Subject: [PATCH 25/92] use gatk --- .../arrays/imputation_beagle/LiftoverVcfs.wdl | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index d3bd298779..8db6a677fe 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ 
b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -11,13 +11,14 @@ workflow LiftoverVcfs { File liftover_chain - String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" # docker: "us.gcr.io/broad-gatk/gatk:4.2.6.1" + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.1" Int min_disk_size = 100 File hg38_reference_fasta File hg38_reference_fasta_index File hg38_reference_dict + Int max_retries = 3 Int preemptible_tries = 3 } @@ -33,6 +34,7 @@ workflow LiftoverVcfs { reference_dict = hg38_reference_dict, output_basename = vcf_basename, docker = docker, + max_retries = max_retries, preemptible_tries = preemptible_tries, min_disk_size = min_disk_size } @@ -52,6 +54,7 @@ task LiftOverArrays { File reference_dict String output_basename String docker + Int max_retries Int preemptible_tries Int min_disk_size } @@ -63,13 +66,14 @@ task LiftOverArrays { command <<< set -euo pipefail - java -Xms4g -Xmx6500m -jar /usr/picard/picard.jar LiftoverVcf \ - INPUT=~{input_vcf} \ - OUTPUT=~{output_basename}.liftedover.vcf \ - CHAIN=~{liftover_chain} \ - REJECT=~{output_basename}.rejected_variants.vcf \ - REFERENCE_SEQUENCE=~{reference_fasta} \ - MAX_RECORDS_IN_RAM=100000 + gatk --java-options "-Xms4g -Xmx6500m" \ + LiftoverVcf \ + --INPUT ~{input_vcf} \ + --OUTPUT ~{output_basename}.liftedover.vcf \ + --CHAIN ~{liftover_chain} \ + --REJECT ~{output_basename}.rejected_variants.vcf \ + --REFERENCE_SEQUENCE ~{reference_fasta} \ + --MAX_RECORDS_IN_RAM 100000 # compress vcf - this creates a file with .gz suffix bgzip ~{output_basename} @@ -83,7 +87,7 @@ task LiftOverArrays { memory: "7 GiB" cpu: "1" disks: "local-disk ~{disk_size} HDD" - maxRetries: 3 + maxRetries: max_retries preemptible: preemptible_tries } From 29d965c526c24b62b00a9f063e2ae44c909e44bf Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 21 Mar 2024 11:44:13 -0400 Subject: [PATCH 26/92] fix suffix and basename --- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 8db6a677fe..c9ea1e3ecc 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -22,7 +22,7 @@ workflow LiftoverVcfs { Int preemptible_tries = 3 } - String vcf_basename = basename(vcf_path) + String vcf_basename = basename(vcf_path, ".vcf.gz") # Lift over the array to hg38. call LiftOverArrays { @@ -93,6 +93,6 @@ task LiftOverArrays { output { File lifted_over_vcf = "~{output_basename}.liftedover.vcf" - File lifted_over_vcf_index = "~{output_basename}.liftedover.vcf.idx" + File lifted_over_vcf_index = "~{output_basename}.liftedover.vcf.tbi" } } From fa2ca590c35888efac819173ebfe00e77025c700 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 21 Mar 2024 12:20:18 -0400 Subject: [PATCH 27/92] fix more filenames --- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index c9ea1e3ecc..17d511a0dc 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -76,10 +76,10 @@ task LiftOverArrays { --MAX_RECORDS_IN_RAM 100000 # compress vcf - this creates a file with .gz suffix - bgzip ~{output_basename} + bgzip ~{output_basename}.liftedover.vcf # generate new index - this creates a file with .tbi suffix - tabix ~{output_basename}.gz + tabix ~{output_basename}.liftedover.vcf.gz >>> runtime { @@ -92,7 +92,7 @@ task LiftOverArrays { } output { - File lifted_over_vcf = "~{output_basename}.liftedover.vcf" - File lifted_over_vcf_index = "~{output_basename}.liftedover.vcf.tbi" + File lifted_over_vcf = "~{output_basename}.liftedover.vcf.gz" + File 
lifted_over_vcf_index = "~{output_basename}.liftedover.vcf.gz.tbi" } } From ec1602be9f5db2f934436c84a1a8cc865ad9ec59 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 21 Mar 2024 19:07:10 -0400 Subject: [PATCH 28/92] remove missing contig stuff for now --- .../imputation_beagle/ImputationBeagle.wdl | 104 +++++++++--------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 6b27dbf4ff..afa17e4e85 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -243,62 +243,62 @@ workflow ImputationBeagle { Array[String] phased_vcfs = flatten(chromosome_vcfs) - call tasks.GetMissingContigList { - input: - ref_dict = ref_dict, - included_contigs = write_lines(contigs) - } - - scatter (missing_contig in GetMissingContigList.missing_contigs) { - call tasks.CalculateChromosomeLength as CalculateMissingChromosomeLength { - input: - ref_dict = ref_dict, - chrom = missing_contig - } - - Int num_chunks_missing_contig = ceil(CalculateMissingChromosomeLength.chrom_length / chunkLengthFloat) - - scatter (i_missing_contig in range(num_chunks_missing_contig)) { - Int start_missing_contig = (i_missing_contig * chunkLength) + 1 - Int end_missing_contig = if (CalculateMissingChromosomeLength.chrom_length < ((i_missing_contig + 1) * chunkLength)) then CalculateMissingChromosomeLength.chrom_length else ((i_missing_contig + 1) * chunkLength) - - call tasks.SubsetVcfToRegion as SubsetVcfToRegionMissingContig{ - input: - vcf = vcf_to_impute, - vcf_index = vcf_index_to_impute, - output_basename = "input_samples_subset_to_chunk", - contig = missing_contig, - start = start_missing_contig, - end = end_missing_contig, - exclude_filtered = true - } +# call tasks.GetMissingContigList { +# input: +# ref_dict = ref_dict, +# included_contigs = write_lines(contigs) +# } - 
call tasks.SetIDs as SetIDsMissingContigs { - input: - vcf = SubsetVcfToRegionMissingContig.output_vcf, - output_basename = "unimputed_contigs_" + missing_contig +"_"+ i_missing_contig + "_with_ids" - } +# scatter (missing_contig in GetMissingContigList.missing_contigs) { +# call tasks.CalculateChromosomeLength as CalculateMissingChromosomeLength { +# input: +# ref_dict = ref_dict, +# chrom = missing_contig +# } - call tasks.RemoveAnnotations as RemoveAnnotationsMissingContigs { - input: - vcf = SetIDsMissingContigs.output_vcf, - basename = "unimputed_contigs_" + missing_contig +"_"+ i_missing_contig + "_annotations_removed" - } - } - } +# Int num_chunks_missing_contig = ceil(CalculateMissingChromosomeLength.chrom_length / chunkLengthFloat) + +# scatter (i_missing_contig in range(num_chunks_missing_contig)) { +# Int start_missing_contig = (i_missing_contig * chunkLength) + 1 +# Int end_missing_contig = if (CalculateMissingChromosomeLength.chrom_length < ((i_missing_contig + 1) * chunkLength)) then CalculateMissingChromosomeLength.chrom_length else ((i_missing_contig + 1) * chunkLength) + +# call tasks.SubsetVcfToRegion as SubsetVcfToRegionMissingContig{ +# input: +# vcf = vcf_to_impute, +# vcf_index = vcf_index_to_impute, +# output_basename = "input_samples_subset_to_chunk", +# contig = missing_contig, +# start = start_missing_contig, +# end = end_missing_contig, +# exclude_filtered = true +# } + +# call tasks.SetIDs as SetIDsMissingContigs { +# input: +# vcf = SubsetVcfToRegionMissingContig.output_vcf, +# output_basename = "unimputed_contigs_" + missing_contig +"_"+ i_missing_contig + "_with_ids" +# } + +# call tasks.RemoveAnnotations as RemoveAnnotationsMissingContigs { +# input: +# vcf = SetIDsMissingContigs.output_vcf, +# basename = "unimputed_contigs_" + missing_contig +"_"+ i_missing_contig + "_annotations_removed" +# } +# } +# } - Array[String] missing_remove_annotation_vcfs = flatten(RemoveAnnotationsMissingContigs.output_vcf) +# Array[String] 
missing_remove_annotation_vcfs = flatten(RemoveAnnotationsMissingContigs.output_vcf) - scatter(missing_remove_annotation_vcf in missing_remove_annotation_vcfs){ - call tasks.ReplaceHeader { - input: - vcf_to_replace_header = missing_remove_annotation_vcf, - vcf_with_new_header = phased_vcfs[0] - } - } +# scatter(missing_remove_annotation_vcf in missing_remove_annotation_vcfs){ +# call tasks.ReplaceHeader { +# input: +# vcf_to_replace_header = missing_remove_annotation_vcf, +# vcf_with_new_header = phased_vcfs[0] +# } +# } - Array[String] missing_contig_vcfs = ReplaceHeader.output_vcf - Array[String] unsorted_vcfs = flatten([phased_vcfs, missing_contig_vcfs]) +# Array[String] missing_contig_vcfs = ReplaceHeader.output_vcf + Array[String] unsorted_vcfs = phased_vcfs # flatten([phased_vcfs, missing_contig_vcfs]) call tasks.GatherVcfs { input: @@ -352,7 +352,7 @@ workflow ImputationBeagle { Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices File imputed_multisample_vcf = GatherVcfs.output_vcf - File imputed_multisample_vcf_index = select_first([GatherVcfs.output_vcf_index]) + File imputed_multisample_vcf_index = GatherVcfs.output_vcf_index # File aggregated_imputation_metrics = MergeImputationQCMetrics.aggregated_metrics File chunks_info = StoreChunksInfo.chunks_info File failed_chunks = StoreChunksInfo.failed_chunks From 6ba6d032e5e083ab3dc90e3ee9ce8c3013b0e09e Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 21 Mar 2024 20:02:34 -0400 Subject: [PATCH 29/92] fix ref panel path --- pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index afa17e4e85..fe06487a0b 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ 
b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -78,7 +78,7 @@ workflow ImputationBeagle { scatter (contig in contigs) { # these are specific to hg38 - String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged.chr" + contig + ".merged.AN_added.bcf.ac2." + String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged." + contig + ".merged.AN_added.bcf.ac2" String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh38.map" ReferencePanelContig referencePanelContig = { From c4575a7b65a1eed4d360ae9f4d018fc9b88b388f Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 21 Mar 2024 20:04:55 -0400 Subject: [PATCH 30/92] another chr fix --- .../broad/arrays/imputation_beagle/ImputationBeagle.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index fe06487a0b..2c955aa996 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -77,9 +77,9 @@ workflow ImputationBeagle { Float chunkLengthFloat = chunkLength scatter (contig in contigs) { - # these are specific to hg38 + # these are specific to hg38 - contig is format 'chr1' String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged." + contig + ".merged.AN_added.bcf.ac2" - String genetic_map_filename = genetic_maps_path + "plink.chr" + contig + ".GRCh38.map" + String genetic_map_filename = genetic_maps_path + "plink." 
+ contig + ".GRCh38.map" ReferencePanelContig referencePanelContig = { "vcf": reference_filename + vcf_suffix, @@ -103,7 +103,7 @@ workflow ImputationBeagle { Int startWithOverlaps = if (start - chunkOverlaps < 1) then 1 else start - chunkOverlaps Int end = if (CalculateChromosomeLength.chrom_length < ((i + 1) * chunkLength)) then CalculateChromosomeLength.chrom_length else ((i + 1) * chunkLength) Int endWithOverlaps = if (CalculateChromosomeLength.chrom_length < end + chunkOverlaps) then CalculateChromosomeLength.chrom_length else end + chunkOverlaps - String chunk_basename = "chrom_" + referencePanelContig.contig + "_chunk_" + i + String chunk_basename = referencePanelContig.contig + "_chunk_" + i call tasks.GenerateChunk { input: From f206448a13d073135037e40942480a7f010f1fb2 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 27 Mar 2024 14:58:36 -0400 Subject: [PATCH 31/92] warn on missing contig --- .../broad/arrays/imputation_beagle/ImputationBeagle.wdl | 4 ---- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 3 ++- tasks/broad/ImputationTasks.wdl | 7 +------ 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 2c955aa996..ec634271a5 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -135,10 +135,6 @@ workflow ImputationBeagle { } call tasks.CheckChunksBeagle { input: - vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), - vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]), - panel_vcf = referencePanelContig.vcf, - panel_vcf_index = referencePanelContig.vcf_index, var_in_original = CountVariantsInChunks.var_in_original, var_in_reference = CountVariantsInChunks.var_in_reference } diff --git
a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 17d511a0dc..6e79255b42 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -73,7 +73,8 @@ task LiftOverArrays { --CHAIN ~{liftover_chain} \ --REJECT ~{output_basename}.rejected_variants.vcf \ --REFERENCE_SEQUENCE ~{reference_fasta} \ - --MAX_RECORDS_IN_RAM 100000 + --MAX_RECORDS_IN_RAM 100000 \ + --WARN_ON_MISSING_CONTIG true # compress vcf - this creates a file with .gz suffix bgzip ~{output_basename}.liftedover.vcf diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index c36c83e2d5..1147d8e518 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -263,14 +263,9 @@ task Minimac4 { task CheckChunksBeagle { input { - File vcf - File vcf_index - File panel_vcf - File panel_vcf_index Int var_in_original Int var_in_reference - Int disk_size_gb = ceil(2*size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB")) String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 4000 @@ -289,7 +284,7 @@ task CheckChunksBeagle { } runtime { docker: bcftools_docker - disks: "local-disk ${disk_size_gb} HDD" + disks: "local-disk 10 HDD" memory: "${memory_mb} MiB" cpu: cpu } From 517719de15002372324f7ca94b73b9f6919116d2 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 27 Mar 2024 15:32:19 -0400 Subject: [PATCH 32/92] do fail if missing contig --- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 6e79255b42..17d511a0dc 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ 
b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -73,8 +73,7 @@ task LiftOverArrays { --CHAIN ~{liftover_chain} \ --REJECT ~{output_basename}.rejected_variants.vcf \ --REFERENCE_SEQUENCE ~{reference_fasta} \ - --MAX_RECORDS_IN_RAM 100000 \ - --WARN_ON_MISSING_CONTIG true + --MAX_RECORDS_IN_RAM 100000 # compress vcf - this creates a file with .gz suffix bgzip ~{output_basename}.liftedover.vcf From 0f5208398fa5bd7159b0725d6b5e4fba1e333495 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 27 Mar 2024 17:29:36 -0400 Subject: [PATCH 33/92] more mem --- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 17d511a0dc..5db6a2481c 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -66,7 +66,7 @@ task LiftOverArrays { command <<< set -euo pipefail - gatk --java-options "-Xms4g -Xmx6500m" \ + gatk --java-options "-Xms4g -Xmx15g" \ LiftoverVcf \ --INPUT ~{input_vcf} \ --OUTPUT ~{output_basename}.liftedover.vcf \ @@ -84,7 +84,7 @@ task LiftOverArrays { runtime { docker: docker - memory: "7 GiB" + memory: "16 GiB" cpu: "1" disks: "local-disk ~{disk_size} HDD" maxRetries: max_retries From 2c6643ce40dc57bc0f2792da5d7aaaa7f6349fa6 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 27 Mar 2024 19:01:17 -0400 Subject: [PATCH 34/92] troubleshooting wld --- .../test_empty_write_lines_input.wdl | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl b/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl new file mode 100644 index 0000000000..0315c977ec --- /dev/null +++ 
b/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl @@ -0,0 +1,44 @@ +version 1.0 + +# testing ToA and write_lines([]) as task input +workflow test_empty_write_lines_input { + + input { + File write_lines_at_wdl_input = write_lines([]) + } + + # use file defined at wdl input + call LocalizeFile as LocalizeEmptyFileFromWdlInput { + input: + input_file = write_lines_at_wdl_input + } + + # use file defined at task input + call LocalizeFile as LocalizeEmptyFileFromTaskInput { + input: + input_file = write_lines([]) + } +} + +task LocalizeFile { + input { + File input_file + } + + command <<< + set -euo pipefail + + cat ~{input_file} | wc -l > num_lines.txt + >>> + + runtime { + docker: "ubuntu:20.04" + memory: "2 GiB" + cpu: "1" + disks: "local-disk 8 HDD" + } + + output { + Int num_lines = read_int("num_lines.txt") + } +} From 554ad061f918222fbecd9cc392fb9d5089a4cd82 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 27 Mar 2024 21:00:44 -0400 Subject: [PATCH 35/92] fixed plink path --- pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index ec634271a5..93533e7131 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -79,7 +79,7 @@ workflow ImputationBeagle { scatter (contig in contigs) { # these are specific to hg38 - contig is format 'chr1' String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged." + contig + ".merged.AN_added.bcf.ac2" - String genetic_map_filename = genetic_maps_path + "plink." + contig + ".GRCh38.map" + String genetic_map_filename = genetic_maps_path + "plink." 
+ contig + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { "vcf": reference_filename + vcf_suffix, @@ -133,6 +133,7 @@ workflow ImputationBeagle { panel_vcf = referencePanelContig.vcf, panel_vcf_index = referencePanelContig.vcf_index } + call tasks.CheckChunksBeagle { input: var_in_original = CountVariantsInChunks.var_in_original, From d7d07e9282eeb505c1254f0fbf4f5a2110b3e02f Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 28 Mar 2024 10:14:50 -0400 Subject: [PATCH 36/92] add select_first test --- .../imputation_beagle/test_empty_write_lines_input.wdl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl b/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl index 0315c977ec..fc8d1acb0a 100644 --- a/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl +++ b/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl @@ -5,6 +5,7 @@ workflow test_empty_write_lines_input { input { File write_lines_at_wdl_input = write_lines([]) + File? 
undefined_file } # use file defined at wdl input @@ -18,6 +19,12 @@ workflow test_empty_write_lines_input { input: input_file = write_lines([]) } + + # use file with select_first + call LocalizeFile as LocalizeEmptyFileWithSelectFirst { + input: + input_file = select_first([undefined_file, write_lines([])]) + } } task LocalizeFile { From ef0c5a0a6eefaee525d5341127b77724f7a59eec Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 28 Mar 2024 11:16:00 -0400 Subject: [PATCH 37/92] cleanup --- .../imputation_beagle/ImputationBeagle.wdl | 131 ++---------------- 1 file changed, 12 insertions(+), 119 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 93533e7131..6261e6cbf8 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -11,15 +11,8 @@ workflow ImputationBeagle { Int chunkLength = 25000000 Int chunkOverlaps = 5000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects - # You can either input a multisample VCF or an array of single sample VCFs - # The pipeline will just merge the single sample VCFs into one multisample VCF - # and then impute the multisample VCF - # If you want to run a single sample VCF, set the multi_sample_vcf input to the - # single sample VCF - File? multi_sample_vcf - File? multi_sample_vcf_index - Array[File]? single_sample_vcfs - Array[File]? 
single_sample_vcf_indices + File multi_sample_vcf + File multi_sample_vcf_index Boolean perform_extra_qc_steps = false # these are optional additional extra QC steps from Amit's group that should only be # run for large sample sets, especially a diverse set of samples (it's further limiting called at sites to 95% and by HWE) @@ -42,36 +35,9 @@ workflow ImputationBeagle { String bref3_suffix = ".bref3" } - if (defined(single_sample_vcfs) && defined(multi_sample_vcf)) { - call utils.ErrorWithMessage as ErrorMessageDoubleInput{ - input: - message = "single_sample_vcfs and multi_sample_vcf cannot both be defined as input" - } - } - - if (!defined(single_sample_vcfs) && !defined(multi_sample_vcf)) { - call utils.ErrorWithMessage as ErrorMessageNoInput { - input: - message = "One (and only one) of single_sample_vcfs and multi_sample_vcf must be defined as input" - } - } - - if (defined(single_sample_vcfs)) { - call tasks.MergeSingleSampleVcfs { - input: - input_vcfs = select_first([single_sample_vcfs]), - input_vcf_indices = select_first([single_sample_vcf_indices]), - output_vcf_basename = "merged_input_samples", - memory_mb = merge_ssvcf_mem_mb - } - } - - File vcf_to_impute = select_first([multi_sample_vcf, MergeSingleSampleVcfs.output_vcf]) - File vcf_index_to_impute = select_first([multi_sample_vcf_index, MergeSingleSampleVcfs.output_vcf_index]) - call tasks.CountSamples { input: - vcf = vcf_to_impute, + vcf = multi_sample_vcf, } Float chunkLengthFloat = chunkLength @@ -107,8 +73,8 @@ workflow ImputationBeagle { call tasks.GenerateChunk { input: - vcf = vcf_to_impute, - vcf_index = vcf_index_to_impute, + vcf = multi_sample_vcf, + vcf_index = multi_sample_vcf_index, start = startWithOverlaps, end = endWithOverlaps, chrom = referencePanelContig.contig, @@ -142,8 +108,8 @@ workflow ImputationBeagle { call tasks.SubsetVcfToRegion { input: - vcf = vcf_to_impute, - vcf_index = vcf_index_to_impute, + vcf = multi_sample_vcf, + vcf_index = multi_sample_vcf_index, 
output_basename = "input_samples_subset_to_chunk", contig = referencePanelContig.contig, start = start, @@ -170,7 +136,7 @@ workflow ImputationBeagle { chrom = referencePanelContig.contig, basename = chunk_basename, genetic_map_file = referencePanelContig.genetic_map, - start = start, # was startWithOverlaps, same with end + start = start, end = end } @@ -234,88 +200,16 @@ workflow ImputationBeagle { basename = output_callset_name } } - # Array[File] aggregatedImputationMetrics = select_all(AggregateImputationQCMetrics.aggregated_metrics) + Array[File] chromosome_vcfs = select_all(InterleaveVariants.output_vcf) } - Array[String] phased_vcfs = flatten(chromosome_vcfs) - -# call tasks.GetMissingContigList { -# input: -# ref_dict = ref_dict, -# included_contigs = write_lines(contigs) -# } - -# scatter (missing_contig in GetMissingContigList.missing_contigs) { -# call tasks.CalculateChromosomeLength as CalculateMissingChromosomeLength { -# input: -# ref_dict = ref_dict, -# chrom = missing_contig -# } - -# Int num_chunks_missing_contig = ceil(CalculateMissingChromosomeLength.chrom_length / chunkLengthFloat) - -# scatter (i_missing_contig in range(num_chunks_missing_contig)) { -# Int start_missing_contig = (i_missing_contig * chunkLength) + 1 -# Int end_missing_contig = if (CalculateMissingChromosomeLength.chrom_length < ((i_missing_contig + 1) * chunkLength)) then CalculateMissingChromosomeLength.chrom_length else ((i_missing_contig + 1) * chunkLength) - -# call tasks.SubsetVcfToRegion as SubsetVcfToRegionMissingContig{ -# input: -# vcf = vcf_to_impute, -# vcf_index = vcf_index_to_impute, -# output_basename = "input_samples_subset_to_chunk", -# contig = missing_contig, -# start = start_missing_contig, -# end = end_missing_contig, -# exclude_filtered = true -# } - -# call tasks.SetIDs as SetIDsMissingContigs { -# input: -# vcf = SubsetVcfToRegionMissingContig.output_vcf, -# output_basename = "unimputed_contigs_" + missing_contig +"_"+ i_missing_contig + "_with_ids" -# } 
- -# call tasks.RemoveAnnotations as RemoveAnnotationsMissingContigs { -# input: -# vcf = SetIDsMissingContigs.output_vcf, -# basename = "unimputed_contigs_" + missing_contig +"_"+ i_missing_contig + "_annotations_removed" -# } -# } -# } - -# Array[String] missing_remove_annotation_vcfs = flatten(RemoveAnnotationsMissingContigs.output_vcf) - -# scatter(missing_remove_annotation_vcf in missing_remove_annotation_vcfs){ -# call tasks.ReplaceHeader { -# input: -# vcf_to_replace_header = missing_remove_annotation_vcf, -# vcf_with_new_header = phased_vcfs[0] -# } -# } - -# Array[String] missing_contig_vcfs = ReplaceHeader.output_vcf - Array[String] unsorted_vcfs = phased_vcfs # flatten([phased_vcfs, missing_contig_vcfs]) - call tasks.GatherVcfs { input: - input_vcfs = unsorted_vcfs, + input_vcfs = flatten(chromosome_vcfs), output_vcf_basename = output_callset_name + ".imputed" } -# call tasks.MergeImputationQCMetrics { -# input: -# metrics = flatten(aggregatedImputationMetrics), -# basename = output_callset_name -# } - -# if (MergeImputationQCMetrics.frac_above_maf_5_percent_well_imputed < frac_above_maf_5_percent_well_imputed_threshold) { -# call utils.ErrorWithMessage as FailQCWellImputedFrac { -# input: -# message = "Well imputed fraction was " + MergeImputationQCMetrics.frac_above_maf_5_percent_well_imputed + ", QC failure threshold was set at " + frac_above_maf_5_percent_well_imputed_threshold -# } -# } - call tasks.StoreChunksInfo { input: chroms = flatten(chunk_contig), @@ -348,9 +242,8 @@ workflow ImputationBeagle { output { Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs Array[File]? 
imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices - File imputed_multisample_vcf = GatherVcfs.output_vcf - File imputed_multisample_vcf_index = GatherVcfs.output_vcf_index - # File aggregated_imputation_metrics = MergeImputationQCMetrics.aggregated_metrics + File imputed_multi_sample_vcf = GatherVcfs.output_vcf + File imputed_multi_sample_vcf_index = GatherVcfs.output_vcf_index File chunks_info = StoreChunksInfo.chunks_info File failed_chunks = StoreChunksInfo.failed_chunks File n_failed_chunks = StoreChunksInfo.n_failed_chunks From 60fedf562a9c9f89a57a57bb260f6f3a7d4f2307 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 28 Mar 2024 13:08:05 -0400 Subject: [PATCH 38/92] add if block to test --- .../imputation_beagle/test_empty_write_lines_input.wdl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl b/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl index fc8d1acb0a..24b6d0a31d 100644 --- a/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl +++ b/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl @@ -25,6 +25,16 @@ workflow test_empty_write_lines_input { input: input_file = select_first([undefined_file, write_lines([])]) } + + # use file generated in an if block + Boolean run_block = false + if (run_block) { + File file_from_block = write_lines(["foo"]) + } + call LocalizeFile as LocalizeEmptyFileWithIfBlock { + input: + input_file = select_first([file_from_block, write_lines(["foo"])]) + } } task LocalizeFile { From 201d0b2eec0bd5cb66879302bcdf79a599fc64e1 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 2 Apr 2024 15:16:50 -0400 Subject: [PATCH 39/92] create and use ref panel interval list --- .../imputation_beagle/ImputationBeagle.wdl | 19 ++++-- tasks/broad/ImputationTasks.wdl | 67 +++++++++++++++++++ 2 files changed, 79 insertions(+), 7 deletions(-) diff --git 
a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 6261e6cbf8..ce55821205 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -61,6 +61,12 @@ workflow ImputationBeagle { chrom = referencePanelContig.contig } + call tasks.CreateRefPanelIntervalLists { + input: + ref_panel_vcf = referencePanelContig.vcf, + ref_panel_vcf_index = referencePanelContig.vcf_index + } + Int num_chunks = ceil(CalculateChromosomeLength.chrom_length / chunkLengthFloat) scatter (i in range(num_chunks)) { @@ -92,18 +98,17 @@ workflow ImputationBeagle { } } - call tasks.CountVariantsInChunks { + call tasks.CountVariantsInChunksBeagle { input: vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]), - panel_vcf = referencePanelContig.vcf, - panel_vcf_index = referencePanelContig.vcf_index + panel_interval_list = CreateRefPanelIntervalLists.interval_list } call tasks.CheckChunksBeagle { input: - var_in_original = CountVariantsInChunks.var_in_original, - var_in_reference = CountVariantsInChunks.var_in_reference + var_in_original = CountVariantsInChunksBeagle.var_in_original, + var_in_reference = CountVariantsInChunksBeagle.var_in_reference } call tasks.SubsetVcfToRegion { @@ -215,8 +220,8 @@ workflow ImputationBeagle { chroms = flatten(chunk_contig), starts = flatten(start), ends = flatten(end), - vars_in_array = flatten(CountVariantsInChunks.var_in_original), - vars_in_panel = flatten(CountVariantsInChunks.var_in_reference), + vars_in_array = flatten(CountVariantsInChunksBeagle.var_in_original), + vars_in_panel = flatten(CountVariantsInChunksBeagle.var_in_reference), valids = flatten(CheckChunksBeagle.valid), basename = output_callset_name } diff --git a/tasks/broad/ImputationTasks.wdl 
b/tasks/broad/ImputationTasks.wdl index 1147d8e518..27b48b9e6a 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -53,6 +53,41 @@ task GetMissingContigList { } } +task CreateRefPanelIntervalLists { + input { + File ref_panel_vcf + File ref_panel_vcf_index + + Int disk_size_gb = ceil(2*size(ref_panel_vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here + Int cpu = 1 + Int memory_mb = 8000 + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + + Int command_mem = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + String basename = basename(ref_panel_vcf, '.vcf.gz') + + command { + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + VcfToIntervalList \ + -I ~{ref_panel_vcf} \ + -O ~{basename}.interval_list + } + + output { + File interval_list = "~{basename}.interval_list" + } + + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } +} + task GenerateChunk { input { Int start @@ -261,6 +296,38 @@ task Minimac4 { } } +task CountVariantsInChunksBeagle { + input { + File vcf + File vcf_index + File panel_interval_list + + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int cpu = 1 + Int memory_mb = 4000 + Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_interval_list], "GiB")) + 20 + } + Int command_mem = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + set -e -o pipefail + + echo $(gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | sed 's/Tool returned://') > var_in_original + echo $(gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} -L ~{panel_interval_list} | sed 's/Tool returned://') > var_in_reference + >>> + output { + Int var_in_original = read_int("var_in_original") + Int var_in_reference = read_int("var_in_reference") + } + runtime { + docker: gatk_docker + disks: "local-disk 
${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } +} + task CheckChunksBeagle { input { Int var_in_original From 895a6c65bd651618995179175697d572f75314e5 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 3 Apr 2024 10:07:39 -0400 Subject: [PATCH 40/92] move interval list creation to ref panel wdl --- .../CreateImputationRefPanelBeagle.wdl | 122 ++++++++++++------ .../imputation_beagle/ImputationBeagle.wdl | 21 +-- tasks/broad/ImputationTasks.wdl | 2 +- 3 files changed, 92 insertions(+), 53 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl index c1a14190c4..a446154975 100644 --- a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl @@ -2,44 +2,94 @@ version 1.0 # This script is under review. It is not actively tested or maintained at this time. workflow CreateImputationRefPanelBeagle { - input { - Array[File] ref_vcf - Int disk_size - } - - scatter (idx in range(length(ref_vcf))) { - call BuildBref3 { - input: - vcf = ref_vcf[idx], - disk_size = disk_size + input { + Array[File] ref_vcf + Array[File] ref_vcf_index + Int disk_size + + Boolean make_brefs = true + Boolean make_interval_lists = true + } + + scatter (idx in range(length(ref_vcf))) { + if (make_brefs) { + call BuildBref3 { + input: + vcf = ref_vcf[idx], + disk_size = disk_size } - } + } + + if (make_interval_lists) { + call CreateRefPanelIntervalLists { + input: + ref_panel_vcf = ref_vcf[idx], + ref_panel_vcf_index = ref_vcf_index[idx] + } + } + } - output { - Array[File] out_bref3 = BuildBref3.out_bref3 - } + output { + Array[File?] bref3s = BuildBref3.out_bref3 + Array[File?] 
interval_lists = CreateRefPanelIntervalLists.interval_list + } } task BuildBref3 { - input { - File vcf - Int disk_size - } - - String name = basename(vcf, ".vcf.gz") - - command <<< - java -jar /usr/gitc/bref3.22Jul22.46e.jar ~{vcf} > ~{name}.bref3 - >>> - - runtime { - docker: "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-22Jul22.46e-wip-temp-20240227" - memory: "256 GB" - cpu: 4 - disks: "local-disk " + disk_size + " HDD" - } - - output { - File out_bref3 = "~{name}.bref3" - } -} \ No newline at end of file + input { + File vcf + Int disk_size + } + + String name = basename(vcf, ".vcf.gz") + + command <<< + java -jar /usr/gitc/bref3.22Jul22.46e.jar ~{vcf} > ~{name}.bref3 + >>> + + runtime { + docker: "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-22Jul22.46e-wip-temp-20240227" + memory: "256 GB" + cpu: 4 + disks: "local-disk " + disk_size + " HDD" + } + + output { + File out_bref3 = "~{name}.bref3" + } +} + +task CreateRefPanelIntervalLists { + input { + File ref_panel_vcf + File ref_panel_vcf_index + + Int disk_size_gb = ceil(2*size(ref_panel_vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here + Int cpu = 1 + Int memory_mb = 8000 + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + + Int command_mem = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + String basename = basename(ref_panel_vcf, '.vcf.gz') + + command { + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + VcfToIntervalList \ + -I ~{ref_panel_vcf} \ + -O ~{basename}.interval_list + } + + output { + File interval_list = "~{basename}.interval_list" + } + + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } +} diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl 
b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index ce55821205..356775f2ce 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -24,14 +24,11 @@ workflow ImputationBeagle { String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs String output_callset_name # the output callset name Boolean split_output_to_single_sample = false - Int merge_ssvcf_mem_mb = 3000 # the memory allocation for MergeSingleSampleVcfs (in mb) - - Float frac_above_maf_5_percent_well_imputed_threshold = 0.9 # require fraction of maf > 0.05 sites well imputed to be greater than this to pass + Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass # file extensions used to find reference panel files - String vcf_suffix = ".vcf.gz" - String vcf_index_suffix = ".vcf.gz.tbi" + String interval_list_suffix = ".interval_list" String bref3_suffix = ".bref3" } @@ -48,8 +45,7 @@ workflow ImputationBeagle { String genetic_map_filename = genetic_maps_path + "plink." 
+ contig + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { - "vcf": reference_filename + vcf_suffix, - "vcf_index": reference_filename + vcf_index_suffix, + "interval_list": reference_filename + interval_list_suffix, "bref3": reference_filename + bref3_suffix, "contig": contig, "genetic_map": genetic_map_filename @@ -61,12 +57,6 @@ workflow ImputationBeagle { chrom = referencePanelContig.contig } - call tasks.CreateRefPanelIntervalLists { - input: - ref_panel_vcf = referencePanelContig.vcf, - ref_panel_vcf_index = referencePanelContig.vcf_index - } - Int num_chunks = ceil(CalculateChromosomeLength.chrom_length / chunkLengthFloat) scatter (i in range(num_chunks)) { @@ -102,7 +92,7 @@ workflow ImputationBeagle { input: vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]), - panel_interval_list = CreateRefPanelIntervalLists.interval_list + panel_interval_list = referencePanelContig.interval_list } call tasks.CheckChunksBeagle { @@ -261,8 +251,7 @@ workflow ImputationBeagle { } struct ReferencePanelContig { - File vcf - File vcf_index + File interval_list File bref3 String contig File genetic_map diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 27b48b9e6a..00ac9910bd 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -589,7 +589,7 @@ task OptionalQCSites { set -e -o pipefail # site missing rate < 5% ; hwe p > 1e-6 - vcftools --gzvcf ~{input_vcf} --max-missing ~{max_missing} --hwe ~{hwe} --recode -c | bgzip -c > ~{output_vcf_basename}.vcf.gz + vcftools --gzvcf ~{input_vcf} --max-missing ~{max_missing} --hwe ~{hwe} --recode -c | bgzip -c > ~{output_vcf_basename}.vcf.gz bcftools index -t ~{output_vcf_basename}.vcf.gz # Note: this is necessary because vcftools doesn't have a way to output a zipped vcf, nor a way to index one (hence needing to use bcf).
>>> runtime { From ccb2f3cbe39504fb4434409523b4a027826aec14 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 3 Apr 2024 13:44:08 -0400 Subject: [PATCH 41/92] give default values for optional inputs, weird --- pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 356775f2ce..f190581ccf 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -16,8 +16,8 @@ workflow ImputationBeagle { Boolean perform_extra_qc_steps = false # these are optional additional extra QC steps from Amit's group that should only be # run for large sample sets, especially a diverse set of samples (it's further limiting called at sites to 95% and by HWE) - Float? optional_qc_max_missing - Float? optional_qc_hwe + Float optional_qc_max_missing = 0.05 + Float optional_qc_hwe = 0.000001 File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths Array[String] contigs String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs From b3b229ad32f2b67c337f27637b13d450625a85e7 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 3 Apr 2024 15:51:05 -0400 Subject: [PATCH 42/92] change CountVariants calls --- .../arrays/imputation_beagle/ImputationBeagle.wdl | 5 ++--- tasks/broad/ImputationTasks.wdl | 12 ++++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index f190581ccf..a9a01e0606 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -98,7 +98,7 @@ workflow 
ImputationBeagle { call tasks.CheckChunksBeagle { input: var_in_original = CountVariantsInChunksBeagle.var_in_original, - var_in_reference = CountVariantsInChunksBeagle.var_in_reference + var_also_in_reference = CountVariantsInChunksBeagle.var_also_in_reference } call tasks.SubsetVcfToRegion { @@ -211,7 +211,7 @@ workflow ImputationBeagle { starts = flatten(start), ends = flatten(end), vars_in_array = flatten(CountVariantsInChunksBeagle.var_in_original), - vars_in_panel = flatten(CountVariantsInChunksBeagle.var_in_reference), + vars_in_panel = flatten(CountVariantsInChunksBeagle.var_also_in_reference), valids = flatten(CheckChunksBeagle.valid), basename = output_callset_name } @@ -233,7 +233,6 @@ workflow ImputationBeagle { } } - output { Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 00ac9910bd..b7d671f0b6 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -304,7 +304,7 @@ task CountVariantsInChunksBeagle { String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 - Int memory_mb = 4000 + Int memory_mb = 8000 Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_interval_list], "GiB")) + 20 } Int command_mem = memory_mb - 1000 @@ -313,12 +313,12 @@ task CountVariantsInChunksBeagle { command <<< set -e -o pipefail - echo $(gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | sed 's/Tool returned://') > var_in_original - echo $(gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} -L ~{panel_interval_list} | sed 's/Tool returned://') > var_in_reference + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 | tail -n 1 > var_in_original + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V 
~{vcf} -L ~{panel_interval_list} 2>&1 | tail -n 1 > var_also_in_reference >>> output { Int var_in_original = read_int("var_in_original") - Int var_in_reference = read_int("var_in_reference") + Int var_also_in_reference = read_int("var_also_in_reference") } runtime { docker: gatk_docker @@ -331,7 +331,7 @@ task CountVariantsInChunksBeagle { task CheckChunksBeagle { input { Int var_in_original - Int var_in_reference + Int var_also_in_reference String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 @@ -340,7 +340,7 @@ task CheckChunksBeagle { command <<< set -e -o pipefail - if [ $(( ~{var_in_reference} * 2 - ~{var_in_original})) -gt 0 ] && [ ~{var_in_reference} -gt 3 ]; then + if [ $(( ~{var_also_in_reference} * 2 - ~{var_in_original})) -gt 0 ] && [ ~{var_also_in_reference} -gt 3 ]; then echo true > valid_file.txt else echo false > valid_file.txt From e012dda8e73669961faff96685e67d37860388bc Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 3 Apr 2024 20:51:14 -0400 Subject: [PATCH 43/92] test --- .../imputation_beagle/test_CountVariants.wdl | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl new file mode 100644 index 0000000000..64b78992c5 --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl @@ -0,0 +1,55 @@ +version 1.0 + +workflow TestCountVariants { + + input { + File vcf_path + File vcf_index_path + } + + # Lift over the array to hg38. 
+ call CountVariantsTest { + input: + vcf = vcf_path, + vcf_index = vcf_index_path + } + + output { + File test1 = CountVariantsTest.test1 + File test2 = CountVariantsTest.test2 + } +} + +task CountVariantsTest { + input { + File vcf + File vcf_index + + + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int cpu = 1 + Int memory_mb = 8000 + Int disk_size_gb = 2 * ceil(size([vcf, vcf_index], "GiB")) + 20 + } + Int command_mem = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + set -e -o pipefail + + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} > test_1.txt + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2> test_2.txt + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 > test_3.txt + >>> + output { + File test1 = "test_1.txt" + File test2 = "test_2.txt" + File test3 = "test_3.txt" + } + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } +} \ No newline at end of file From 99f90e22e28111c65ea8354f31a8ed046d7f4f0c Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 3 Apr 2024 20:53:52 -0400 Subject: [PATCH 44/92] add output to test --- pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl index 64b78992c5..03f3bf9c4a 100644 --- a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl +++ b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl @@ -17,6 +17,7 @@ workflow TestCountVariants { output { File test1 = CountVariantsTest.test1 File test2 = CountVariantsTest.test2 + File test3 = CountVariantsTest.test3 } } From e6a5b0583d4f8c74013e9c3b3397beac026cb8f6 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 3 Apr 2024 21:32:53 -0400 Subject: [PATCH 
45/92] next test --- .../broad/arrays/imputation_beagle/test_CountVariants.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl index 03f3bf9c4a..c2be0fed90 100644 --- a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl +++ b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl @@ -38,9 +38,9 @@ task CountVariantsTest { command <<< set -e -o pipefail - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} > test_1.txt - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2> test_2.txt - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 > test_3.txt + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | tail -n 1 > test_1.txt + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} > | tail -n 1 > test_2.txt + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 > | tail -n 1 > test_3.txt >>> output { File test1 = "test_1.txt" From 3d44349b0cb8c8f82f82cef0fe1ac95a4b0a5505 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 3 Apr 2024 22:00:02 -0400 Subject: [PATCH 46/92] more test --- .../arrays/imputation_beagle/test_CountVariants.wdl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl index c2be0fed90..ee3ceb4c69 100644 --- a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl +++ b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl @@ -38,9 +38,16 @@ task CountVariantsTest { command <<< set -e -o pipefail - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | tail -n 1 > test_1.txt + gatk 
--java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} > test_0.txt + cat test_0.txt | tail -n 1 > test_1.txt + echo "test_1:" + cat test_1.txt gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} > | tail -n 1 > test_2.txt + echo "test_2:" + cat test_2.txt gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 > | tail -n 1 > test_3.txt + echo "test_3:" + cat test_3.txt >>> output { File test1 = "test_1.txt" From 3ef57a855d1eb813a45b9f45ec79f52a1b0b910f Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 4 Apr 2024 10:09:21 -0400 Subject: [PATCH 47/92] another test --- .../broad/arrays/imputation_beagle/test_CountVariants.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl index ee3ceb4c69..11eed1139b 100644 --- a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl +++ b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl @@ -42,10 +42,10 @@ task CountVariantsTest { cat test_0.txt | tail -n 1 > test_1.txt echo "test_1:" cat test_1.txt - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} > | tail -n 1 > test_2.txt + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | tail -n 1 > test_2.txt echo "test_2:" cat test_2.txt - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 > | tail -n 1 > test_3.txt + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 | tail -n 1 > test_3.txt echo "test_3:" cat test_3.txt >>> From 56c6469c411697c76ac292892febbfd07206b8de Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 4 Apr 2024 10:31:05 -0400 Subject: [PATCH 48/92] update real task --- tasks/broad/ImputationTasks.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index b7d671f0b6..98ed464f43 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -313,8 +313,8 @@ task CountVariantsInChunksBeagle { command <<< set -e -o pipefail - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 | tail -n 1 > var_in_original - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} -L ~{panel_interval_list} 2>&1 | tail -n 1 > var_also_in_reference + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | tail -n 1 > var_in_original + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} -L ~{panel_interval_list} | tail -n 1 > var_also_in_reference >>> output { Int var_in_original = read_int("var_in_original") From 20fe39367e5c2200dab875caa256a2514bd1941b Mon Sep 17 00:00:00 2001 From: jsotobroad Date: Sat, 27 Apr 2024 15:53:25 -0400 Subject: [PATCH 49/92] TSPS-226 presplit and prechunk beagle inputs (#1272) *pre splitting and prechunking beagle imputation inputs to lower log numbers and storage account egress --------- Co-authored-by: Jose Soto --- .../ImputationBeaglePreChunk.wdl | 249 ++++++++++++++++++ .../imputation_beagle/test_CountVariants.wdl | 63 ----- .../test_empty_write_lines_input.wdl | 61 ----- tasks/broad/ImputationTasks.wdl | 166 +++++++++++- 4 files changed, 412 insertions(+), 127 deletions(-) create mode 100644 pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl delete mode 100644 pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl delete mode 100644 pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl new file mode 100644 index 0000000000..622c684274 --- /dev/null +++ 
b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -0,0 +1,249 @@ +version 1.0 + +import "../../../../tasks/broad/ImputationTasks.wdl" as tasks +import "../../../../tasks/broad/Utilities.wdl" as utils + +workflow ImputationBeaglePreChunk { + + String pipeline_version = "0.0.1" + + input { + Int chunkLength = 25000000 + Int chunkOverlaps = 5000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects + + File multi_sample_vcf + File multi_sample_vcf_index + + Boolean perform_extra_qc_steps = false # these are optional additional extra QC steps from Amit's group that should only be + # run for large sample sets, especially a diverse set of samples (it's further limiting called at sites to 95% and by HWE) + Float optional_qc_max_missing = 0.05 + Float optional_qc_hwe = 0.000001 + File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths + Array[String] contigs + String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs + String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs + String output_callset_name # the output callset name + Boolean split_output_to_single_sample = false + + Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass + + # file extensions used to find reference panel files + String interval_list_suffix = ".interval_list" + String bref3_suffix = ".bref3" + } + + call tasks.CountSamples { + input: + vcf = multi_sample_vcf, + } + + call tasks.PreSplitVcf { + input: + contigs = contigs, + vcf = multi_sample_vcf, + vcf_index = multi_sample_vcf_index + } + + scatter (contig_index in range(length(contigs))) { + # these are specific to hg38 - contig is format 'chr1' + String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged." 
+ contigs[contig_index] + ".merged.AN_added.bcf.ac2" + String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" + + ReferencePanelContig referencePanelContig = { + "interval_list": reference_filename + interval_list_suffix, + "bref3": reference_filename + bref3_suffix, + "contig": contigs[contig_index], + "genetic_map": genetic_map_filename + } + + call tasks.CalculateChromosomeLength { + input: + ref_dict = ref_dict, + chrom = referencePanelContig.contig + } + + call tasks.PreChunkVcf { + input: + chromosome_length=CalculateChromosomeLength.chrom_length, + chunk_length = chunkLength, + chunk_overlap = chunkOverlaps, + chrom = contigs[contig_index], + vcf = PreSplitVcf.chr_split_vcfs[contig_index], + vcf_index = PreSplitVcf.chr_split_vcf_indices[contig_index] + } + + scatter (i in range(length(PreChunkVcf.generate_chunk_vcfs))) { + String chunk_contig = referencePanelContig.contig + String chunk_basename = referencePanelContig.contig + "_chunk_" + i + + Int start = PreChunkVcf.starts[i] + Int end = PreChunkVcf.ends[i] + + if (perform_extra_qc_steps) { + call tasks.OptionalQCSites { + input: + input_vcf = PreChunkVcf.generate_chunk_vcfs[i], + input_vcf_index = PreChunkVcf.generate_chunk_vcf_indices[i], + output_vcf_basename = chunk_basename, + optional_qc_max_missing = optional_qc_max_missing, + optional_qc_hwe = optional_qc_hwe + } + } + + call tasks.CountVariantsInChunksBeagle { + input: + vcf = select_first([OptionalQCSites.output_vcf, PreChunkVcf.generate_chunk_vcfs[i]]), + vcf_index = select_first([OptionalQCSites.output_vcf_index, PreChunkVcf.generate_chunk_vcf_indices[i]]), + panel_interval_list = referencePanelContig.interval_list + } + + call tasks.CheckChunksBeagle { + input: + var_in_original = CountVariantsInChunksBeagle.var_in_original, + var_also_in_reference = CountVariantsInChunksBeagle.var_also_in_reference + } + + call tasks.SetIDs as SetIdsVcfToImpute { + input: + vcf = PreChunkVcf.subset_vcfs[i], 
+ output_basename = "input_samples_with_variant_ids" + } + + call tasks.ExtractIDs as ExtractIdsVcfToImpute { + input: + vcf = SetIdsVcfToImpute.output_vcf, + output_basename = "imputed_sites" + } + + if (CheckChunksBeagle.valid) { + call tasks.PhaseAndImputeBeagle { + input: + dataset_vcf = select_first([OptionalQCSites.output_vcf, PreChunkVcf.generate_chunk_vcfs[i]]), + ref_panel_bref3 = referencePanelContig.bref3, + chrom = referencePanelContig.contig, + basename = chunk_basename, + genetic_map_file = referencePanelContig.genetic_map, + start = start, + end = end + } + + call tasks.UpdateHeader { + input: + vcf = PhaseAndImputeBeagle.vcf, + vcf_index = PhaseAndImputeBeagle.vcf_index, + ref_dict = ref_dict, + basename = chunk_basename + "_imputed" + } + + call tasks.SeparateMultiallelics { + input: + original_vcf = UpdateHeader.output_vcf, + original_vcf_index = UpdateHeader.output_vcf_index, + output_basename = chunk_basename + "_imputed" + } + + call tasks.RemoveSymbolicAlleles { + input: + original_vcf = SeparateMultiallelics.output_vcf, + original_vcf_index = SeparateMultiallelics.output_vcf_index, + output_basename = chunk_basename + "_imputed" + } + + call tasks.SetIDs { + input: + vcf = RemoveSymbolicAlleles.output_vcf, + output_basename = chunk_basename + "_imputed" + } + + call tasks.ExtractIDs { + input: + vcf = SetIDs.output_vcf, + output_basename = "imputed_sites" + } + } + call tasks.FindSitesUniqueToFileTwoOnly { + input: + file1 = select_first([ExtractIDs.ids, write_lines([])]), + file2 = ExtractIdsVcfToImpute.ids + } + + call tasks.SelectVariantsByIds { + input: + vcf = SetIdsVcfToImpute.output_vcf, + vcf_index = SetIdsVcfToImpute.output_vcf_index, + ids = FindSitesUniqueToFileTwoOnly.missing_sites, + basename = "imputed_sites_to_recover" + } + + call tasks.RemoveAnnotations { + input: + vcf = SelectVariantsByIds.output_vcf, + basename = "imputed_sites_to_recover_annotations_removed" + } + + call tasks.InterleaveVariants { + input: + vcfs = 
select_all([RemoveAnnotations.output_vcf, SetIDs.output_vcf]), + basename = output_callset_name + } + } + + Array[File] chromosome_vcfs = select_all(InterleaveVariants.output_vcf) + } + + call tasks.GatherVcfs { + input: + input_vcfs = flatten(chromosome_vcfs), + output_vcf_basename = output_callset_name + ".imputed" + } + + call tasks.StoreChunksInfo { + input: + chroms = flatten(chunk_contig), + starts = flatten(start), + ends = flatten(end), + vars_in_array = flatten(CountVariantsInChunksBeagle.var_in_original), + vars_in_panel = flatten(CountVariantsInChunksBeagle.var_also_in_reference), + valids = flatten(CheckChunksBeagle.valid), + basename = output_callset_name + } + + Int n_failed_chunks_int = read_int(StoreChunksInfo.n_failed_chunks) + + if (n_failed_chunks_int >= chunks_fail_threshold) { + call utils.ErrorWithMessage as FailQCNChunks { + input: + message = n_failed_chunks_int + " chunks failed imputation, QC threshold was set to " + chunks_fail_threshold + } + } + + if (split_output_to_single_sample) { + call tasks.SplitMultiSampleVcf { + input: + multiSampleVcf = GatherVcfs.output_vcf, + nSamples = CountSamples.nSamples + } + } + + output { + Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs + Array[File]? 
imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices + File imputed_multi_sample_vcf = GatherVcfs.output_vcf + File imputed_multi_sample_vcf_index = GatherVcfs.output_vcf_index + File chunks_info = StoreChunksInfo.chunks_info + File failed_chunks = StoreChunksInfo.failed_chunks + File n_failed_chunks = StoreChunksInfo.n_failed_chunks + } + + meta { + allowNestedInputs: true + } + +} + +struct ReferencePanelContig { + File interval_list + File bref3 + String contig + File genetic_map +} diff --git a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl b/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl deleted file mode 100644 index 11eed1139b..0000000000 --- a/pipelines/broad/arrays/imputation_beagle/test_CountVariants.wdl +++ /dev/null @@ -1,63 +0,0 @@ -version 1.0 - -workflow TestCountVariants { - - input { - File vcf_path - File vcf_index_path - } - - # Lift over the array to hg38. - call CountVariantsTest { - input: - vcf = vcf_path, - vcf_index = vcf_index_path - } - - output { - File test1 = CountVariantsTest.test1 - File test2 = CountVariantsTest.test2 - File test3 = CountVariantsTest.test3 - } -} - -task CountVariantsTest { - input { - File vcf - File vcf_index - - - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - Int cpu = 1 - Int memory_mb = 8000 - Int disk_size_gb = 2 * ceil(size([vcf, vcf_index], "GiB")) + 20 - } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 - - command <<< - set -e -o pipefail - - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} > test_0.txt - cat test_0.txt | tail -n 1 > test_1.txt - echo "test_1:" - cat test_1.txt - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | tail -n 1 > test_2.txt - echo "test_2:" - cat test_2.txt - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} 2>&1 | tail -n 1 > test_3.txt - echo "test_3:" - cat test_3.txt - >>> - 
output { - File test1 = "test_1.txt" - File test2 = "test_2.txt" - File test3 = "test_3.txt" - } - runtime { - docker: gatk_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - } -} \ No newline at end of file diff --git a/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl b/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl deleted file mode 100644 index 24b6d0a31d..0000000000 --- a/pipelines/broad/arrays/imputation_beagle/test_empty_write_lines_input.wdl +++ /dev/null @@ -1,61 +0,0 @@ -version 1.0 - -# testing ToA and write_lines([]) as task input -workflow test_empty_write_lines_input { - - input { - File write_lines_at_wdl_input = write_lines([]) - File? undefined_file - } - - # use file defined at wdl input - call LocalizeFile as LocalizeEmptyFileFromWdlInput { - input: - input_file = write_lines_at_wdl_input - } - - # use file defined at task input - call LocalizeFile as LocalizeEmptyFileFromTaskInput { - input: - input_file = write_lines([]) - } - - # use file with select_first - call LocalizeFile as LocalizeEmptyFileWithSelectFirst { - input: - input_file = select_first([undefined_file, write_lines([])]) - } - - # use file generated in an if block - Boolean run_block = false - if (run_block) { - File file_from_block = write_lines(["foo"]) - } - call LocalizeFile as LocalizeEmptyFileWithIfBlock { - input: - input_file = select_first([file_from_block, write_lines(["foo"])]) - } -} - -task LocalizeFile { - input { - File input_file - } - - command <<< - set -euo pipefail - - cat ~{input_file} | wc -l > num_lines.txt - >>> - - runtime { - docker: "ubuntu:20.04" - memory: "2 GiB" - cpu: "1" - disks: "local-disk 8 HDD" - } - - output { - Int num_lines = read_int("num_lines.txt") - } -} diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 98ed464f43..5f57b6fd94 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ 
-313,8 +313,11 @@ task CountVariantsInChunksBeagle { command <<< set -e -o pipefail - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | tail -n 1 > var_in_original - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} -L ~{panel_interval_list} | tail -n 1 > var_also_in_reference + ln -sf ~{vcf} input.vcf.gz + ln -sf ~{vcf_index} input.vcf.gz.tbi + + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V input.vcf.gz | tail -n 1 > var_in_original + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V input.vcf.gz -L ~{panel_interval_list} | tail -n 1 > var_also_in_reference >>> output { Int var_in_original = read_int("var_in_original") @@ -587,9 +590,11 @@ task OptionalQCSites { Float hwe = select_first([optional_qc_hwe, 0.000001]) command <<< set -e -o pipefail + ln -sf ~{input_vcf} input.vcf.gz + ln -sf ~{input_vcf_index} input.vcf.gz.tbi # site missing rate < 5% ; hwe p > 1e-6 - tools --gzvcf ~{input_vcf} --max-missing ~{max_missing} --hwe ~{hwe} --recode -c | bgzip -c > ~{output_vcf_basename}.vcf.gz + tools --gzvcf input.vcf.gz --max-missing ~{max_missing} --hwe ~{hwe} --recode -c | bgzip -c > ~{output_vcf_basename}.vcf.gz bcftools index -t ~{output_vcf_basename}.vcf.gz # Note: this is necessary because vcftools doesn't have a way to output a zipped vcf, nor a way to index one (hence needing to use bcf). 
>>> runtime { @@ -1052,3 +1057,158 @@ task SplitMultiSampleVcf { Array[File] single_sample_vcf_indices = glob("out_dir/*.vcf.gz.tbi") } } + +task PreSplitVcf { + input { + Array[String] contigs + File vcf + File vcf_index + + Int disk_size_gb = ceil(3*size(vcf, "GiB")) + 50 + Int cpu = 1 + Int memory_mb = 8000 + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + Int command_mem = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command { + set -e -o pipefail + + mkdir split_vcfs + + CONTIG_FILE=~{write_lines(contigs)} + i=0 + + while read -r line; + do + + SPLIT=$(printf "%03d" $i) + echo "SPLIT: $SPLIT" + + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + SelectVariants \ + -V ~{vcf} \ + -L $line \ + -O split_vcfs/split_chr_$SPLIT.vcf.gz + + i=$(($i + 1)) + + done < $CONTIG_FILE + } + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } + output { + Array[File] chr_split_vcfs = glob("split_vcfs/*.vcf.gz") + Array[File] chr_split_vcf_indices = glob("split_vcfs/*.vcf.gz.tbi") + } +} + +task PreChunkVcf { + input { + Int chromosome_length + Int chunk_length + Int chunk_overlap + String chrom + File vcf + File vcf_index + Boolean exclude_filtered = false + + Int disk_size_gb = ceil(4*size(vcf, "GiB")) + 50 + Int cpu = 1 + Int memory_mb = 8000 + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + Int command_mem = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command { + set -e -o pipefail + + ln -sf ~{vcf} input.vcf.gz + ln -sf ~{vcf_index} input.vcf.gz.tbi + + mkdir generate_chunk + mkdir subset_vcf + + CHROM_LENGTH=~{chromosome_length} + CHUNK_LENGTH=~{chunk_length} + CHUNK_OVERLAPS=~{chunk_overlap} + i=0 + LOOP_DRIVER=$(( $i * $CHUNK_LENGTH + 1 )) + + while [ $LOOP_DRIVER -lt $CHROM_LENGTH ] + do + START=$(( $i * $CHUNK_LENGTH + 1 )) + START_OVERLAP_CHECK=$(($START - $CHUNK_OVERLAPS)) + if [ $START_OVERLAP_CHECK -lt 1 ]; then + 
START_WITH_OVERLAPS=$START + else + START_WITH_OVERLAPS=$(($START - $CHUNK_OVERLAPS)) + fi + echo "START: $START" + echo "START WITH OVERLAPS: $START_WITH_OVERLAPS" + + END_CHECK=$(( ($i + 1) * $CHUNK_LENGTH )) + if [ $END_CHECK -gt $CHROM_LENGTH ]; then + END=$CHROM_LENGTH + else + END=$(( ($i + 1) * $CHUNK_LENGTH )) + fi + + END_OVERLAP_CHECK=$(( $END + $CHUNK_OVERLAPS )) + if [ $END_OVERLAP_CHECK -gt $CHROM_LENGTH ]; then + END_WITH_OVERLAPS=$CHROM_LENGTH + else + END_WITH_OVERLAPS=$(( $END + $CHUNK_OVERLAPS )) + fi + echo "END: $END" + echo "END WITH OVERLAPS: $END_WITH_OVERLAPS" + + CHUNK=$(printf "%03d" $i) + echo "CHUNK: $CHUNK" + + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + SelectVariants \ + -V input.vcf.gz \ + --select-type-to-include SNP \ + --max-nocall-fraction 0.1 \ + -xl-select-type SYMBOLIC \ + --select-type-to-exclude MIXED \ + --restrict-alleles-to BIALLELIC \ + -L ~{chrom}:$START_WITH_OVERLAPS-$END_WITH_OVERLAPS \ + -O generate_chunk/~{chrom}_generate_chunk_$CHUNK.vcf.gz \ + --exclude-filtered true + + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + SelectVariants \ + -V input.vcf.gz \ + -L ~{chrom}:$START-$END \ + -select "POS >= $START" ~{if exclude_filtered then "--exclude-filtered" else ""} \ + -O subset_vcf/~{chrom}_subset_chunk_$CHUNK.vcf.gz + + echo $START >> start.txt + echo $END >> end.txt + + i=$(($i + 1)) + LOOP_DRIVER=$(( $i * $CHUNK_LENGTH + 1 )) + done + } + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } + output { + Array[File] generate_chunk_vcfs = glob("generate_chunk/*.vcf.gz") + Array[File] generate_chunk_vcf_indices = glob("generate_chunk/*.vcf.gz.tbi") + Array[File] subset_vcfs = glob("subset_vcf/*.vcf.gz") + Array[String] starts = read_lines("start.txt") + Array[String] ends = read_lines("end.txt") + } +} From cd6134e0c14cb0ee50c8170a7e8b3114a6c95af4 Mon Sep 17 00:00:00 2001 From: jsotobroad Date: Mon, 20 May 2024 
11:28:52 -0400 Subject: [PATCH 50/92] TSPS-221 remove index input and add seed to make beagle tool deterministic (#1285) * remove multi sample vcf index workflow input and add it to the PreSplitVcf task. add seed number so that beagle is always deterministic. add comment to cpu input for PhaseAndImputeBeagle task * change output_callset_name to output_base_name and remove optional outputs * change n_failed_chunks ticket to an int --------- Co-authored-by: Jose Soto --- .../ImputationBeaglePreChunk.wdl | 16 ++++++---------- tasks/broad/ImputationTasks.wdl | 9 ++++++--- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 622c684274..d17b2cc80a 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -12,7 +12,6 @@ workflow ImputationBeaglePreChunk { Int chunkOverlaps = 5000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects File multi_sample_vcf - File multi_sample_vcf_index Boolean perform_extra_qc_steps = false # these are optional additional extra QC steps from Amit's group that should only be # run for large sample sets, especially a diverse set of samples (it's further limiting called at sites to 95% and by HWE) @@ -22,7 +21,7 @@ workflow ImputationBeaglePreChunk { Array[String] contigs String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs - String output_callset_name # the output callset name + String output_basename # the basename for intermediate and output files Boolean split_output_to_single_sample = false Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass @@ 
-40,8 +39,7 @@ workflow ImputationBeaglePreChunk { call tasks.PreSplitVcf { input: contigs = contigs, - vcf = multi_sample_vcf, - vcf_index = multi_sample_vcf_index + vcf = multi_sample_vcf } scatter (contig_index in range(length(contigs))) { @@ -184,7 +182,7 @@ workflow ImputationBeaglePreChunk { call tasks.InterleaveVariants { input: vcfs = select_all([RemoveAnnotations.output_vcf, SetIDs.output_vcf]), - basename = output_callset_name + basename = output_basename } } @@ -194,7 +192,7 @@ workflow ImputationBeaglePreChunk { call tasks.GatherVcfs { input: input_vcfs = flatten(chromosome_vcfs), - output_vcf_basename = output_callset_name + ".imputed" + output_vcf_basename = output_basename + ".imputed" } call tasks.StoreChunksInfo { @@ -205,7 +203,7 @@ workflow ImputationBeaglePreChunk { vars_in_array = flatten(CountVariantsInChunksBeagle.var_in_original), vars_in_panel = flatten(CountVariantsInChunksBeagle.var_also_in_reference), valids = flatten(CheckChunksBeagle.valid), - basename = output_callset_name + basename = output_basename } Int n_failed_chunks_int = read_int(StoreChunksInfo.n_failed_chunks) @@ -226,13 +224,11 @@ workflow ImputationBeaglePreChunk { } output { - Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs - Array[File]? 
imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices File imputed_multi_sample_vcf = GatherVcfs.output_vcf File imputed_multi_sample_vcf_index = GatherVcfs.output_vcf_index File chunks_info = StoreChunksInfo.chunks_info File failed_chunks = StoreChunksInfo.failed_chunks - File n_failed_chunks = StoreChunksInfo.n_failed_chunks + Int n_failed_chunks = n_failed_chunks_int } meta { diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 5f57b6fd94..edddf3714f 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -371,7 +371,7 @@ task PhaseAndImputeBeagle { Int end # not needed if ref file has been chunked and you are using the entire chunk String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" - Int cpu = 8 # This parameter can be higher or lower + Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. 
Changing this value may change the output generated by the tool Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed Int xmx_mb = 29000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 50 # value may need to be adjusted @@ -388,7 +388,8 @@ task PhaseAndImputeBeagle { out=imputed_~{basename} \ chrom=~{chrom}:~{start}-~{end} \ impute=true \ - nthreads=~{cpu} + nthreads=~{cpu} \ + seed=-99999 # notes: # rename output file to "phased_{basename}" if phasing without imputing @@ -1062,7 +1063,6 @@ task PreSplitVcf { input { Array[String] contigs File vcf - File vcf_index Int disk_size_gb = ceil(3*size(vcf, "GiB")) + 50 Int cpu = 1 @@ -1075,6 +1075,9 @@ task PreSplitVcf { command { set -e -o pipefail + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + IndexFeatureFile -I ~{vcf} + mkdir split_vcfs CONTIG_FILE=~{write_lines(contigs)} From b063f2bd4cde779d0ac2d335ff7b49034138cc70 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 21 May 2024 10:46:46 -0400 Subject: [PATCH 51/92] rename workflow --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index d17b2cc80a..313895a29b 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -3,7 +3,7 @@ version 1.0 import "../../../../tasks/broad/ImputationTasks.wdl" as tasks import "../../../../tasks/broad/Utilities.wdl" as utils -workflow ImputationBeaglePreChunk { +workflow ImputationBeagle { String pipeline_version = "0.0.1" From 5906ce164de3facd8cd3a3bc097324283efe2a06 Mon Sep 17 00:00:00 2001 From: jsotobroad 
Date: Mon, 3 Jun 2024 19:51:36 -0400 Subject: [PATCH 52/92] TSPS-241 Clean up beagle wdl (#1288) * clean up wdl with stuff from TSPS-241 * try to make fail fast work with double nested scatters --------- Co-authored-by: Jose Soto --- .../ImputationBeaglePreChunk.wdl | 177 +++++++++--------- tasks/broad/ImputationTasks.wdl | 25 ++- tasks/broad/Utilities.wdl | 2 +- 3 files changed, 109 insertions(+), 95 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 313895a29b..e4e9727769 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -13,18 +13,11 @@ workflow ImputationBeagle { File multi_sample_vcf - Boolean perform_extra_qc_steps = false # these are optional additional extra QC steps from Amit's group that should only be - # run for large sample sets, especially a diverse set of samples (it's further limiting called at sites to 95% and by HWE) - Float optional_qc_max_missing = 0.05 - Float optional_qc_hwe = 0.000001 File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths Array[String] contigs String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs String output_basename # the basename for intermediate and output files - Boolean split_output_to_single_sample = false - - Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass # file extensions used to find reference panel files String interval_list_suffix = ".interval_list" @@ -72,26 +65,14 @@ workflow ImputationBeagle { scatter (i in range(length(PreChunkVcf.generate_chunk_vcfs))) { String chunk_contig = referencePanelContig.contig - String 
chunk_basename = referencePanelContig.contig + "_chunk_" + i Int start = PreChunkVcf.starts[i] Int end = PreChunkVcf.ends[i] - if (perform_extra_qc_steps) { - call tasks.OptionalQCSites { - input: - input_vcf = PreChunkVcf.generate_chunk_vcfs[i], - input_vcf_index = PreChunkVcf.generate_chunk_vcf_indices[i], - output_vcf_basename = chunk_basename, - optional_qc_max_missing = optional_qc_max_missing, - optional_qc_hwe = optional_qc_hwe - } - } - call tasks.CountVariantsInChunksBeagle { input: - vcf = select_first([OptionalQCSites.output_vcf, PreChunkVcf.generate_chunk_vcfs[i]]), - vcf_index = select_first([OptionalQCSites.output_vcf_index, PreChunkVcf.generate_chunk_vcf_indices[i]]), + vcf = PreChunkVcf.generate_chunk_vcfs[i], + vcf_index = PreChunkVcf.generate_chunk_vcf_indices[i], panel_interval_list = referencePanelContig.interval_list } @@ -106,69 +87,98 @@ workflow ImputationBeagle { vcf = PreChunkVcf.subset_vcfs[i], output_basename = "input_samples_with_variant_ids" } + } + + call tasks.StoreChunksInfo as StoreContigLevelChunksInfo { + input: + chroms = chunk_contig, + starts = start, + ends = end, + vars_in_array = CountVariantsInChunksBeagle.var_in_original, + vars_in_panel = CountVariantsInChunksBeagle.var_also_in_reference, + valids = CheckChunksBeagle.valid, + basename = output_basename + } + + # if any chunk for any chromosome fail CheckChunks, then we will not impute run any task in the next scatter, + # namely phasing and imputing which would be the most costly to throw away + Int n_failed_chunks_int = read_int(StoreContigLevelChunksInfo.n_failed_chunks) + call tasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks { + input: + errorCount = n_failed_chunks_int, + message = "contig " + referencePanelContig.contig + " had " + n_failed_chunks_int + " failing chunks" + } + + scatter (i in range(length(PreChunkVcf.generate_chunk_vcfs))) { + + String chunk_basename = referencePanelContig.contig + "_chunk_" + i + + Int start2 = PreChunkVcf.starts[i] + Int 
end2 = PreChunkVcf.ends[i] + + call tasks.ExtractIDs as ExtractIdsVcfToImpute { + input: + vcf = SetIdsVcfToImpute.output_vcf[i], + output_basename = "imputed_sites", + for_dependency = FailQCNChunks.done # these shenanigans can be replaced with `after` in wdl 1.1 + } + + call tasks.PhaseAndImputeBeagle { + input: + dataset_vcf = PreChunkVcf.generate_chunk_vcfs[i], + ref_panel_bref3 = referencePanelContig.bref3, + chrom = referencePanelContig.contig, + basename = chunk_basename, + genetic_map_file = referencePanelContig.genetic_map, + start = start2, + end = end2 + } + + call tasks.UpdateHeader { + input: + vcf = PhaseAndImputeBeagle.vcf, + vcf_index = PhaseAndImputeBeagle.vcf_index, + ref_dict = ref_dict, + basename = chunk_basename + "_imputed" + } - call tasks.ExtractIDs as ExtractIdsVcfToImpute { + call tasks.SeparateMultiallelics { input: - vcf = SetIdsVcfToImpute.output_vcf, - output_basename = "imputed_sites" + original_vcf = UpdateHeader.output_vcf, + original_vcf_index = UpdateHeader.output_vcf_index, + output_basename = chunk_basename + "_imputed" } - if (CheckChunksBeagle.valid) { - call tasks.PhaseAndImputeBeagle { - input: - dataset_vcf = select_first([OptionalQCSites.output_vcf, PreChunkVcf.generate_chunk_vcfs[i]]), - ref_panel_bref3 = referencePanelContig.bref3, - chrom = referencePanelContig.contig, - basename = chunk_basename, - genetic_map_file = referencePanelContig.genetic_map, - start = start, - end = end - } - - call tasks.UpdateHeader { - input: - vcf = PhaseAndImputeBeagle.vcf, - vcf_index = PhaseAndImputeBeagle.vcf_index, - ref_dict = ref_dict, - basename = chunk_basename + "_imputed" - } - - call tasks.SeparateMultiallelics { - input: - original_vcf = UpdateHeader.output_vcf, - original_vcf_index = UpdateHeader.output_vcf_index, - output_basename = chunk_basename + "_imputed" - } - - call tasks.RemoveSymbolicAlleles { - input: - original_vcf = SeparateMultiallelics.output_vcf, - original_vcf_index = SeparateMultiallelics.output_vcf_index, 
- output_basename = chunk_basename + "_imputed" - } - - call tasks.SetIDs { - input: - vcf = RemoveSymbolicAlleles.output_vcf, - output_basename = chunk_basename + "_imputed" - } - - call tasks.ExtractIDs { - input: - vcf = SetIDs.output_vcf, - output_basename = "imputed_sites" - } + call tasks.RemoveSymbolicAlleles { + input: + original_vcf = SeparateMultiallelics.output_vcf, + original_vcf_index = SeparateMultiallelics.output_vcf_index, + output_basename = chunk_basename + "_imputed" + } + + call tasks.SetIDs { + input: + vcf = RemoveSymbolicAlleles.output_vcf, + output_basename = chunk_basename + "_imputed" + } + + call tasks.ExtractIDs { + input: + vcf = SetIDs.output_vcf, + output_basename = "imputed_sites", + for_dependency = true } + call tasks.FindSitesUniqueToFileTwoOnly { input: - file1 = select_first([ExtractIDs.ids, write_lines([])]), + file1 = ExtractIDs.ids, file2 = ExtractIdsVcfToImpute.ids } call tasks.SelectVariantsByIds { input: - vcf = SetIdsVcfToImpute.output_vcf, - vcf_index = SetIdsVcfToImpute.output_vcf_index, + vcf = SetIdsVcfToImpute.output_vcf[i], + vcf_index = SetIdsVcfToImpute.output_vcf_index[i], ids = FindSitesUniqueToFileTwoOnly.missing_sites, basename = "imputed_sites_to_recover" } @@ -181,12 +191,12 @@ workflow ImputationBeagle { call tasks.InterleaveVariants { input: - vcfs = select_all([RemoveAnnotations.output_vcf, SetIDs.output_vcf]), + vcfs = [RemoveAnnotations.output_vcf, SetIDs.output_vcf], basename = output_basename } } - Array[File] chromosome_vcfs = select_all(InterleaveVariants.output_vcf) + Array[File] chromosome_vcfs = InterleaveVariants.output_vcf } call tasks.GatherVcfs { @@ -206,29 +216,10 @@ workflow ImputationBeagle { basename = output_basename } - Int n_failed_chunks_int = read_int(StoreChunksInfo.n_failed_chunks) - - if (n_failed_chunks_int >= chunks_fail_threshold) { - call utils.ErrorWithMessage as FailQCNChunks { - input: - message = n_failed_chunks_int + " chunks failed imputation, QC threshold was set to " + 
chunks_fail_threshold - } - } - - if (split_output_to_single_sample) { - call tasks.SplitMultiSampleVcf { - input: - multiSampleVcf = GatherVcfs.output_vcf, - nSamples = CountSamples.nSamples - } - } - output { File imputed_multi_sample_vcf = GatherVcfs.output_vcf File imputed_multi_sample_vcf_index = GatherVcfs.output_vcf_index File chunks_info = StoreChunksInfo.chunks_info - File failed_chunks = StoreChunksInfo.failed_chunks - Int n_failed_chunks = n_failed_chunks_int } meta { diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index edddf3714f..ae81361c8b 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -391,7 +391,7 @@ task PhaseAndImputeBeagle { nthreads=~{cpu} \ seed=-99999 - # notes: + # notes: # rename output file to "phased_{basename}" if phasing without imputing # `chrom` not needed if ref and targ files have been chunked and you are using the entire chunk # set impute=false if you wish to phase without imputing ungenotyped markers @@ -884,6 +884,7 @@ task ExtractIDs { String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 4000 + Boolean for_dependency = true } command <<< bcftools query -f "%ID\n" ~{vcf} -o ~{output_basename}.ids.txt @@ -1215,3 +1216,25 @@ task PreChunkVcf { Array[String] ends = read_lines("end.txt") } } + +task ErrorWithMessageIfErrorCountNotZero { + input { + Int errorCount + String message + } + command <<< + if [[ ~{errorCount} -gt 0 ]]; then + >&2 echo "Error: ~{message}" + exit 1 + else + exit 0 + fi + >>> + + runtime { + docker: "ubuntu:20.04" + } + output { + Boolean done = true + } +} diff --git a/tasks/broad/Utilities.wdl b/tasks/broad/Utilities.wdl index e6a1aeec17..52121a74a9 100644 --- a/tasks/broad/Utilities.wdl +++ b/tasks/broad/Utilities.wdl @@ -300,4 +300,4 @@ task GetValidationInputs { Array[String] results_files = read_lines("results_files.txt") } -} \ No newline at end of file +} 
From 8372d3b14aaba48fb3102005d8a4e18f40280f84 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 6 Jun 2024 15:42:11 -0400 Subject: [PATCH 53/92] add specific gatk_docker --- .../ImputationBeaglePreChunk.wdl | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index e4e9727769..14b8a97035 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -22,6 +22,8 @@ workflow ImputationBeagle { # file extensions used to find reference panel files String interval_list_suffix = ".interval_list" String bref3_suffix = ".bref3" + + String gatk_docker = "broadinstitute/gatk-nightly:2024-06-06-4.5.0.0-36-g2a420e483-NIGHTLY-SNAPSHOT" } call tasks.CountSamples { @@ -32,7 +34,8 @@ workflow ImputationBeagle { call tasks.PreSplitVcf { input: contigs = contigs, - vcf = multi_sample_vcf + vcf = multi_sample_vcf, + gatk_docker = gatk_docker } scatter (contig_index in range(length(contigs))) { @@ -60,7 +63,8 @@ workflow ImputationBeagle { chunk_overlap = chunkOverlaps, chrom = contigs[contig_index], vcf = PreSplitVcf.chr_split_vcfs[contig_index], - vcf_index = PreSplitVcf.chr_split_vcf_indices[contig_index] + vcf_index = PreSplitVcf.chr_split_vcf_indices[contig_index], + gatk_docker = gatk_docker } scatter (i in range(length(PreChunkVcf.generate_chunk_vcfs))) { @@ -73,7 +77,8 @@ workflow ImputationBeagle { input: vcf = PreChunkVcf.generate_chunk_vcfs[i], vcf_index = PreChunkVcf.generate_chunk_vcf_indices[i], - panel_interval_list = referencePanelContig.interval_list + panel_interval_list = referencePanelContig.interval_list, + gatk_docker = gatk_docker } call tasks.CheckChunksBeagle { @@ -139,7 +144,8 @@ workflow ImputationBeagle { vcf = PhaseAndImputeBeagle.vcf, vcf_index = PhaseAndImputeBeagle.vcf_index, 
ref_dict = ref_dict, - basename = chunk_basename + "_imputed" + basename = chunk_basename + "_imputed", + gatk_docker = gatk_docker } call tasks.SeparateMultiallelics { @@ -153,7 +159,8 @@ workflow ImputationBeagle { input: original_vcf = SeparateMultiallelics.output_vcf, original_vcf_index = SeparateMultiallelics.output_vcf_index, - output_basename = chunk_basename + "_imputed" + output_basename = chunk_basename + "_imputed", + gatk_docker = gatk_docker } call tasks.SetIDs { @@ -180,7 +187,8 @@ workflow ImputationBeagle { vcf = SetIdsVcfToImpute.output_vcf[i], vcf_index = SetIdsVcfToImpute.output_vcf_index[i], ids = FindSitesUniqueToFileTwoOnly.missing_sites, - basename = "imputed_sites_to_recover" + basename = "imputed_sites_to_recover", + gatk_docker = gatk_docker } call tasks.RemoveAnnotations { @@ -192,7 +200,8 @@ workflow ImputationBeagle { call tasks.InterleaveVariants { input: vcfs = [RemoveAnnotations.output_vcf, SetIDs.output_vcf], - basename = output_basename + basename = output_basename, + gatk_docker = gatk_docker } } @@ -202,7 +211,8 @@ workflow ImputationBeagle { call tasks.GatherVcfs { input: input_vcfs = flatten(chromosome_vcfs), - output_vcf_basename = output_basename + ".imputed" + output_vcf_basename = output_basename + ".imputed", + gatk_docker = gatk_docker } call tasks.StoreChunksInfo { From 827bc4299557d59f6616651bdeecfc8b590d09fb Mon Sep 17 00:00:00 2001 From: jsotobroad Date: Mon, 10 Jun 2024 13:52:38 -0400 Subject: [PATCH 54/92] TSPS-142 updates to help creating simulated reference panel and running imputation against it (#1296) * add optional error count override for testing * rename reference base prefix variable and make it more user friendly --------- Co-authored-by: Jose Soto --- .../imputation_beagle/ImputationBeaglePreChunk.wdl | 12 +++++++----- .../broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 1 - 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl 
b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 14b8a97035..7d79f9c754 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -15,7 +15,7 @@ workflow ImputationBeagle { File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths Array[String] contigs - String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs + String reference_panel_path_prefix # path + file prefix to the bucket where the reference panel files are stored for all contigs String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs String output_basename # the basename for intermediate and output files @@ -24,6 +24,8 @@ workflow ImputationBeagle { String bref3_suffix = ".bref3" String gatk_docker = "broadinstitute/gatk-nightly:2024-06-06-4.5.0.0-36-g2a420e483-NIGHTLY-SNAPSHOT" + + Int? error_count_override } call tasks.CountSamples { @@ -40,12 +42,12 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged." + contigs[contig_index] + ".merged.AN_added.bcf.ac2" + String reference_basename = reference_panel_path_prefix + "." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." 
+ contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { - "interval_list": reference_filename + interval_list_suffix, - "bref3": reference_filename + bref3_suffix, + "interval_list": reference_basename + interval_list_suffix, + "bref3": reference_basename + bref3_suffix, "contig": contigs[contig_index], "genetic_map": genetic_map_filename } @@ -107,7 +109,7 @@ workflow ImputationBeagle { # if any chunk for any chromosome fail CheckChunks, then we will not impute run any task in the next scatter, # namely phasing and imputing which would be the most costly to throw away - Int n_failed_chunks_int = read_int(StoreContigLevelChunksInfo.n_failed_chunks) + Int n_failed_chunks_int = select_first([error_count_override, read_int(StoreContigLevelChunksInfo.n_failed_chunks)]) call tasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks { input: errorCount = n_failed_chunks_int, diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 5db6a2481c..47d3392662 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -62,7 +62,6 @@ task LiftOverArrays { Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size - command <<< set -euo pipefail From b85a7032dc16551540e80e5f0e62c7a2b70a1c81 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 28 Jun 2024 09:43:40 -0400 Subject: [PATCH 55/92] add maxRetries 2 to all imputation beagle tasks --- tasks/broad/ImputationTasks.wdl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index ae81361c8b..3c7f6d4aa4 100644 --- a/tasks/broad/ImputationTasks.wdl +++ 
b/tasks/broad/ImputationTasks.wdl @@ -19,6 +19,7 @@ task CalculateChromosomeLength { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { Int chrom_length = read_int(stdout()) @@ -328,6 +329,7 @@ task CountVariantsInChunksBeagle { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } } @@ -357,6 +359,7 @@ task CheckChunksBeagle { disks: "local-disk 10 HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } } @@ -408,6 +411,7 @@ task PhaseAndImputeBeagle { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } } @@ -441,6 +445,7 @@ task GatherVcfs { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { File output_vcf = "~{output_vcf_basename}.vcf.gz" @@ -507,6 +512,7 @@ task UpdateHeader { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { File output_vcf = "~{basename}.vcf.gz" @@ -541,6 +547,7 @@ task RemoveSymbolicAlleles { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } } @@ -570,6 +577,7 @@ task SeparateMultiallelics { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } } @@ -660,6 +668,7 @@ task CountSamples { Int memory_mb = 3000 Int disk_size_gb = 100 + ceil(size(vcf, "GiB")) } + command <<< bcftools query -l ~{vcf} | wc -l >>> @@ -668,6 +677,7 @@ task CountSamples { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { Int nSamples = read_int(stdout()) @@ -752,6 +762,7 @@ task StoreChunksInfo { memory: "${memory_mb} MiB" cpu: cpu preemptible : 3 + maxRetries: 2 } output { File chunks_info = "~{basename}_chunk_info.tsv" @@ -868,6 +879,7 @@ task SetIDs { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { File output_vcf = 
"~{output_basename}.vcf.gz" @@ -897,6 +909,7 @@ task ExtractIDs { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } } @@ -937,6 +950,7 @@ task SelectVariantsByIds { disks: "local-disk ${disk_size_gb} SSD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { File output_vcf = "~{basename}.vcf.gz" @@ -965,6 +979,7 @@ task RemoveAnnotations { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { File output_vcf = "~{basename}.vcf.gz" @@ -996,6 +1011,7 @@ task InterleaveVariants { disks: "local-disk ${disk_size_gb} SSD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { File output_vcf = "~{basename}.vcf.gz" @@ -1021,6 +1037,7 @@ task FindSitesUniqueToFileTwoOnly { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { File missing_sites = "missing_sites.ids" @@ -1105,6 +1122,7 @@ task PreSplitVcf { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { Array[File] chr_split_vcfs = glob("split_vcfs/*.vcf.gz") @@ -1207,6 +1225,7 @@ task PreChunkVcf { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu + maxRetries: 2 } output { Array[File] generate_chunk_vcfs = glob("generate_chunk/*.vcf.gz") From e64be57b1b84e1df642ed1a33f1445050b2e8412 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 8 Jul 2024 12:49:59 -0400 Subject: [PATCH 56/92] add prechunk wdl to dockstore --- .dockstore.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.dockstore.yml b/.dockstore.yml index 08eff44e17..48610d2d7b 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -87,6 +87,10 @@ workflows: subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl + - name: ImputationBeaglePreChunk + subclass: WDL + primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl + - name: 
CreateImputationRefPanelBeagle subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl From dede6ce654186ee0996875490d2d20ccf9b79541 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Thu, 11 Jul 2024 10:24:16 -0400 Subject: [PATCH 57/92] use acr for default ubuntu image --- tasks/broad/ImputationTasks.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 3c7f6d4aa4..f33ecf7098 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -5,7 +5,7 @@ task CalculateChromosomeLength { File ref_dict String chrom - String ubuntu_docker = "ubuntu:20.04" + String ubuntu_docker = "ubuntu.azurecr.io/ubuntu:20.04" Int memory_mb = 2000 Int cpu = 1 Int disk_size_gb = ceil(2*size(ref_dict, "GiB")) + 5 @@ -31,7 +31,7 @@ task GetMissingContigList { File ref_dict File included_contigs - String ubuntu_docker = "ubuntu:20.04" + String ubuntu_docker = "ubuntu.azurecr.io/ubuntu:20.04" Int memory_mb = 2000 Int cpu = 1 Int disk_size_gb = ceil(2*size(ref_dict, "GiB")) + 5 @@ -1024,7 +1024,7 @@ task FindSitesUniqueToFileTwoOnly { File file1 File file2 - String ubuntu_docker = "ubuntu:20.04" + String ubuntu_docker = "ubuntu.azurecr.io/ubuntu:20.04" Int cpu = 1 Int memory_mb = 4000 Int disk_size_gb = ceil(size(file1, "GiB") + 2*size(file2, "GiB")) + 100 @@ -1251,7 +1251,7 @@ task ErrorWithMessageIfErrorCountNotZero { >>> runtime { - docker: "ubuntu:20.04" + docker: "ubuntu.azurecr.io/ubuntu:20.04" } output { Boolean done = true From 6d94ca706f6db6f56231c550f84019fd7d16e813 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 12 Jul 2024 11:19:04 -0400 Subject: [PATCH 58/92] add preemptible 3 --- tasks/broad/ImputationTasks.wdl | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index f33ecf7098..147e157e25 
100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -20,6 +20,7 @@ task CalculateChromosomeLength { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { Int chrom_length = read_int(stdout()) @@ -330,6 +331,7 @@ task CountVariantsInChunksBeagle { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } } @@ -360,6 +362,7 @@ task CheckChunksBeagle { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } } @@ -412,6 +415,7 @@ task PhaseAndImputeBeagle { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } } @@ -446,6 +450,7 @@ task GatherVcfs { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { File output_vcf = "~{output_vcf_basename}.vcf.gz" @@ -513,6 +518,7 @@ task UpdateHeader { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { File output_vcf = "~{basename}.vcf.gz" @@ -548,6 +554,7 @@ task RemoveSymbolicAlleles { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } } @@ -578,6 +585,7 @@ task SeparateMultiallelics { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } } @@ -721,7 +729,7 @@ task AggregateImputationQCMetrics { disks : "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - preemptible : 3 + preemptible: 3 } output { File aggregated_metrics = "~{basename}_aggregated_imputation_metrics.tsv" @@ -761,7 +769,7 @@ task StoreChunksInfo { disks : "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - preemptible : 3 + preemptible: 3 maxRetries: 2 } output { @@ -800,7 +808,7 @@ task MergeImputationQCMetrics { disks : "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - preemptible : 3 + preemptible: 3 } output { File aggregated_metrics = "~{basename}_aggregated_imputation_metrics.tsv" @@ -880,6 +888,7 @@ task SetIDs { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { File output_vcf = 
"~{output_basename}.vcf.gz" @@ -910,6 +919,7 @@ task ExtractIDs { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } } @@ -951,6 +961,7 @@ task SelectVariantsByIds { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { File output_vcf = "~{basename}.vcf.gz" @@ -980,6 +991,7 @@ task RemoveAnnotations { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { File output_vcf = "~{basename}.vcf.gz" @@ -1012,6 +1024,7 @@ task InterleaveVariants { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { File output_vcf = "~{basename}.vcf.gz" @@ -1038,6 +1051,7 @@ task FindSitesUniqueToFileTwoOnly { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { File missing_sites = "missing_sites.ids" @@ -1123,6 +1137,7 @@ task PreSplitVcf { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { Array[File] chr_split_vcfs = glob("split_vcfs/*.vcf.gz") @@ -1226,6 +1241,7 @@ task PreChunkVcf { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 + preemptible: 3 } output { Array[File] generate_chunk_vcfs = glob("generate_chunk/*.vcf.gz") @@ -1252,6 +1268,7 @@ task ErrorWithMessageIfErrorCountNotZero { runtime { docker: "ubuntu.azurecr.io/ubuntu:20.04" + preemptible: 3 } output { Boolean done = true From 8c43b1bb8904224796dce4e4c68f384e013250d4 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 12 Jul 2024 17:47:25 -0400 Subject: [PATCH 59/92] use acr gatk docker as default --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 7d79f9c754..dc4903e1f1 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -23,7 +23,7 @@ 
workflow ImputationBeagle { String interval_list_suffix = ".interval_list" String bref3_suffix = ".bref3" - String gatk_docker = "broadinstitute/gatk-nightly:2024-06-06-4.5.0.0-36-g2a420e483-NIGHTLY-SNAPSHOT" + String gatk_docker = "terrapublic.azurecr.io/gatk:4.5-squashed" # "broadinstitute/gatk-nightly:2024-06-06-4.5.0.0-36-g2a420e483-NIGHTLY-SNAPSHOT" Int? error_count_override } From c646241ada23516e269a6763ce0273ba9c6b60e7 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 15 Jul 2024 11:52:42 -0400 Subject: [PATCH 60/92] don't use preemptibles on GatherVcfs --- tasks/broad/ImputationTasks.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 147e157e25..1890eff536 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -450,7 +450,6 @@ task GatherVcfs { memory: "${memory_mb} MiB" cpu: cpu maxRetries: 2 - preemptible: 3 } output { File output_vcf = "~{output_vcf_basename}.vcf.gz" From 902969bb290127be7d911f742ec55fce8790891c Mon Sep 17 00:00:00 2001 From: "M. 
Morgan Aster" Date: Mon, 15 Jul 2024 15:25:52 -0400 Subject: [PATCH 61/92] basename fix for imputation beagle ref panel generation (#1332) * try auto specifying chr at end of basename * both tasks * add liftovervcfs to dockstore * allow specifying max mem --- .dockstore.yml | 4 ++++ .../CreateImputationRefPanelBeagle.wdl | 20 +++++++++++++++---- .../arrays/imputation_beagle/LiftoverVcfs.wdl | 10 +++++++--- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index 48610d2d7b..ee31e93d13 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -95,6 +95,10 @@ workflows: subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl + - name: LiftoverVcfs + subclass: WDL + primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl + - name: RNAWithUMIsPipeline subclass: WDL primaryDescriptorPath: /pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl index a446154975..1e1c7821d1 100644 --- a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl @@ -7,16 +7,22 @@ workflow CreateImputationRefPanelBeagle { Array[File] ref_vcf_index Int disk_size + String? output_basename + Boolean make_brefs = true Boolean make_interval_lists = true } scatter (idx in range(length(ref_vcf))) { + Int? chr = idx + 1 + String? 
custom_basename_with_chr = output_basename + ".chr" + chr + if (make_brefs) { call BuildBref3 { input: vcf = ref_vcf[idx], - disk_size = disk_size + disk_size = disk_size, + output_basename = custom_basename_with_chr } } @@ -24,7 +30,8 @@ workflow CreateImputationRefPanelBeagle { call CreateRefPanelIntervalLists { input: ref_panel_vcf = ref_vcf[idx], - ref_panel_vcf_index = ref_vcf_index[idx] + ref_panel_vcf_index = ref_vcf_index[idx], + output_basename = custom_basename_with_chr, } } } @@ -38,10 +45,12 @@ workflow CreateImputationRefPanelBeagle { task BuildBref3 { input { File vcf + String? output_basename Int disk_size } - String name = basename(vcf, ".vcf.gz") + String name_from_file = basename(vcf, ".vcf.gz") + String name = select_first([output_basename, name_from_file]) command <<< java -jar /usr/gitc/bref3.22Jul22.46e.jar ~{vcf} > ~{name}.bref3 @@ -64,6 +73,8 @@ task CreateRefPanelIntervalLists { File ref_panel_vcf File ref_panel_vcf_index + String? output_basename + Int disk_size_gb = ceil(2*size(ref_panel_vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here Int cpu = 1 Int memory_mb = 8000 @@ -73,7 +84,8 @@ task CreateRefPanelIntervalLists { Int command_mem = memory_mb - 1000 Int max_heap = memory_mb - 500 - String basename = basename(ref_panel_vcf, '.vcf.gz') + String name_from_file = basename(ref_panel_vcf, ".vcf.gz") + String basename = select_first([output_basename, name_from_file]) command { gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 47d3392662..b8742d3817 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -13,6 +13,7 @@ workflow LiftoverVcfs { String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.1" Int min_disk_size = 100 + Int mem_gb = 16 File 
hg38_reference_fasta File hg38_reference_fasta_index @@ -36,7 +37,8 @@ workflow LiftoverVcfs { docker = docker, max_retries = max_retries, preemptible_tries = preemptible_tries, - min_disk_size = min_disk_size + min_disk_size = min_disk_size, + mem_gb = mem_gb } output { @@ -57,15 +59,17 @@ task LiftOverArrays { Int max_retries Int preemptible_tries Int min_disk_size + Int mem_gb } Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size + Int max_mem_gb = mem_gb - 1 command <<< set -euo pipefail - gatk --java-options "-Xms4g -Xmx15g" \ + gatk --java-options "-Xms4g -Xmx~{max_mem_gb}g" \ LiftoverVcf \ --INPUT ~{input_vcf} \ --OUTPUT ~{output_basename}.liftedover.vcf \ @@ -83,7 +87,7 @@ task LiftOverArrays { runtime { docker: docker - memory: "16 GiB" + memory: "~{mem_gb} GiB" cpu: "1" disks: "local-disk ~{disk_size} HDD" maxRetries: max_retries From 4130623d8fc6c17d7fa8b7bb1393e0882dd15808 Mon Sep 17 00:00:00 2001 From: "M. 
Morgan Aster" Date: Fri, 19 Jul 2024 12:51:08 -0400 Subject: [PATCH 62/92] TSPS-269 Speed up CountVariantsInChunksBeagle by using bedtools (#1335) * try creating bed files * try again * try again again * a different thing * use bedtools and bed ref panel files * oops update the correct task * fix * use the right freaking file name * remove comment --- .../CreateImputationRefPanelBeagle.wdl | 59 ++++++++++++++++--- .../ImputationBeaglePreChunk.wdl | 8 +-- tasks/broad/ImputationTasks.wdl | 9 +-- 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl index 1e1c7821d1..6fee942de2 100644 --- a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl @@ -11,6 +11,7 @@ workflow CreateImputationRefPanelBeagle { Boolean make_brefs = true Boolean make_interval_lists = true + Boolean make_bed_files = true } scatter (idx in range(length(ref_vcf))) { @@ -26,19 +27,28 @@ workflow CreateImputationRefPanelBeagle { } } - if (make_interval_lists) { - call CreateRefPanelIntervalLists { - input: - ref_panel_vcf = ref_vcf[idx], - ref_panel_vcf_index = ref_vcf_index[idx], - output_basename = custom_basename_with_chr, - } + if (make_interval_lists || make_bed_files) { + call CreateRefPanelIntervalLists { + input: + ref_panel_vcf = ref_vcf[idx], + ref_panel_vcf_index = ref_vcf_index[idx], + output_basename = custom_basename_with_chr } + } + + if (make_bed_files) { + File interval_list = select_first([CreateRefPanelIntervalLists.interval_list]) + call CreateRefPanelBedFiles { + input: + ref_panel_interval_list = interval_list + } + } } output { Array[File?] bref3s = BuildBref3.out_bref3 Array[File?] interval_lists = CreateRefPanelIntervalLists.interval_list + Array[File?] 
bed_files = CreateRefPanelBedFiles.bed_file } } @@ -105,3 +115,38 @@ task CreateRefPanelIntervalLists { cpu: cpu } } + +task CreateRefPanelBedFiles { + input { + File ref_panel_interval_list + + Int disk_size_gb = ceil(2*size(ref_panel_interval_list, "GiB")) + 50 + Int cpu = 1 + Int memory_mb = 8000 + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + + Int command_mem = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + String basename = basename(ref_panel_interval_list, ".interval_list") + + + command { + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + IntervalListToBed \ + -I ~{ref_panel_interval_list} \ + -O ~{basename}.bed + } + + output { + File bed_file = "~{basename}.bed" + } + + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + } +} diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index dc4903e1f1..4e9814fd06 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -20,7 +20,7 @@ workflow ImputationBeagle { String output_basename # the basename for intermediate and output files # file extensions used to find reference panel files - String interval_list_suffix = ".interval_list" + String bed_suffix = ".bed" String bref3_suffix = ".bref3" String gatk_docker = "terrapublic.azurecr.io/gatk:4.5-squashed" # "broadinstitute/gatk-nightly:2024-06-06-4.5.0.0-36-g2a420e483-NIGHTLY-SNAPSHOT" @@ -46,7 +46,7 @@ workflow ImputationBeagle { String genetic_map_filename = genetic_maps_path + "plink." 
+ contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { - "interval_list": reference_basename + interval_list_suffix, + "bed": reference_basename + bed_suffix, "bref3": reference_basename + bref3_suffix, "contig": contigs[contig_index], "genetic_map": genetic_map_filename @@ -79,7 +79,7 @@ workflow ImputationBeagle { input: vcf = PreChunkVcf.generate_chunk_vcfs[i], vcf_index = PreChunkVcf.generate_chunk_vcf_indices[i], - panel_interval_list = referencePanelContig.interval_list, + panel_bed_file = referencePanelContig.bed, gatk_docker = gatk_docker } @@ -241,7 +241,7 @@ workflow ImputationBeagle { } struct ReferencePanelContig { - File interval_list + File bed File bref3 String contig File genetic_map diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 1890eff536..89ce22920f 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -161,7 +161,7 @@ task CountVariantsInChunks { set -e -o pipefail echo $(gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} | sed 's/Tool returned://') > var_in_original - echo $(gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} -L ~{panel_vcf} | sed 's/Tool returned://') > var_in_reference + echo $(gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V ~{vcf} -L ~{panel_vcf} | sed 's/Tool returned://') > var_in_reference >>> output { Int var_in_original = read_int("var_in_original") @@ -302,12 +302,12 @@ task CountVariantsInChunksBeagle { input { File vcf File vcf_index - File panel_interval_list + File panel_bed_file String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 Int memory_mb = 8000 - Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_interval_list], "GiB")) + 20 + Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_bed_file], "GiB")) + 20 } Int command_mem = memory_mb - 1000 Int max_heap = memory_mb - 500 @@ -319,8 +319,9 
@@ task CountVariantsInChunksBeagle { ln -sf ~{vcf_index} input.vcf.gz.tbi gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V input.vcf.gz | tail -n 1 > var_in_original - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V input.vcf.gz -L ~{panel_interval_list} | tail -n 1 > var_also_in_reference + bedtools intersect -a ~{vcf} -b ~{panel_bed_file} | wc -l > var_also_in_reference >>> + output { Int var_in_original = read_int("var_in_original") Int var_also_in_reference = read_int("var_also_in_reference") From 4b7fcfee3bc79706c6592df5a4c6de0e03c54b91 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Fri, 26 Jul 2024 11:14:36 -0400 Subject: [PATCH 63/92] update pipeline version to 0.0.2 --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 4e9814fd06..ead2e53b71 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -5,7 +5,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow ImputationBeagle { - String pipeline_version = "0.0.1" + String pipeline_version = "0.0.2" input { Int chunkLength = 25000000 From 51adbd934d13e30f25f1f44f4a3099451042410b Mon Sep 17 00:00:00 2001 From: "M. 
Morgan Aster" Date: Mon, 5 Aug 2024 10:11:58 -0400 Subject: [PATCH 64/92] TSPS-293: Fix up streaming imputation beagle (#1347) update ImputationBeagle --- .../imputation_beagle/ImputationBeagle.wdl | 237 +++++++++--------- .../ImputationBeaglePreChunk.wdl | 2 +- tasks/broad/ImputationTasks.wdl | 38 +++ 3 files changed, 164 insertions(+), 113 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index a9a01e0606..77d493ca7c 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -12,87 +12,85 @@ workflow ImputationBeagle { Int chunkOverlaps = 5000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects File multi_sample_vcf - File multi_sample_vcf_index - Boolean perform_extra_qc_steps = false # these are optional additional extra QC steps from Amit's group that should only be - # run for large sample sets, especially a diverse set of samples (it's further limiting called at sites to 95% and by HWE) - Float optional_qc_max_missing = 0.05 - Float optional_qc_hwe = 0.000001 File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths Array[String] contigs - String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs + String reference_panel_path_prefix # path + file prefix to the bucket where the reference panel files are stored for all contigs String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs - String output_callset_name # the output callset name - Boolean split_output_to_single_sample = false - - Int chunks_fail_threshold = 1 # require fewer than this many chunks to fail in order to pass + String output_basename # the basename for intermediate and output files # file extensions used to 
find reference panel files - String interval_list_suffix = ".interval_list" + String bed_suffix = ".bed" String bref3_suffix = ".bref3" + + String gatk_docker = "broadinstitute/gatk:4.6.0.0" + String ubuntu_docker = "ubuntu:20.04" + + Int? error_count_override } call tasks.CountSamples { input: - vcf = multi_sample_vcf, + vcf = multi_sample_vcf + } + + call tasks.CreateVcfIndex { + input: + vcf_input = multi_sample_vcf, + gatk_docker = gatk_docker } Float chunkLengthFloat = chunkLength scatter (contig in contigs) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged." + contig + ".merged.AN_added.bcf.ac2" + String reference_basename = reference_panel_path_prefix + "." + contig String genetic_map_filename = genetic_maps_path + "plink." + contig + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { - "interval_list": reference_filename + interval_list_suffix, - "bref3": reference_filename + bref3_suffix, + "bed": reference_basename + bed_suffix, + "bref3": reference_basename + bref3_suffix, "contig": contig, "genetic_map": genetic_map_filename } + call tasks.CalculateChromosomeLength { input: ref_dict = ref_dict, - chrom = referencePanelContig.contig + chrom = referencePanelContig.contig, + ubuntu_docker = ubuntu_docker } Int num_chunks = ceil(CalculateChromosomeLength.chrom_length / chunkLengthFloat) scatter (i in range(num_chunks)) { String chunk_contig = referencePanelContig.contig + Int start = (i * chunkLength) + 1 Int startWithOverlaps = if (start - chunkOverlaps < 1) then 1 else start - chunkOverlaps Int end = if (CalculateChromosomeLength.chrom_length < ((i + 1) * chunkLength)) then CalculateChromosomeLength.chrom_length else ((i + 1) * chunkLength) Int endWithOverlaps = if (CalculateChromosomeLength.chrom_length < end + chunkOverlaps) then CalculateChromosomeLength.chrom_length else end + chunkOverlaps String chunk_basename = referencePanelContig.contig + 
"_chunk_" + i + # generate the chunked vcf file that will be used for imputation, including overlaps call tasks.GenerateChunk { input: - vcf = multi_sample_vcf, - vcf_index = multi_sample_vcf_index, + vcf = CreateVcfIndex.vcf, + vcf_index = CreateVcfIndex.vcf_index, start = startWithOverlaps, end = endWithOverlaps, chrom = referencePanelContig.contig, - basename = chunk_basename - } - - if (perform_extra_qc_steps) { - call tasks.OptionalQCSites { - input: - input_vcf = GenerateChunk.output_vcf, - input_vcf_index = GenerateChunk.output_vcf_index, - output_vcf_basename = chunk_basename, - optional_qc_max_missing = optional_qc_max_missing, - optional_qc_hwe = optional_qc_hwe - } + basename = chunk_basename, + gatk_docker = gatk_docker } call tasks.CountVariantsInChunksBeagle { input: - vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), - vcf_index = select_first([OptionalQCSites.output_vcf_index, GenerateChunk.output_vcf_index]), - panel_interval_list = referencePanelContig.interval_list + vcf = GenerateChunk.output_vcf, + vcf_index = GenerateChunk.output_vcf_index, + panel_bed_file = referencePanelContig.bed, + gatk_docker = gatk_docker } call tasks.CheckChunksBeagle { @@ -101,14 +99,16 @@ workflow ImputationBeagle { var_also_in_reference = CountVariantsInChunksBeagle.var_also_in_reference } + # create chunk without overlaps to get sites to impute call tasks.SubsetVcfToRegion { input: - vcf = multi_sample_vcf, - vcf_index = multi_sample_vcf_index, + vcf = CreateVcfIndex.vcf, + vcf_index = CreateVcfIndex.vcf_index, output_basename = "input_samples_subset_to_chunk", contig = referencePanelContig.contig, start = start, - end = end + end = end, + gatk_docker = gatk_docker } call tasks.SetIDs as SetIdsVcfToImpute { @@ -116,71 +116,103 @@ workflow ImputationBeagle { vcf = SubsetVcfToRegion.output_vcf, output_basename = "input_samples_with_variant_ids" } + } + + Array[File] chunkedVcfsWithOverlapsForImputation = GenerateChunk.output_vcf + 
Array[File] chunkedVcfsWithoutOverlapsForSiteIds = SetIdsVcfToImpute.output_vcf + Array[File] chunkedVcfIndexesWithoutOverlapsForSiteIds = SetIdsVcfToImpute.output_vcf_index + + call tasks.StoreChunksInfo as StoreContigLevelChunksInfo { + input: + chroms = chunk_contig, + starts = start, + ends = end, + vars_in_array = CountVariantsInChunksBeagle.var_in_original, + vars_in_panel = CountVariantsInChunksBeagle.var_also_in_reference, + valids = CheckChunksBeagle.valid, + basename = output_basename + } + + # if any chunk for any chromosome fail CheckChunks, then we will not impute run any task in the next scatter, + # namely phasing and imputing which would be the most costly to throw away + Int n_failed_chunks_int = select_first([error_count_override, read_int(StoreContigLevelChunksInfo.n_failed_chunks)]) + call tasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks { + input: + errorCount = n_failed_chunks_int, + message = "contig " + referencePanelContig.contig + " had " + n_failed_chunks_int + " failing chunks" + } + + scatter (i in range(num_chunks)) { + String chunk_basename_imputed = referencePanelContig.contig + "_chunk_" + i + "_imputed" call tasks.ExtractIDs as ExtractIdsVcfToImpute { input: - vcf = SetIdsVcfToImpute.output_vcf, - output_basename = "imputed_sites" + vcf = chunkedVcfsWithoutOverlapsForSiteIds[i], + output_basename = "imputed_sites", + for_dependency = FailQCNChunks.done # these shenanigans can be replaced with `after` in wdl 1.1 } - if (CheckChunksBeagle.valid) { - call tasks.PhaseAndImputeBeagle { - input: - dataset_vcf = select_first([OptionalQCSites.output_vcf, GenerateChunk.output_vcf]), - ref_panel_bref3 = referencePanelContig.bref3, - chrom = referencePanelContig.contig, - basename = chunk_basename, - genetic_map_file = referencePanelContig.genetic_map, - start = start, - end = end - } - - call tasks.UpdateHeader { - input: - vcf = PhaseAndImputeBeagle.vcf, - vcf_index = PhaseAndImputeBeagle.vcf_index, - ref_dict = ref_dict, - 
basename = chunk_basename + "_imputed" - } - - call tasks.SeparateMultiallelics { - input: - original_vcf = UpdateHeader.output_vcf, - original_vcf_index = UpdateHeader.output_vcf_index, - output_basename = chunk_basename + "_imputed" - } - - call tasks.RemoveSymbolicAlleles { - input: - original_vcf = SeparateMultiallelics.output_vcf, - original_vcf_index = SeparateMultiallelics.output_vcf_index, - output_basename = chunk_basename + "_imputed" - } - - call tasks.SetIDs { - input: - vcf = RemoveSymbolicAlleles.output_vcf, - output_basename = chunk_basename + "_imputed" - } - - call tasks.ExtractIDs { - input: - vcf = SetIDs.output_vcf, - output_basename = "imputed_sites" - } + call tasks.PhaseAndImputeBeagle { + input: + dataset_vcf = chunkedVcfsWithOverlapsForImputation[i], + ref_panel_bref3 = referencePanelContig.bref3, + chrom = referencePanelContig.contig, + basename = chunk_basename_imputed, + genetic_map_file = referencePanelContig.genetic_map, + start = start[i], + end = end[i] } + + call tasks.UpdateHeader { + input: + vcf = PhaseAndImputeBeagle.vcf, + vcf_index = PhaseAndImputeBeagle.vcf_index, + ref_dict = ref_dict, + basename = chunk_basename_imputed, + gatk_docker = gatk_docker + } + + call tasks.SeparateMultiallelics { + input: + original_vcf = UpdateHeader.output_vcf, + original_vcf_index = UpdateHeader.output_vcf_index, + output_basename = chunk_basename_imputed + } + + call tasks.RemoveSymbolicAlleles { + input: + original_vcf = SeparateMultiallelics.output_vcf, + original_vcf_index = SeparateMultiallelics.output_vcf_index, + output_basename = chunk_basename_imputed, + gatk_docker = gatk_docker + } + + call tasks.SetIDs { + input: + vcf = RemoveSymbolicAlleles.output_vcf, + output_basename = chunk_basename_imputed + } + + call tasks.ExtractIDs { + input: + vcf = SetIDs.output_vcf, + output_basename = "imputed_sites" + } + call tasks.FindSitesUniqueToFileTwoOnly { input: file1 = select_first([ExtractIDs.ids, write_lines([])]), - file2 = 
ExtractIdsVcfToImpute.ids + file2 = ExtractIdsVcfToImpute.ids, + ubuntu_docker = ubuntu_docker } call tasks.SelectVariantsByIds { input: - vcf = SetIdsVcfToImpute.output_vcf, - vcf_index = SetIdsVcfToImpute.output_vcf_index, + vcf = chunkedVcfsWithoutOverlapsForSiteIds[i], + vcf_index = chunkedVcfIndexesWithoutOverlapsForSiteIds[i], ids = FindSitesUniqueToFileTwoOnly.missing_sites, - basename = "imputed_sites_to_recover" + basename = "imputed_sites_to_recover", + gatk_docker = gatk_docker } call tasks.RemoveAnnotations { @@ -192,7 +224,8 @@ workflow ImputationBeagle { call tasks.InterleaveVariants { input: vcfs = select_all([RemoveAnnotations.output_vcf, SetIDs.output_vcf]), - basename = output_callset_name + basename = output_basename, # TODO consider using a contig/chunk labeled basename + gatk_docker = gatk_docker } } @@ -202,7 +235,8 @@ workflow ImputationBeagle { call tasks.GatherVcfs { input: input_vcfs = flatten(chromosome_vcfs), - output_vcf_basename = output_callset_name + ".imputed" + output_vcf_basename = output_basename + ".imputed", + gatk_docker = gatk_docker } call tasks.StoreChunksInfo { @@ -213,34 +247,13 @@ workflow ImputationBeagle { vars_in_array = flatten(CountVariantsInChunksBeagle.var_in_original), vars_in_panel = flatten(CountVariantsInChunksBeagle.var_also_in_reference), valids = flatten(CheckChunksBeagle.valid), - basename = output_callset_name + basename = output_basename } - Int n_failed_chunks_int = read_int(StoreChunksInfo.n_failed_chunks) - - if (n_failed_chunks_int >= chunks_fail_threshold) { - call utils.ErrorWithMessage as FailQCNChunks { - input: - message = n_failed_chunks_int + " chunks failed imputation, QC threshold was set to " + chunks_fail_threshold - } - } - - if (split_output_to_single_sample) { - call tasks.SplitMultiSampleVcf { - input: - multiSampleVcf = GatherVcfs.output_vcf, - nSamples = CountSamples.nSamples - } - } - output { - Array[File]? 
imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs - Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices File imputed_multi_sample_vcf = GatherVcfs.output_vcf File imputed_multi_sample_vcf_index = GatherVcfs.output_vcf_index File chunks_info = StoreChunksInfo.chunks_info - File failed_chunks = StoreChunksInfo.failed_chunks - File n_failed_chunks = StoreChunksInfo.n_failed_chunks } meta { @@ -250,7 +263,7 @@ workflow ImputationBeagle { } struct ReferencePanelContig { - File interval_list + File bed File bref3 String contig File genetic_map diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index ead2e53b71..4e9814fd06 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -5,7 +5,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow ImputationBeagle { - String pipeline_version = "0.0.2" + String pipeline_version = "0.0.1" input { Int chunkLength = 25000000 diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 89ce22920f..86e57dc08c 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -1091,6 +1091,44 @@ task SplitMultiSampleVcf { } } +task CreateVcfIndex { + input { + File vcf_input + + Int disk_size_gb = ceil(3*size(vcf_input, "GiB")) + 50 + Int cpu = 1 + Int memory_mb = 8000 + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + Int command_mem = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + String vcf_basename = basename(vcf_input) + + command { + set -e -o pipefail + + ln -sf ~{vcf_input} ~{vcf_basename} + + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + IndexFeatureFile -I ~{vcf_basename} + + + } + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: 
"${memory_mb} MiB" + cpu: cpu + maxRetries: 2 + preemptible: 3 + } + output { + File vcf = "~{vcf_basename}" + File vcf_index = "~{vcf_basename}.tbi" + } +} + task PreSplitVcf { input { Array[String] contigs From f72a46ce116670f953433104d69902aa85d40c5f Mon Sep 17 00:00:00 2001 From: jsotobroad Date: Wed, 13 Nov 2024 13:13:31 -0500 Subject: [PATCH 65/92] add array imputation quota consumed wdl (#1425) * add array imputation quota consumed wdl * add changelogs for imputation array related workflows --------- Co-authored-by: Jose Soto --- .dockstore.yml | 4 +++ .../ArrayImputationQuotaConsumed.changelog.md | 4 +++ .../ArrayImputationQuotaConsumed.wdl | 34 +++++++++++++++++++ ...reateImputationRefPanelBeagle.changelog.md | 4 +++ .../CreateImputationRefPanelBeagle.wdl | 2 ++ .../ImputationBeagle.changelog.md | 4 +++ .../ImputationBeaglePreChunk.changelog.md | 4 +++ .../LiftoverVcfs.changelog.md | 4 +++ .../arrays/imputation_beagle/LiftoverVcfs.wdl | 2 +- 9 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.changelog.md create mode 100644 pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl create mode 100644 pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.changelog.md create mode 100644 pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md create mode 100644 pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md create mode 100644 pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.changelog.md diff --git a/.dockstore.yml b/.dockstore.yml index ee31e93d13..637c14572b 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -99,6 +99,10 @@ workflows: subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl + - name: ArrayImputationQuotaConsumed + subclass: WDL + primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl + - 
name: RNAWithUMIsPipeline subclass: WDL primaryDescriptorPath: /pipelines/broad/rna_seq/RNAWithUMIsPipeline.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.changelog.md b/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.changelog.md new file mode 100644 index 0000000000..336273806b --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.changelog.md @@ -0,0 +1,4 @@ +# 0.0.1 +2024-11-13 (Date of Last Commit) + +* Pipeline still in developmental state diff --git a/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl b/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl new file mode 100644 index 0000000000..c14f16158f --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl @@ -0,0 +1,34 @@ +version 1.0 + +import "../../../../tasks/broad/ImputationTasks.wdl" as tasks + +workflow QuotaConsumed { + String pipeline_version = "0.0.1" + + input { + Int chunkLength = 25000000 + Int chunkOverlaps = 5000000 + + File multi_sample_vcf + + File ref_dict + Array[String] contigs + String reference_panel_path_prefix + String genetic_maps_path + String output_basename + Boolean split_output_to_single_sample = false + + # file extensions used to find reference panel files + String interval_list_suffix = ".interval_list" + String bref3_suffix = ".bref3" + } + + call tasks.CountSamples { + input: + vcf = multi_sample_vcf + } + + output { + Int quota_consumed = CountSamples.nSamples + } +} diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.changelog.md b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.changelog.md new file mode 100644 index 0000000000..336273806b --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.changelog.md @@ -0,0 +1,4 @@ +# 0.0.1 +2024-11-13 (Date of Last Commit) + +* Pipeline still in 
developmental state diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl index 6fee942de2..5cc6c5b5b4 100644 --- a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl @@ -2,6 +2,8 @@ version 1.0 # This script is under review. It is not actively tested or maintained at this time. workflow CreateImputationRefPanelBeagle { + String pipeline_version = "0.0.1" + input { Array[File] ref_vcf Array[File] ref_vcf_index diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md new file mode 100644 index 0000000000..336273806b --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md @@ -0,0 +1,4 @@ +# 0.0.1 +2024-11-13 (Date of Last Commit) + +* Pipeline still in developmental state diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md new file mode 100644 index 0000000000..336273806b --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md @@ -0,0 +1,4 @@ +# 0.0.1 +2024-11-13 (Date of Last Commit) + +* Pipeline still in developmental state diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.changelog.md b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.changelog.md new file mode 100644 index 0000000000..336273806b --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.changelog.md @@ -0,0 +1,4 @@ +# 0.0.1 +2024-11-13 (Date of Last Commit) + +* Pipeline still in developmental state diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 
b8742d3817..fa57933f9b 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -3,7 +3,7 @@ version 1.0 # Liftover VCFs from hg19 to hg38 workflow LiftoverVcfs { - String pipeline_version = "1.0.0" + String pipeline_version = "0.0.1" input { File vcf_path From 144aed50f0534cd350b01e8f53d663d720fc4878 Mon Sep 17 00:00:00 2001 From: jsotobroad Date: Thu, 21 Nov 2024 10:20:10 -0500 Subject: [PATCH 66/92] TSPS-239 get wdl running on 400k sample ref panel (#1373) * changes to help beagle imputation wdl run on a 400k sample reference panel --------- Co-authored-by: Jose Soto * remove create imputation ref panel beagle wdl and changelog * PR feedback --------- Co-authored-by: Jose Soto Co-authored-by: M. Morgan Aster --- .dockstore.yml | 4 - ...reateImputationRefPanelBeagle.changelog.md | 4 - .../CreateImputationRefPanelBeagle.wdl | 154 ------------- .../imputation_beagle/ImputationBeagle.wdl | 40 +++- .../ImputationBeaglePreChunk.wdl | 1 - tasks/broad/ImputationTasks.wdl | 206 ++++++++++-------- 6 files changed, 149 insertions(+), 260 deletions(-) delete mode 100644 pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.changelog.md delete mode 100644 pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl diff --git a/.dockstore.yml b/.dockstore.yml index 637c14572b..a5ebd0c8b0 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -91,10 +91,6 @@ workflows: subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl - - name: CreateImputationRefPanelBeagle - subclass: WDL - primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl - - name: LiftoverVcfs subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.changelog.md 
b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.changelog.md deleted file mode 100644 index 336273806b..0000000000 --- a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.changelog.md +++ /dev/null @@ -1,4 +0,0 @@ -# 0.0.1 -2024-11-13 (Date of Last Commit) - -* Pipeline still in developmental state diff --git a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl deleted file mode 100644 index 5cc6c5b5b4..0000000000 --- a/pipelines/broad/arrays/imputation_beagle/CreateImputationRefPanelBeagle.wdl +++ /dev/null @@ -1,154 +0,0 @@ -version 1.0 - -# This script is under review. It is not actively tested or maintained at this time. -workflow CreateImputationRefPanelBeagle { - String pipeline_version = "0.0.1" - - input { - Array[File] ref_vcf - Array[File] ref_vcf_index - Int disk_size - - String? output_basename - - Boolean make_brefs = true - Boolean make_interval_lists = true - Boolean make_bed_files = true - } - - scatter (idx in range(length(ref_vcf))) { - Int? chr = idx + 1 - String? custom_basename_with_chr = output_basename + ".chr" + chr - - if (make_brefs) { - call BuildBref3 { - input: - vcf = ref_vcf[idx], - disk_size = disk_size, - output_basename = custom_basename_with_chr - } - } - - if (make_interval_lists || make_bed_files) { - call CreateRefPanelIntervalLists { - input: - ref_panel_vcf = ref_vcf[idx], - ref_panel_vcf_index = ref_vcf_index[idx], - output_basename = custom_basename_with_chr - } - } - - if (make_bed_files) { - File interval_list = select_first([CreateRefPanelIntervalLists.interval_list]) - call CreateRefPanelBedFiles { - input: - ref_panel_interval_list = interval_list - } - } - } - - output { - Array[File?] bref3s = BuildBref3.out_bref3 - Array[File?] interval_lists = CreateRefPanelIntervalLists.interval_list - Array[File?] 
bed_files = CreateRefPanelBedFiles.bed_file - } -} - -task BuildBref3 { - input { - File vcf - String? output_basename - Int disk_size - } - - String name_from_file = basename(vcf, ".vcf.gz") - String name = select_first([output_basename, name_from_file]) - - command <<< - java -jar /usr/gitc/bref3.22Jul22.46e.jar ~{vcf} > ~{name}.bref3 - >>> - - runtime { - docker: "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-22Jul22.46e-wip-temp-20240227" - memory: "256 GB" - cpu: 4 - disks: "local-disk " + disk_size + " HDD" - } - - output { - File out_bref3 = "~{name}.bref3" - } -} - -task CreateRefPanelIntervalLists { - input { - File ref_panel_vcf - File ref_panel_vcf_index - - String? output_basename - - Int disk_size_gb = ceil(2*size(ref_panel_vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here - Int cpu = 1 - Int memory_mb = 8000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - } - - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 - - String name_from_file = basename(ref_panel_vcf, ".vcf.gz") - String basename = select_first([output_basename, name_from_file]) - - command { - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - VcfToIntervalList \ - -I ~{ref_panel_vcf} \ - -O ~{basename}.interval_list - } - - output { - File interval_list = "~{basename}.interval_list" - } - - runtime { - docker: gatk_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - } -} - -task CreateRefPanelBedFiles { - input { - File ref_panel_interval_list - - Int disk_size_gb = ceil(2*size(ref_panel_interval_list, "GiB")) + 50 - Int cpu = 1 - Int memory_mb = 8000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - } - - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 - - String basename = basename(ref_panel_interval_list, ".interval_list") - - - command { - gatk --java-options 
"-Xms~{command_mem}m -Xmx~{max_heap}m" \ - IntervalListToBed \ - -I ~{ref_panel_interval_list} \ - -O ~{basename}.bed - } - - output { - File bed_file = "~{basename}.bed" - } - - runtime { - docker: gatk_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - } -} diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 77d493ca7c..2b6387d677 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../../../tasks/broad/ImputationTasks.wdl" as tasks -import "../../../../tasks/broad/Utilities.wdl" as utils workflow ImputationBeagle { @@ -9,7 +8,7 @@ workflow ImputationBeagle { input { Int chunkLength = 25000000 - Int chunkOverlaps = 5000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects + Int chunkOverlaps = 2000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects File multi_sample_vcf @@ -152,7 +151,14 @@ workflow ImputationBeagle { for_dependency = FailQCNChunks.done # these shenanigans can be replaced with `after` in wdl 1.1 } - call tasks.PhaseAndImputeBeagle { + # max amount of cpus you can ask for is 96 so at a max of 10k samples we can only ask for 9 cpu a sample. 
+ # these values are based on trying to optimize for pre-emptibility using a 400k sample referene panel + # and up to a 10k sample input vcf + Int beagle_cpu = if (CountSamples.nSamples <= 1000) then 8 else floor(CountSamples.nSamples / 1000) * 9 + Int beagle_phase_memory_in_gb = if (CountSamples.nSamples <= 1000) then 22 else ceil(beagle_cpu * 1.5) + Int beagle_impute_memory_in_gb = if (CountSamples.nSamples <= 1000) then 30 else ceil(beagle_cpu * 4.3) + + call tasks.PhaseBeagle { input: dataset_vcf = chunkedVcfsWithOverlapsForImputation[i], ref_panel_bref3 = referencePanelContig.bref3, @@ -160,15 +166,37 @@ workflow ImputationBeagle { basename = chunk_basename_imputed, genetic_map_file = referencePanelContig.genetic_map, start = start[i], - end = end[i] + end = end[i], + cpu = beagle_cpu, + memory_mb = beagle_phase_memory_in_gb * 1024 + } + + call tasks.ImputeBeagle { + input: + dataset_vcf = PhaseBeagle.vcf, + ref_panel_bref3 = referencePanelContig.bref3, + chrom = referencePanelContig.contig, + basename = chunk_basename_imputed, + genetic_map_file = referencePanelContig.genetic_map, + start = start[i], + end = end[i], + cpu = beagle_cpu, + memory_mb = beagle_impute_memory_in_gb * 1024 + } + + call tasks.CreateVcfIndex as IndexImputedBeagle { + input: + vcf_input = ImputeBeagle.vcf, + gatk_docker = gatk_docker } call tasks.UpdateHeader { input: - vcf = PhaseAndImputeBeagle.vcf, - vcf_index = PhaseAndImputeBeagle.vcf_index, + vcf = IndexImputedBeagle.vcf, + vcf_index = IndexImputedBeagle.vcf_index, ref_dict = ref_dict, basename = chunk_basename_imputed, + disable_sequence_dictionary_validation = false, gatk_docker = gatk_docker } diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 4e9814fd06..a35cf241f6 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ 
b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -1,7 +1,6 @@ version 1.0 import "../../../../tasks/broad/ImputationTasks.wdl" as tasks -import "../../../../tasks/broad/Utilities.wdl" as utils workflow ImputationBeagle { diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 86e57dc08c..f863df3fd1 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -19,7 +19,6 @@ task CalculateChromosomeLength { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -60,14 +59,14 @@ task CreateRefPanelIntervalLists { File ref_panel_vcf File ref_panel_vcf_index - Int disk_size_gb = ceil(2*size(ref_panel_vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here + Int disk_size_gb = ceil(2*size(ref_panel_vcf, "GiB")) + 10 Int cpu = 1 - Int memory_mb = 8000 + Int memory_mb = 6000 String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 String basename = basename(ref_panel_vcf, '.vcf.gz') @@ -99,13 +98,13 @@ task GenerateChunk { File vcf File vcf_index - Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here + Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 10 Int cpu = 1 Int memory_mb = 8000 String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0" } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command { gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ @@ -151,11 +150,11 @@ task CountVariantsInChunks { String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0" Int cpu = 1 - Int memory_mb = 4000 + Int memory_mb = 6000 Int disk_size_gb = 2 * 
ceil(size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB")) + 20 } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command <<< set -e -o pipefail @@ -184,7 +183,7 @@ task CheckChunks { Int var_in_original Int var_in_reference - Int disk_size_gb = ceil(2*size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB")) + Int disk_size_gb = ceil(2*size([vcf, vcf_index, panel_vcf, panel_vcf_index], "GiB")) + 10 String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 4000 @@ -228,7 +227,7 @@ task PhaseVariantsEagle { String eagle_docker = "us.gcr.io/broad-gotc-prod/imputation-eagle:1.0.0-2.4-1690199702" Int cpu = 8 Int memory_mb = 32000 - Int disk_size_gb = ceil(3 * size([dataset_bcf, reference_panel_bcf, dataset_bcf_index, reference_panel_bcf_index], "GiB")) + 50 + Int disk_size_gb = ceil(3 * size([dataset_bcf, reference_panel_bcf, dataset_bcf_index, reference_panel_bcf_index], "GiB")) + 10 } command <<< /usr/gitc/eagle \ @@ -265,7 +264,7 @@ task Minimac4 { String minimac4_docker = "us.gcr.io/broad-gotc-prod/imputation-minimac4:1.0.6-1.0.2-1663948783" Int cpu = 1 Int memory_mb = 4000 - Int disk_size_gb = ceil(size(ref_panel, "GiB") + 2*size(phased_vcf, "GiB")) + 50 + Int disk_size_gb = ceil(size(ref_panel, "GiB") + 2*size(phased_vcf, "GiB")) + 10 } command <<< set -e -o pipefail @@ -306,11 +305,11 @@ task CountVariantsInChunksBeagle { String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int cpu = 1 - Int memory_mb = 8000 - Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_bed_file], "GiB")) + 20 + Int memory_mb = 16000 + Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_bed_file], "GiB")) + 10 } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command <<< set -e -o pipefail @@ -331,7 +330,6 @@ task 
CountVariantsInChunksBeagle { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } } @@ -362,12 +360,11 @@ task CheckChunksBeagle { disks: "local-disk 10 HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } } -task PhaseAndImputeBeagle { +task PhaseBeagle { input { File dataset_vcf File ref_panel_bref3 @@ -380,8 +377,53 @@ task PhaseAndImputeBeagle { String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. Changing this value may change the output generated by the tool Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed - Int xmx_mb = 29000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter - Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 50 # value may need to be adjusted + Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter + Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 10 # value may need to be adjusted + } + + command <<< + set -e -o pipefail + + java -ea -Xmx~{xmx_mb}m \ + -jar /usr/gitc/beagle.01Mar24.d36.jar \ + gt=~{dataset_vcf} \ + ref=~{ref_panel_bref3} \ + map=~{genetic_map_file} \ + out=phased_~{basename} \ + chrom=~{chrom}:~{start}-~{end} \ + impute=false \ + nthreads=~{cpu} \ + seed=-99999 + + >>> + output { + File vcf = "phased_~{basename}.vcf.gz" + File log = "phased_~{basename}.log" + } + runtime { + docker: beagle_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + preemptible: 3 + } +} + +task ImputeBeagle { + input { + File dataset_vcf + File ref_panel_bref3 + File genetic_map_file + String basename + 
String chrom # not needed if ref file has been chunked and you are using the entire chunk + Int start # not needed if ref file has been chunked and you are using the entire chunk + Int end # not needed if ref file has been chunked and you are using the entire chunk + + String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" + Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. Changing this value may change the output generated by the tool + Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed + Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter + Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 10 # value may need to be adjusted } command <<< @@ -398,16 +440,9 @@ task PhaseAndImputeBeagle { nthreads=~{cpu} \ seed=-99999 - # notes: - # rename output file to "phased_{basename}" if phasing without imputing - # `chrom` not needed if ref and targ files have been chunked and you are using the entire chunk - # set impute=false if you wish to phase without imputing ungenotyped markers - - bcftools index -t imputed_~{basename}.vcf.gz >>> output { File vcf = "imputed_~{basename}.vcf.gz" - File vcf_index = "imputed_~{basename}.vcf.gz.tbi" File log = "imputed_~{basename}.log" } runtime { @@ -415,7 +450,6 @@ task PhaseAndImputeBeagle { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } } @@ -428,10 +462,10 @@ task GatherVcfs { String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0" Int cpu = 1 Int memory_mb = 16000 - Int disk_size_gb = ceil(3*size(input_vcfs, "GiB")) + Int disk_size_gb = ceil(3*size(input_vcfs, "GiB")) + 10 } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + 
Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command <<< set -e -o pipefail @@ -450,7 +484,6 @@ task GatherVcfs { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 } output { File output_vcf = "~{output_vcf_basename}.vcf.gz" @@ -463,6 +496,8 @@ task ReplaceHeader { File vcf_to_replace_header File vcf_with_new_header + Int cpu = 1 + Int memory_mb = 6000 String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" } @@ -480,6 +515,9 @@ task ReplaceHeader { runtime { docker: bcftools_docker disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + preemptible: 3 } output { @@ -493,14 +531,16 @@ task UpdateHeader { File vcf_index File ref_dict String basename + Boolean disable_sequence_dictionary_validation = true Int disk_size_gb = ceil(4*(size(vcf, "GiB") + size(vcf_index, "GiB"))) + 20 String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0" Int cpu = 1 - Int memory_mb = 8000 + Int memory_mb = 6000 } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 + String disable_sequence_dict_validation_flag = if disable_sequence_dictionary_validation then "--disable-sequence-dictionary-validation" else "" command <<< @@ -510,14 +550,13 @@ task UpdateHeader { --source-dictionary ~{ref_dict} \ --output ~{basename}.vcf.gz \ --replace -V ~{vcf} \ - --disable-sequence-dictionary-validation + ~{disable_sequence_dict_validation_flag} >>> runtime { docker: gatk_docker disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -537,8 +576,8 @@ task RemoveSymbolicAlleles { Int cpu = 1 Int memory_mb = 4000 } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command { gatk --java-options "-Xms~{command_mem}m 
-Xmx~{max_heap}m" \ @@ -553,7 +592,6 @@ task RemoveSymbolicAlleles { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } } @@ -564,7 +602,7 @@ task SeparateMultiallelics { File original_vcf_index String output_basename - Int disk_size_gb = ceil(2*(size(original_vcf, "GiB") + size(original_vcf_index, "GiB"))) + Int disk_size_gb = ceil(2*(size(original_vcf, "GiB") + size(original_vcf_index, "GiB"))) + 10 String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 4000 @@ -584,7 +622,6 @@ task SeparateMultiallelics { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } } @@ -600,7 +637,7 @@ task OptionalQCSites { String bcftools_vcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 16000 - Int disk_size_gb = ceil(2*(size(input_vcf, "GiB") + size(input_vcf_index, "GiB"))) + Int disk_size_gb = ceil(2*(size(input_vcf, "GiB") + size(input_vcf_index, "GiB"))) + 10 } Float max_missing = select_first([optional_qc_max_missing, 0.05]) @@ -674,7 +711,7 @@ task CountSamples { String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 3000 - Int disk_size_gb = 100 + ceil(size(vcf, "GiB")) + Int disk_size_gb = ceil(size(vcf, "GiB")) + 10 } command <<< @@ -685,7 +722,7 @@ task CountSamples { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 + preemptible: 3 } output { Int nSamples = read_int(stdout()) @@ -701,7 +738,7 @@ task AggregateImputationQCMetrics { String rtidyverse_docker = "rocker/tidyverse:4.1.0" Int cpu = 1 Int memory_mb = 2000 - Int disk_size_gb = 100 + ceil(size(infoFile, "GiB")) + Int disk_size_gb = ceil(size(infoFile, "GiB")) + 10 } command <<< Rscript -<< "EOF" @@ -770,7 +807,6 @@ task StoreChunksInfo { 
memory: "${memory_mb} MiB" cpu: cpu preemptible: 3 - maxRetries: 2 } output { File chunks_info = "~{basename}_chunk_info.tsv" @@ -787,7 +823,7 @@ task MergeImputationQCMetrics { String rtidyverse_docker = "rocker/tidyverse:4.1.0" Int cpu = 1 Int memory_mb = 2000 - Int disk_size_gb = 100 + ceil(size(metrics, "GiB")) + Int disk_size_gb = ceil(size(metrics, "GiB")) + 10 } command <<< Rscript -<< "EOF" @@ -826,13 +862,13 @@ task SubsetVcfToRegion { Int end Boolean exclude_filtered = false - Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 50 # not sure how big the disk size needs to be since we aren't downloading the entire VCF here + Int disk_size_gb = ceil(2*size(vcf, "GiB")) + 10 Int cpu = 1 Int memory_mb = 8000 String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0" } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command { gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ @@ -875,7 +911,7 @@ task SetIDs { String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 4000 - Int disk_size_gb = 100 + ceil(2.2 * size(vcf, "GiB")) + Int disk_size_gb = ceil(2.2 * size(vcf, "GiB")) + 10 } command <<< set -e -o pipefail @@ -887,7 +923,6 @@ task SetIDs { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -901,7 +936,7 @@ task ExtractIDs { File vcf String output_basename - Int disk_size_gb = 2*ceil(size(vcf, "GiB")) + 100 + Int disk_size_gb = 2*ceil(size(vcf, "GiB")) + 10 String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 4000 @@ -918,7 +953,6 @@ task ExtractIDs { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } } @@ -933,7 +967,7 @@ task SelectVariantsByIds { String gatk_docker = 
"us.gcr.io/broad-gatk/gatk:4.6.1.0" Int cpu = 1 Int memory_mb = 16000 - Int disk_size_gb = ceil(1.2*size(vcf, "GiB")) + 100 + Int disk_size_gb = ceil(1.2*size(vcf, "GiB")) + 10 } parameter_meta { vcf: { @@ -945,8 +979,8 @@ task SelectVariantsByIds { localization_optional: true } } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 2000 + Int max_heap = memory_mb - 1500 command <<< set -e -o pipefail @@ -960,7 +994,6 @@ task SelectVariantsByIds { disks: "local-disk ${disk_size_gb} SSD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -977,7 +1010,7 @@ task RemoveAnnotations { String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 3000 - Int disk_size_gb = ceil(2.2*size(vcf, "GiB")) + 100 + Int disk_size_gb = ceil(2.2*size(vcf, "GiB")) + 10 } command <<< set -e -o pipefail @@ -990,7 +1023,6 @@ task RemoveAnnotations { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -1007,10 +1039,10 @@ task InterleaveVariants { String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0" Int cpu = 1 Int memory_mb = 16000 - Int disk_size_gb = ceil(3.2*size(vcfs, "GiB")) + 100 + Int disk_size_gb = ceil(3.2*size(vcfs, "GiB")) + 10 } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command <<< set -e -o pipefail @@ -1023,7 +1055,6 @@ task InterleaveVariants { disks: "local-disk ${disk_size_gb} SSD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -1040,7 +1071,7 @@ task FindSitesUniqueToFileTwoOnly { String ubuntu_docker = "ubuntu.azurecr.io/ubuntu:20.04" Int cpu = 1 Int memory_mb = 4000 - Int disk_size_gb = ceil(size(file1, "GiB") + 2*size(file2, "GiB")) + 100 + Int disk_size_gb = ceil(size(file1, "GiB") + 2*size(file2, "GiB")) + 10 } 
command <<< comm -13 <(sort ~{file1} | uniq) <(sort ~{file2} | uniq) > missing_sites.ids @@ -1050,7 +1081,6 @@ task FindSitesUniqueToFileTwoOnly { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -1065,10 +1095,10 @@ task SplitMultiSampleVcf { String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 - Int memory_mb = 8000 + Int memory_mb = 6000 # This calculation is explained in https://github.com/broadinstitute/warp/pull/937 - Int disk_size_gb = ceil(21*nSamples*size(multiSampleVcf, "GiB")/(nSamples+20)) + 100 + Int disk_size_gb = ceil(21*nSamples*size(multiSampleVcf, "GiB")/(nSamples+20)) + 10 } command <<< set -e -o pipefail @@ -1095,13 +1125,13 @@ task CreateVcfIndex { input { File vcf_input - Int disk_size_gb = ceil(3*size(vcf_input, "GiB")) + 50 + Int disk_size_gb = ceil(1.2*size(vcf_input, "GiB")) + 10 Int cpu = 1 - Int memory_mb = 8000 + Int memory_mb = 6000 String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 String vcf_basename = basename(vcf_input) @@ -1110,17 +1140,13 @@ task CreateVcfIndex { ln -sf ~{vcf_input} ~{vcf_basename} - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - IndexFeatureFile -I ~{vcf_basename} - - + bcftools index -t ~{vcf_basename} } runtime { docker: gatk_docker disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -1134,13 +1160,13 @@ task PreSplitVcf { Array[String] contigs File vcf - Int disk_size_gb = ceil(3*size(vcf, "GiB")) + 50 + Int disk_size_gb = ceil(3*size(vcf, "GiB")) + 10 Int cpu = 1 - Int memory_mb = 8000 + Int memory_mb = 6000 String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int 
command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command { set -e -o pipefail @@ -1174,7 +1200,6 @@ task PreSplitVcf { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { @@ -1193,13 +1218,13 @@ task PreChunkVcf { File vcf_index Boolean exclude_filtered = false - Int disk_size_gb = ceil(4*size(vcf, "GiB")) + 50 + Int disk_size_gb = ceil(4*size(vcf, "GiB")) + 10 Int cpu = 1 - Int memory_mb = 8000 + Int memory_mb = 6000 String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } - Int command_mem = memory_mb - 1000 - Int max_heap = memory_mb - 500 + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 command { set -e -o pipefail @@ -1278,7 +1303,6 @@ task PreChunkVcf { disks: "local-disk ${disk_size_gb} HDD" memory: "${memory_mb} MiB" cpu: cpu - maxRetries: 2 preemptible: 3 } output { From 05fe60442a7b3722ac7a1d4e92fc7658f881281a Mon Sep 17 00:00:00 2001 From: jsotobroad Date: Thu, 21 Nov 2024 10:23:42 -0500 Subject: [PATCH 67/92] add set -e -o pipefail to all relevant imputation tasks (#1434) Co-authored-by: Jose Soto --- tasks/broad/ImputationTasks.wdl | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index f863df3fd1..dfe16f1a07 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -12,6 +12,8 @@ task CalculateChromosomeLength { } command { + set -e -o pipefail + grep -P "SN:~{chrom}\t" ~{ref_dict} | sed 's/.*LN://' | sed 's/\t.*//' } runtime { @@ -38,6 +40,8 @@ task GetMissingContigList { } command <<< + set -e -o pipefail + grep "@SQ" ~{ref_dict} | sed 's/.*SN://' | sed 's/\t.*//' > contigs.txt awk 'NR==FNR{arr[$0];next} !($0 in arr)' ~{included_contigs} contigs.txt > missing_contigs.txt >>> @@ -543,7 +547,6 @@ task UpdateHeader { String disable_sequence_dict_validation_flag = if disable_sequence_dictionary_validation then 
"--disable-sequence-dictionary-validation" else "" command <<< - ## update the header of the merged vcf gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ UpdateVCFSequenceDictionary \ @@ -644,6 +647,7 @@ task OptionalQCSites { Float hwe = select_first([optional_qc_hwe, 0.000001]) command <<< set -e -o pipefail + ln -sf ~{input_vcf} input.vcf.gz ln -sf ~{input_vcf_index} input.vcf.gz.tbi @@ -676,6 +680,7 @@ task MergeSingleSampleVcfs { } command <<< set -e -o pipefail + # Move the index file next to the vcf with the corresponding name declare -a VCFS=(~{sep=' ' input_vcfs}) @@ -715,6 +720,8 @@ task CountSamples { } command <<< + set -e -o pipefail + bcftools query -l ~{vcf} | wc -l >>> runtime { @@ -915,6 +922,7 @@ task SetIDs { } command <<< set -e -o pipefail + bcftools annotate ~{vcf} --set-id '%CHROM\:%POS\:%REF\:%FIRST_ALT' -Oz -o ~{output_basename}.vcf.gz bcftools index -t ~{output_basename}.vcf.gz >>> @@ -1074,6 +1082,8 @@ task FindSitesUniqueToFileTwoOnly { Int disk_size_gb = ceil(size(file1, "GiB") + 2*size(file2, "GiB")) + 10 } command <<< + set -e -o pipefail + comm -13 <(sort ~{file1} | uniq) <(sort ~{file2} | uniq) > missing_sites.ids >>> runtime { From ae38109161d8a477a76dc830de32d4cf9401247e Mon Sep 17 00:00:00 2001 From: jsotobroad Date: Thu, 9 Jan 2025 12:20:41 -0500 Subject: [PATCH 68/92] TSPS-341 remove tasks for recovering variants not in the reference panel (#1468) * remove tasks for recovering variants not in the reference panel and separate out beagle tasks from imputation tasks * remove prechunk wdl and references to it remove "Beagle" from task names in BeagleTasks.wdl --------- Co-authored-by: Jose Soto --- .dockstore.yml | 4 - .../imputation_beagle/ImputationBeagle.wdl | 101 +----- .../ImputationBeaglePreChunk.changelog.md | 4 - .../ImputationBeaglePreChunk.wdl | 247 ------------- tasks/broad/ImputationBeagleTasks.wdl | 181 ++++++++++ tasks/broad/ImputationTasks.wdl | 339 ------------------ 6 files changed, 198 insertions(+), 
678 deletions(-) delete mode 100644 pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md delete mode 100644 pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl create mode 100644 tasks/broad/ImputationBeagleTasks.wdl diff --git a/.dockstore.yml b/.dockstore.yml index a5ebd0c8b0..6cc4f04aa9 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -87,10 +87,6 @@ workflows: subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl - - name: ImputationBeaglePreChunk - subclass: WDL - primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl - - name: LiftoverVcfs subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 2b6387d677..ecd073524f 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -1,6 +1,7 @@ version 1.0 import "../../../../tasks/broad/ImputationTasks.wdl" as tasks +import "../../../../tasks/broad/ImputationBeagleTasks.wdl" as beagleTasks workflow ImputationBeagle { @@ -84,7 +85,7 @@ workflow ImputationBeagle { gatk_docker = gatk_docker } - call tasks.CountVariantsInChunksBeagle { + call beagleTasks.CountVariantsInChunks { input: vcf = GenerateChunk.output_vcf, vcf_index = GenerateChunk.output_vcf_index, @@ -92,50 +93,30 @@ workflow ImputationBeagle { gatk_docker = gatk_docker } - call tasks.CheckChunksBeagle { + call beagleTasks.CheckChunks { input: - var_in_original = CountVariantsInChunksBeagle.var_in_original, - var_also_in_reference = CountVariantsInChunksBeagle.var_also_in_reference - } - - # create chunk without overlaps to get sites to impute - call tasks.SubsetVcfToRegion { - input: - vcf = CreateVcfIndex.vcf, - vcf_index = CreateVcfIndex.vcf_index, - 
output_basename = "input_samples_subset_to_chunk", - contig = referencePanelContig.contig, - start = start, - end = end, - gatk_docker = gatk_docker - } - - call tasks.SetIDs as SetIdsVcfToImpute { - input: - vcf = SubsetVcfToRegion.output_vcf, - output_basename = "input_samples_with_variant_ids" + var_in_original = CountVariantsInChunks.var_in_original, + var_also_in_reference = CountVariantsInChunks.var_also_in_reference } } Array[File] chunkedVcfsWithOverlapsForImputation = GenerateChunk.output_vcf - Array[File] chunkedVcfsWithoutOverlapsForSiteIds = SetIdsVcfToImpute.output_vcf - Array[File] chunkedVcfIndexesWithoutOverlapsForSiteIds = SetIdsVcfToImpute.output_vcf_index call tasks.StoreChunksInfo as StoreContigLevelChunksInfo { input: chroms = chunk_contig, starts = start, ends = end, - vars_in_array = CountVariantsInChunksBeagle.var_in_original, - vars_in_panel = CountVariantsInChunksBeagle.var_also_in_reference, - valids = CheckChunksBeagle.valid, + vars_in_array = CountVariantsInChunks.var_in_original, + vars_in_panel = CountVariantsInChunks.var_also_in_reference, + valids = CheckChunks.valid, basename = output_basename } # if any chunk for any chromosome fail CheckChunks, then we will not impute run any task in the next scatter, # namely phasing and imputing which would be the most costly to throw away Int n_failed_chunks_int = select_first([error_count_override, read_int(StoreContigLevelChunksInfo.n_failed_chunks)]) - call tasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks { + call beagleTasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks { input: errorCount = n_failed_chunks_int, message = "contig " + referencePanelContig.contig + " had " + n_failed_chunks_int + " failing chunks" @@ -144,13 +125,6 @@ workflow ImputationBeagle { scatter (i in range(num_chunks)) { String chunk_basename_imputed = referencePanelContig.contig + "_chunk_" + i + "_imputed" - call tasks.ExtractIDs as ExtractIdsVcfToImpute { - input: - vcf = 
chunkedVcfsWithoutOverlapsForSiteIds[i], - output_basename = "imputed_sites", - for_dependency = FailQCNChunks.done # these shenanigans can be replaced with `after` in wdl 1.1 - } - # max amount of cpus you can ask for is 96 so at a max of 10k samples we can only ask for 9 cpu a sample. # these values are based on trying to optimize for pre-emptibility using a 400k sample referene panel # and up to a 10k sample input vcf @@ -158,7 +132,7 @@ workflow ImputationBeagle { Int beagle_phase_memory_in_gb = if (CountSamples.nSamples <= 1000) then 22 else ceil(beagle_cpu * 1.5) Int beagle_impute_memory_in_gb = if (CountSamples.nSamples <= 1000) then 30 else ceil(beagle_cpu * 4.3) - call tasks.PhaseBeagle { + call beagleTasks.Phase { input: dataset_vcf = chunkedVcfsWithOverlapsForImputation[i], ref_panel_bref3 = referencePanelContig.bref3, @@ -171,9 +145,9 @@ workflow ImputationBeagle { memory_mb = beagle_phase_memory_in_gb * 1024 } - call tasks.ImputeBeagle { + call beagleTasks.Impute { input: - dataset_vcf = PhaseBeagle.vcf, + dataset_vcf = Phase.vcf, ref_panel_bref3 = referencePanelContig.bref3, chrom = referencePanelContig.contig, basename = chunk_basename_imputed, @@ -186,7 +160,7 @@ workflow ImputationBeagle { call tasks.CreateVcfIndex as IndexImputedBeagle { input: - vcf_input = ImputeBeagle.vcf, + vcf_input = Impute.vcf, gatk_docker = gatk_docker } @@ -214,50 +188,9 @@ workflow ImputationBeagle { output_basename = chunk_basename_imputed, gatk_docker = gatk_docker } - - call tasks.SetIDs { - input: - vcf = RemoveSymbolicAlleles.output_vcf, - output_basename = chunk_basename_imputed - } - - call tasks.ExtractIDs { - input: - vcf = SetIDs.output_vcf, - output_basename = "imputed_sites" - } - - call tasks.FindSitesUniqueToFileTwoOnly { - input: - file1 = select_first([ExtractIDs.ids, write_lines([])]), - file2 = ExtractIdsVcfToImpute.ids, - ubuntu_docker = ubuntu_docker - } - - call tasks.SelectVariantsByIds { - input: - vcf = chunkedVcfsWithoutOverlapsForSiteIds[i], - 
vcf_index = chunkedVcfIndexesWithoutOverlapsForSiteIds[i], - ids = FindSitesUniqueToFileTwoOnly.missing_sites, - basename = "imputed_sites_to_recover", - gatk_docker = gatk_docker - } - - call tasks.RemoveAnnotations { - input: - vcf = SelectVariantsByIds.output_vcf, - basename = "imputed_sites_to_recover_annotations_removed" - } - - call tasks.InterleaveVariants { - input: - vcfs = select_all([RemoveAnnotations.output_vcf, SetIDs.output_vcf]), - basename = output_basename, # TODO consider using a contig/chunk labeled basename - gatk_docker = gatk_docker - } } - Array[File] chromosome_vcfs = select_all(InterleaveVariants.output_vcf) + Array[File] chromosome_vcfs = select_all(RemoveSymbolicAlleles.output_vcf) } call tasks.GatherVcfs { @@ -272,9 +205,9 @@ workflow ImputationBeagle { chroms = flatten(chunk_contig), starts = flatten(start), ends = flatten(end), - vars_in_array = flatten(CountVariantsInChunksBeagle.var_in_original), - vars_in_panel = flatten(CountVariantsInChunksBeagle.var_also_in_reference), - valids = flatten(CheckChunksBeagle.valid), + vars_in_array = flatten(CountVariantsInChunks.var_in_original), + vars_in_panel = flatten(CountVariantsInChunks.var_also_in_reference), + valids = flatten(CheckChunks.valid), basename = output_basename } diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md deleted file mode 100644 index 336273806b..0000000000 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md +++ /dev/null @@ -1,4 +0,0 @@ -# 0.0.1 -2024-11-13 (Date of Last Commit) - -* Pipeline still in developmental state diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl deleted file mode 100644 index a35cf241f6..0000000000 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ /dev/null 
@@ -1,247 +0,0 @@ -version 1.0 - -import "../../../../tasks/broad/ImputationTasks.wdl" as tasks - -workflow ImputationBeagle { - - String pipeline_version = "0.0.1" - - input { - Int chunkLength = 25000000 - Int chunkOverlaps = 5000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects - - File multi_sample_vcf - - File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths - Array[String] contigs - String reference_panel_path_prefix # path + file prefix to the bucket where the reference panel files are stored for all contigs - String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs - String output_basename # the basename for intermediate and output files - - # file extensions used to find reference panel files - String bed_suffix = ".bed" - String bref3_suffix = ".bref3" - - String gatk_docker = "terrapublic.azurecr.io/gatk:4.5-squashed" # "broadinstitute/gatk-nightly:2024-06-06-4.5.0.0-36-g2a420e483-NIGHTLY-SNAPSHOT" - - Int? error_count_override - } - - call tasks.CountSamples { - input: - vcf = multi_sample_vcf, - } - - call tasks.PreSplitVcf { - input: - contigs = contigs, - vcf = multi_sample_vcf, - gatk_docker = gatk_docker - } - - scatter (contig_index in range(length(contigs))) { - # these are specific to hg38 - contig is format 'chr1' - String reference_basename = reference_panel_path_prefix + "." + contigs[contig_index] - String genetic_map_filename = genetic_maps_path + "plink." 
+ contigs[contig_index] + ".GRCh38.withchr.map" - - ReferencePanelContig referencePanelContig = { - "bed": reference_basename + bed_suffix, - "bref3": reference_basename + bref3_suffix, - "contig": contigs[contig_index], - "genetic_map": genetic_map_filename - } - - call tasks.CalculateChromosomeLength { - input: - ref_dict = ref_dict, - chrom = referencePanelContig.contig - } - - call tasks.PreChunkVcf { - input: - chromosome_length=CalculateChromosomeLength.chrom_length, - chunk_length = chunkLength, - chunk_overlap = chunkOverlaps, - chrom = contigs[contig_index], - vcf = PreSplitVcf.chr_split_vcfs[contig_index], - vcf_index = PreSplitVcf.chr_split_vcf_indices[contig_index], - gatk_docker = gatk_docker - } - - scatter (i in range(length(PreChunkVcf.generate_chunk_vcfs))) { - String chunk_contig = referencePanelContig.contig - - Int start = PreChunkVcf.starts[i] - Int end = PreChunkVcf.ends[i] - - call tasks.CountVariantsInChunksBeagle { - input: - vcf = PreChunkVcf.generate_chunk_vcfs[i], - vcf_index = PreChunkVcf.generate_chunk_vcf_indices[i], - panel_bed_file = referencePanelContig.bed, - gatk_docker = gatk_docker - } - - call tasks.CheckChunksBeagle { - input: - var_in_original = CountVariantsInChunksBeagle.var_in_original, - var_also_in_reference = CountVariantsInChunksBeagle.var_also_in_reference - } - - call tasks.SetIDs as SetIdsVcfToImpute { - input: - vcf = PreChunkVcf.subset_vcfs[i], - output_basename = "input_samples_with_variant_ids" - } - } - - call tasks.StoreChunksInfo as StoreContigLevelChunksInfo { - input: - chroms = chunk_contig, - starts = start, - ends = end, - vars_in_array = CountVariantsInChunksBeagle.var_in_original, - vars_in_panel = CountVariantsInChunksBeagle.var_also_in_reference, - valids = CheckChunksBeagle.valid, - basename = output_basename - } - - # if any chunk for any chromosome fail CheckChunks, then we will not impute run any task in the next scatter, - # namely phasing and imputing which would be the most costly to throw 
away - Int n_failed_chunks_int = select_first([error_count_override, read_int(StoreContigLevelChunksInfo.n_failed_chunks)]) - call tasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks { - input: - errorCount = n_failed_chunks_int, - message = "contig " + referencePanelContig.contig + " had " + n_failed_chunks_int + " failing chunks" - } - - scatter (i in range(length(PreChunkVcf.generate_chunk_vcfs))) { - - String chunk_basename = referencePanelContig.contig + "_chunk_" + i - - Int start2 = PreChunkVcf.starts[i] - Int end2 = PreChunkVcf.ends[i] - - call tasks.ExtractIDs as ExtractIdsVcfToImpute { - input: - vcf = SetIdsVcfToImpute.output_vcf[i], - output_basename = "imputed_sites", - for_dependency = FailQCNChunks.done # these shenanigans can be replaced with `after` in wdl 1.1 - } - - call tasks.PhaseAndImputeBeagle { - input: - dataset_vcf = PreChunkVcf.generate_chunk_vcfs[i], - ref_panel_bref3 = referencePanelContig.bref3, - chrom = referencePanelContig.contig, - basename = chunk_basename, - genetic_map_file = referencePanelContig.genetic_map, - start = start2, - end = end2 - } - - call tasks.UpdateHeader { - input: - vcf = PhaseAndImputeBeagle.vcf, - vcf_index = PhaseAndImputeBeagle.vcf_index, - ref_dict = ref_dict, - basename = chunk_basename + "_imputed", - gatk_docker = gatk_docker - } - - call tasks.SeparateMultiallelics { - input: - original_vcf = UpdateHeader.output_vcf, - original_vcf_index = UpdateHeader.output_vcf_index, - output_basename = chunk_basename + "_imputed" - } - - call tasks.RemoveSymbolicAlleles { - input: - original_vcf = SeparateMultiallelics.output_vcf, - original_vcf_index = SeparateMultiallelics.output_vcf_index, - output_basename = chunk_basename + "_imputed", - gatk_docker = gatk_docker - } - - call tasks.SetIDs { - input: - vcf = RemoveSymbolicAlleles.output_vcf, - output_basename = chunk_basename + "_imputed" - } - - call tasks.ExtractIDs { - input: - vcf = SetIDs.output_vcf, - output_basename = "imputed_sites", - 
for_dependency = true - } - - call tasks.FindSitesUniqueToFileTwoOnly { - input: - file1 = ExtractIDs.ids, - file2 = ExtractIdsVcfToImpute.ids - } - - call tasks.SelectVariantsByIds { - input: - vcf = SetIdsVcfToImpute.output_vcf[i], - vcf_index = SetIdsVcfToImpute.output_vcf_index[i], - ids = FindSitesUniqueToFileTwoOnly.missing_sites, - basename = "imputed_sites_to_recover", - gatk_docker = gatk_docker - } - - call tasks.RemoveAnnotations { - input: - vcf = SelectVariantsByIds.output_vcf, - basename = "imputed_sites_to_recover_annotations_removed" - } - - call tasks.InterleaveVariants { - input: - vcfs = [RemoveAnnotations.output_vcf, SetIDs.output_vcf], - basename = output_basename, - gatk_docker = gatk_docker - } - } - - Array[File] chromosome_vcfs = InterleaveVariants.output_vcf - } - - call tasks.GatherVcfs { - input: - input_vcfs = flatten(chromosome_vcfs), - output_vcf_basename = output_basename + ".imputed", - gatk_docker = gatk_docker - } - - call tasks.StoreChunksInfo { - input: - chroms = flatten(chunk_contig), - starts = flatten(start), - ends = flatten(end), - vars_in_array = flatten(CountVariantsInChunksBeagle.var_in_original), - vars_in_panel = flatten(CountVariantsInChunksBeagle.var_also_in_reference), - valids = flatten(CheckChunksBeagle.valid), - basename = output_basename - } - - output { - File imputed_multi_sample_vcf = GatherVcfs.output_vcf - File imputed_multi_sample_vcf_index = GatherVcfs.output_vcf_index - File chunks_info = StoreChunksInfo.chunks_info - } - - meta { - allowNestedInputs: true - } - -} - -struct ReferencePanelContig { - File bed - File bref3 - String contig - File genetic_map -} diff --git a/tasks/broad/ImputationBeagleTasks.wdl b/tasks/broad/ImputationBeagleTasks.wdl new file mode 100644 index 0000000000..d62f5a4e8c --- /dev/null +++ b/tasks/broad/ImputationBeagleTasks.wdl @@ -0,0 +1,181 @@ +version 1.0 + +task CountVariantsInChunks { + input { + File vcf + File vcf_index + File panel_bed_file + + String gatk_docker = 
"us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int cpu = 1 + Int memory_mb = 16000 + Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_bed_file], "GiB")) + 10 + } + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 + + command <<< + set -e -o pipefail + + ln -sf ~{vcf} input.vcf.gz + ln -sf ~{vcf_index} input.vcf.gz.tbi + + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V input.vcf.gz | tail -n 1 > var_in_original + bedtools intersect -a ~{vcf} -b ~{panel_bed_file} | wc -l > var_also_in_reference + >>> + + output { + Int var_in_original = read_int("var_in_original") + Int var_also_in_reference = read_int("var_also_in_reference") + } + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + preemptible: 3 + } +} + +task CheckChunks { + input { + Int var_in_original + Int var_also_in_reference + + String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" + Int cpu = 1 + Int memory_mb = 4000 + } + command <<< + set -e -o pipefail + + if [ $(( ~{var_also_in_reference} * 2 - ~{var_in_original})) -gt 0 ] && [ ~{var_also_in_reference} -gt 3 ]; then + echo true > valid_file.txt + else + echo false > valid_file.txt + fi + >>> + output { + Boolean valid = read_boolean("valid_file.txt") + } + runtime { + docker: bcftools_docker + disks: "local-disk 10 HDD" + memory: "${memory_mb} MiB" + cpu: cpu + preemptible: 3 + } +} + +task Phase { + input { + File dataset_vcf + File ref_panel_bref3 + File genetic_map_file + String basename + String chrom # not needed if ref file has been chunked and you are using the entire chunk + Int start # not needed if ref file has been chunked and you are using the entire chunk + Int end # not needed if ref file has been chunked and you are using the entire chunk + + String beagle_docker = 
"us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" + Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. Changing this value may change the output generated by the tool + Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed + Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter + Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 10 # value may need to be adjusted + } + + command <<< + set -e -o pipefail + + java -ea -Xmx~{xmx_mb}m \ + -jar /usr/gitc/beagle.01Mar24.d36.jar \ + gt=~{dataset_vcf} \ + ref=~{ref_panel_bref3} \ + map=~{genetic_map_file} \ + out=phased_~{basename} \ + chrom=~{chrom}:~{start}-~{end} \ + impute=false \ + nthreads=~{cpu} \ + seed=-99999 + + >>> + output { + File vcf = "phased_~{basename}.vcf.gz" + File log = "phased_~{basename}.log" + } + runtime { + docker: beagle_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + preemptible: 3 + } +} + +task Impute { + input { + File dataset_vcf + File ref_panel_bref3 + File genetic_map_file + String basename + String chrom # not needed if ref file has been chunked and you are using the entire chunk + Int start # not needed if ref file has been chunked and you are using the entire chunk + Int end # not needed if ref file has been chunked and you are using the entire chunk + + String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" + Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. 
Changing this value may change the output generated by the tool + Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed + Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter + Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 10 # value may need to be adjusted + } + + command <<< + set -e -o pipefail + + java -ea -Xmx~{xmx_mb}m \ + -jar /usr/gitc/beagle.01Mar24.d36.jar \ + gt=~{dataset_vcf} \ + ref=~{ref_panel_bref3} \ + map=~{genetic_map_file} \ + out=imputed_~{basename} \ + chrom=~{chrom}:~{start}-~{end} \ + impute=true \ + nthreads=~{cpu} \ + seed=-99999 + + >>> + output { + File vcf = "imputed_~{basename}.vcf.gz" + File log = "imputed_~{basename}.log" + } + runtime { + docker: beagle_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + preemptible: 3 + } +} + +task ErrorWithMessageIfErrorCountNotZero { + input { + Int errorCount + String message + } + command <<< + if [[ ~{errorCount} -gt 0 ]]; then + >&2 echo "Error: ~{message}" + exit 1 + else + exit 0 + fi + >>> + + runtime { + docker: "ubuntu.azurecr.io/ubuntu:20.04" + preemptible: 3 + } + output { + Boolean done = true + } +} diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index dfe16f1a07..5cab7de9ee 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -301,163 +301,6 @@ task Minimac4 { } } -task CountVariantsInChunksBeagle { - input { - File vcf - File vcf_index - File panel_bed_file - - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - Int cpu = 1 - Int memory_mb = 16000 - Int disk_size_gb = 2 * ceil(size([vcf, vcf_index, panel_bed_file], "GiB")) + 10 - } - Int command_mem = memory_mb - 1500 - Int max_heap = memory_mb - 1000 - - command <<< - set -e -o pipefail - - ln -sf ~{vcf} input.vcf.gz - ln -sf ~{vcf_index} input.vcf.gz.tbi - - 
gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" CountVariants -V input.vcf.gz | tail -n 1 > var_in_original - bedtools intersect -a ~{vcf} -b ~{panel_bed_file} | wc -l > var_also_in_reference - >>> - - output { - Int var_in_original = read_int("var_in_original") - Int var_also_in_reference = read_int("var_also_in_reference") - } - runtime { - docker: gatk_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - preemptible: 3 - } -} - -task CheckChunksBeagle { - input { - Int var_in_original - Int var_also_in_reference - - String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" - Int cpu = 1 - Int memory_mb = 4000 - } - command <<< - set -e -o pipefail - - if [ $(( ~{var_also_in_reference} * 2 - ~{var_in_original})) -gt 0 ] && [ ~{var_also_in_reference} -gt 3 ]; then - echo true > valid_file.txt - else - echo false > valid_file.txt - fi - >>> - output { - Boolean valid = read_boolean("valid_file.txt") - } - runtime { - docker: bcftools_docker - disks: "local-disk 10 HDD" - memory: "${memory_mb} MiB" - cpu: cpu - preemptible: 3 - } -} - -task PhaseBeagle { - input { - File dataset_vcf - File ref_panel_bref3 - File genetic_map_file - String basename - String chrom # not needed if ref file has been chunked and you are using the entire chunk - Int start # not needed if ref file has been chunked and you are using the entire chunk - Int end # not needed if ref file has been chunked and you are using the entire chunk - - String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" - Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. 
Changing this value may change the output generated by the tool - Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed - Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter - Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 10 # value may need to be adjusted - } - - command <<< - set -e -o pipefail - - java -ea -Xmx~{xmx_mb}m \ - -jar /usr/gitc/beagle.01Mar24.d36.jar \ - gt=~{dataset_vcf} \ - ref=~{ref_panel_bref3} \ - map=~{genetic_map_file} \ - out=phased_~{basename} \ - chrom=~{chrom}:~{start}-~{end} \ - impute=false \ - nthreads=~{cpu} \ - seed=-99999 - - >>> - output { - File vcf = "phased_~{basename}.vcf.gz" - File log = "phased_~{basename}.log" - } - runtime { - docker: beagle_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - preemptible: 3 - } -} - -task ImputeBeagle { - input { - File dataset_vcf - File ref_panel_bref3 - File genetic_map_file - String basename - String chrom # not needed if ref file has been chunked and you are using the entire chunk - Int start # not needed if ref file has been chunked and you are using the entire chunk - Int end # not needed if ref file has been chunked and you are using the entire chunk - - String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" - Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. 
Changing this value may change the output generated by the tool - Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed - Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter - Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 10 # value may need to be adjusted - } - - command <<< - set -e -o pipefail - - java -ea -Xmx~{xmx_mb}m \ - -jar /usr/gitc/beagle.01Mar24.d36.jar \ - gt=~{dataset_vcf} \ - ref=~{ref_panel_bref3} \ - map=~{genetic_map_file} \ - out=imputed_~{basename} \ - chrom=~{chrom}:~{start}-~{end} \ - impute=true \ - nthreads=~{cpu} \ - seed=-99999 - - >>> - output { - File vcf = "imputed_~{basename}.vcf.gz" - File log = "imputed_~{basename}.log" - } - runtime { - docker: beagle_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - preemptible: 3 - } -} - task GatherVcfs { input { Array[File] input_vcfs @@ -1164,185 +1007,3 @@ task CreateVcfIndex { File vcf_index = "~{vcf_basename}.tbi" } } - -task PreSplitVcf { - input { - Array[String] contigs - File vcf - - Int disk_size_gb = ceil(3*size(vcf, "GiB")) + 10 - Int cpu = 1 - Int memory_mb = 6000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - } - Int command_mem = memory_mb - 1500 - Int max_heap = memory_mb - 1000 - - command { - set -e -o pipefail - - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - IndexFeatureFile -I ~{vcf} - - mkdir split_vcfs - - CONTIG_FILE=~{write_lines(contigs)} - i=0 - - while read -r line; - do - - SPLIT=$(printf "%03d" $i) - echo "SPLIT: $SPLIT" - - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - SelectVariants \ - -V ~{vcf} \ - -L $line \ - -O split_vcfs/split_chr_$SPLIT.vcf.gz - - i=$(($i + 1)) - - done < $CONTIG_FILE - } - runtime { - docker: gatk_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - 
preemptible: 3 - } - output { - Array[File] chr_split_vcfs = glob("split_vcfs/*.vcf.gz") - Array[File] chr_split_vcf_indices = glob("split_vcfs/*.vcf.gz.tbi") - } -} - -task PreChunkVcf { - input { - Int chromosome_length - Int chunk_length - Int chunk_overlap - String chrom - File vcf - File vcf_index - Boolean exclude_filtered = false - - Int disk_size_gb = ceil(4*size(vcf, "GiB")) + 10 - Int cpu = 1 - Int memory_mb = 6000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - } - Int command_mem = memory_mb - 1500 - Int max_heap = memory_mb - 1000 - - command { - set -e -o pipefail - - ln -sf ~{vcf} input.vcf.gz - ln -sf ~{vcf_index} input.vcf.gz.tbi - - mkdir generate_chunk - mkdir subset_vcf - - CHROM_LENGTH=~{chromosome_length} - CHUNK_LENGTH=~{chunk_length} - CHUNK_OVERLAPS=~{chunk_overlap} - i=0 - LOOP_DRIVER=$(( $i * $CHUNK_LENGTH + 1 )) - - while [ $LOOP_DRIVER -lt $CHROM_LENGTH ] - do - START=$(( $i * $CHUNK_LENGTH + 1 )) - START_OVERLAP_CHECK=$(($START - $CHUNK_OVERLAPS)) - if [ $START_OVERLAP_CHECK -lt 1 ]; then - START_WITH_OVERLAPS=$START - else - START_WITH_OVERLAPS=$(($START - $CHUNK_OVERLAPS)) - fi - echo "START: $START" - echo "START WITH OVERLAPS: $START_WITH_OVERLAPS" - - END_CHECK=$(( ($i + 1) * $CHUNK_LENGTH )) - if [ $END_CHECK -gt $CHROM_LENGTH ]; then - END=$CHROM_LENGTH - else - END=$(( ($i + 1) * $CHUNK_LENGTH )) - fi - - END_OVERLAP_CHECK=$(( $END + $CHUNK_OVERLAPS )) - if [ $END_OVERLAP_CHECK -gt $CHROM_LENGTH ]; then - END_WITH_OVERLAPS=$CHROM_LENGTH - else - END_WITH_OVERLAPS=$(( $END + $CHUNK_OVERLAPS )) - fi - echo "END: $END" - echo "END WITH OVERLAPS: $END_WITH_OVERLAPS" - - CHUNK=$(printf "%03d" $i) - echo "CHUNK: $CHUNK" - - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - SelectVariants \ - -V input.vcf.gz \ - --select-type-to-include SNP \ - --max-nocall-fraction 0.1 \ - -xl-select-type SYMBOLIC \ - --select-type-to-exclude MIXED \ - --restrict-alleles-to BIALLELIC \ - -L 
~{chrom}:$START_WITH_OVERLAPS-$END_WITH_OVERLAPS \ - -O generate_chunk/~{chrom}_generate_chunk_$CHUNK.vcf.gz \ - --exclude-filtered true - - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - SelectVariants \ - -V input.vcf.gz \ - -L ~{chrom}:$START-$END \ - -select "POS >= $START" ~{if exclude_filtered then "--exclude-filtered" else ""} \ - -O subset_vcf/~{chrom}_subset_chunk_$CHUNK.vcf.gz - - echo $START >> start.txt - echo $END >> end.txt - - i=$(($i + 1)) - LOOP_DRIVER=$(( $i * $CHUNK_LENGTH + 1 )) - done - } - runtime { - docker: gatk_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - preemptible: 3 - } - output { - Array[File] generate_chunk_vcfs = glob("generate_chunk/*.vcf.gz") - Array[File] generate_chunk_vcf_indices = glob("generate_chunk/*.vcf.gz.tbi") - Array[File] subset_vcfs = glob("subset_vcf/*.vcf.gz") - Array[String] starts = read_lines("start.txt") - Array[String] ends = read_lines("end.txt") - } -} - -task ErrorWithMessageIfErrorCountNotZero { - input { - Int errorCount - String message - } - command <<< - if [[ ~{errorCount} -gt 0 ]]; then - >&2 echo "Error: ~{message}" - exit 1 - else - exit 0 - fi - >>> - - runtime { - docker: "ubuntu.azurecr.io/ubuntu:20.04" - preemptible: 3 - } - output { - Boolean done = true - } -} From eb79037a6d5af5d71fcfc907a2f3f1b5f26bb2d4 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 13 Feb 2025 21:45:09 +0000 Subject: [PATCH 69/92] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 936aff58e9..47ba4982aa 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -13,6 +13,9 @@ MultiSampleSmartSeq2 2.2.22 2024-09-11 AnnotationFiltration 1.2.7 2024-11-04 RNAWithUMIsPipeline 1.0.18 2024-11-04 Imputation 1.1.15 2024-11-04 +ArrayImputationQuotaConsumed 0.0.1 2024-11-13 +ImputationBeagle 0.0.1 2024-11-13 
+LiftoverVcfs 0.0.1 2024-11-13 Arrays 2.6.30 2024-11-04 MultiSampleArrays 1.6.2 2024-08-02 ValidateChip 1.16.7 2024-11-04 From 39fb410f74f817056a220a8e26a47780ef66243a Mon Sep 17 00:00:00 2001 From: "M. Morgan Aster" Date: Mon, 24 Feb 2025 13:59:46 -0500 Subject: [PATCH 70/92] [PR to feature branch] Add testing to imputation beagle (#1503) * TSPS-239 get wdl running on 400k sample ref panel (#1373) * changes to help beagle imputation wdl run on a 400k sample reference panel --------- Co-authored-by: Jose Soto * remove create imputation ref panel beagle wdl and changelog * PR feedback --------- Co-authored-by: Jose Soto Co-authored-by: M. Morgan Aster * add new files for testing * add test wdl to .dockstore.yml * add test data json files, other updates * version to 1.0.0, update changelog * update beagle docker * update beagle docker again * fix call phase task * re-deleting ImputationBeaglePreChunk.wdl * temporarily try to run test on feature branch pr * remove vault inputs * update output basename for plumbing test * remove feature branch from gha pr branches * pr comments * add quotes in VerifyTasks.CompareVcfs * update dockers, move CreateVcfIndex to BeagleTasks --------- Co-authored-by: jsotobroad Co-authored-by: Jose Soto --- .dockstore.yml | 4 + .github/workflows/test_imputation_beagle.yml | 75 ++++++++++++ .../ImputationBeagle.changelog.md | 8 ++ .../imputation_beagle/ImputationBeagle.wdl | 14 +-- .../Plumbing/test_data_NA12878_x50.json | 8 ++ .../Scientific/test_data_NA12878_x50.json | 8 ++ .../imputation/ImputationBeagleStructs.wdl | 8 ++ tasks/broad/ImputationBeagleTasks.wdl | 44 ++++++- tasks/broad/ImputationTasks.wdl | 40 +------ verification/VerifyImputationBeagle.wdl | 79 +++++++++++++ verification/VerifyTasks.wdl | 2 +- .../test-wdls/TestImputationBeagle.wdl | 111 ++++++++++++++++++ 12 files changed, 348 insertions(+), 53 deletions(-) create mode 100644 .github/workflows/test_imputation_beagle.yml create mode 100644 
pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x50.json create mode 100644 pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x50.json create mode 100644 structs/imputation/ImputationBeagleStructs.wdl create mode 100644 verification/VerifyImputationBeagle.wdl create mode 100644 verification/test-wdls/TestImputationBeagle.wdl diff --git a/.dockstore.yml b/.dockstore.yml index 6cc4f04aa9..ca04909139 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -167,6 +167,10 @@ workflows: subclass: WDL primaryDescriptorPath: /verification/test-wdls/TestImputation.wdl + - name: TestImputationBeagle + subclass: WDL + primaryDescriptorPath: /verification/test-wdls/TestImputationBeagle.wdl + - name: TestJointGenotyping subclass: WDL primaryDescriptorPath: /verification/test-wdls/TestJointGenotyping.wdl diff --git a/.github/workflows/test_imputation_beagle.yml b/.github/workflows/test_imputation_beagle.yml new file mode 100644 index 0000000000..f9a627f02f --- /dev/null +++ b/.github/workflows/test_imputation_beagle.yml @@ -0,0 +1,75 @@ +name: Test ImputationBeagle + +# Controls when the workflow will run +on: + pull_request: + branches: [ "develop", "staging", "master" ] + # Only run if files in these paths changed: + #################################### + # SET PIPELINE SPECIFIC PATHS HERE # + #################################### + paths: + - 'pipelines/broad/arrays/imputation_beagle/**' + - 'structs/imputation/ImputationBeagleStructs.wdl' + - 'tasks/broad/ImputationTasks.wdl' + - 'tasks/broad/ImputationBeagleTasks.wdl' + - 'verification/VerifyImputationBeagle.wdl' + - 'verification/test-wdls/TestImputationBeagle.wdl' + - 'tasks/broad/Utilities.wdl' + - 'tasks/broad/TerraCopyFilesFromCloudToCloud.wdl' + - '.github/workflows/test_imputation_beagle.yml' + - '.github/workflows/warp_test_workflow.yml' + + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + inputs: + useCallCache: + 
description: 'Use call cache (default: true)' + required: false + default: "true" + updateTruth: + description: 'Update truth files (default: false)' + required: false + default: "false" + testType: + description: 'Specify the type of test (Plumbing or Scientific)' + required: false + type: choice + options: + - Plumbing + - Scientific + truthBranch: + description: 'Specify the branch for truth files (default: master)' + required: false + default: "master" + +env: + # pipeline configuration + PIPELINE_NAME: TestImputationBeagle + DOCKSTORE_PIPELINE_NAME: ImputationBeagle + PIPELINE_DIR: "pipelines/broad/arrays/imputation_beagle" + + # workspace configuration + TESTING_WORKSPACE: WARP Tests + WORKSPACE_NAMESPACE: warp-pipelines + + # service account configuration + SA_JSON_B64: ${{ secrets.PDT_TESTER_SA_B64 }} + USER: pdt-tester@warp-pipeline-dev.iam.gserviceaccount.com + + +jobs: + TestImputationBeagle: + uses: ./.github/workflows/warp_test_workflow.yml + with: + pipeline_name: TestImputationBeagle + dockstore_pipeline_name: ImputationBeagle + pipeline_dir: pipelines/broad/arrays/imputation_beagle + use_call_cache: ${{ github.event.inputs.useCallCache || 'true' }} + update_truth: ${{ github.event.inputs.updateTruth || 'false' }} + test_type: ${{ github.event.inputs.testType }} + truth_branch: ${{ github.event.inputs.truthBranch || 'master' }} + secrets: + PDT_TESTER_SA_B64: ${{ secrets.PDT_TESTER_SA_B64 }} + DOCKSTORE_TOKEN: ${{ secrets.DOCKSTORE_TOKEN }} diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md index 336273806b..927e821893 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md @@ -1,3 +1,11 @@ +# 1.0.0 +2025-02-20 (Date of Last Commit) + +* * Initial public release of the ImputationBeagle pipeline. 
+ + * The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using a large genomic reference panel. It is based on the Michigan Imputation Server pipeline but uses the Beagle imputation tool instead of minimac. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. It outputs the imputed VCF along with key imputation metrics. + + # 0.0.1 2024-11-13 (Date of Last Commit) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index ecd073524f..6d312f6bda 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -1,11 +1,12 @@ version 1.0 +import "../../../../structs/imputation/ImputationBeagleStructs.wdl" as structs import "../../../../tasks/broad/ImputationTasks.wdl" as tasks import "../../../../tasks/broad/ImputationBeagleTasks.wdl" as beagleTasks workflow ImputationBeagle { - String pipeline_version = "0.0.1" + String pipeline_version = "1.0.0" input { Int chunkLength = 25000000 @@ -34,7 +35,7 @@ workflow ImputationBeagle { vcf = multi_sample_vcf } - call tasks.CreateVcfIndex { + call beagleTasks.CreateVcfIndex { input: vcf_input = multi_sample_vcf, gatk_docker = gatk_docker @@ -158,7 +159,7 @@ workflow ImputationBeagle { memory_mb = beagle_impute_memory_in_gb * 1024 } - call tasks.CreateVcfIndex as IndexImputedBeagle { + call beagleTasks.CreateVcfIndex as IndexImputedBeagle { input: vcf_input = Impute.vcf, gatk_docker = gatk_docker @@ -222,10 +223,3 @@ workflow ImputationBeagle { } } - -struct ReferencePanelContig { - File bed - File bref3 - String contig - File genetic_map -} diff --git a/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x50.json b/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x50.json new file mode 100644 index 0000000000..e23a1cfa8e --- /dev/null +++ 
b/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x50.json @@ -0,0 +1,8 @@ +{ + "ImputationBeagle.multi_sample_vcf": "gs://broad-gotc-test-storage/imputation_beagle/scientific/vcfs/NA12878_50_duplicate.clean.vcf.gz", + "ImputationBeagle.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ImputationBeagle.reference_panel_path_prefix": "gs://broad-gotc-test-storage/imputation_beagle/scientific/1000G_HGDP_no_singletons_reference_panel/hgdp.tgp.gwaspy.AN_added.bcf.ac2", + "ImputationBeagle.contigs": ["chr21","chr22"], + "ImputationBeagle.genetic_maps_path": "gs://broad-gotc-test-storage/imputation_beagle/scientific/plink-genetic-maps/", + "ImputationBeagle.output_basename": "plumbing_test" +} diff --git a/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x50.json b/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x50.json new file mode 100644 index 0000000000..bb23a985b2 --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x50.json @@ -0,0 +1,8 @@ +{ + "ImputationBeagle.multi_sample_vcf": "gs://broad-gotc-test-storage/imputation_beagle/scientific/vcfs/NA12878_50_duplicate.clean.vcf.gz", + "ImputationBeagle.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ImputationBeagle.reference_panel_path_prefix": "gs://broad-gotc-test-storage/imputation_beagle/scientific/1000G_HGDP_no_singletons_reference_panel/hgdp.tgp.gwaspy.AN_added.bcf.ac2", + "ImputationBeagle.contigs": ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"], + "ImputationBeagle.genetic_maps_path": "gs://broad-gotc-test-storage/imputation_beagle/scientific/plink-genetic-maps/", + "ImputationBeagle.output_basename": "scientific_test" +} diff --git 
a/structs/imputation/ImputationBeagleStructs.wdl b/structs/imputation/ImputationBeagleStructs.wdl new file mode 100644 index 0000000000..6aaaaa061a --- /dev/null +++ b/structs/imputation/ImputationBeagleStructs.wdl @@ -0,0 +1,8 @@ +version 1.0 + +struct ReferencePanelContig { + File bed + File bref3 + String contig + File genetic_map +} diff --git a/tasks/broad/ImputationBeagleTasks.wdl b/tasks/broad/ImputationBeagleTasks.wdl index d62f5a4e8c..123090f768 100644 --- a/tasks/broad/ImputationBeagleTasks.wdl +++ b/tasks/broad/ImputationBeagleTasks.wdl @@ -77,7 +77,7 @@ task Phase { Int start # not needed if ref file has been chunked and you are using the entire chunk Int end # not needed if ref file has been chunked and you are using the entire chunk - String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" + String beagle_docker = "us.gcr.io/broad-gotc-prod/imputation-beagle:1.0.0-17Dec24.224-1740423035" Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. 
Changing this value may change the output generated by the tool Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter @@ -88,7 +88,7 @@ task Phase { set -e -o pipefail java -ea -Xmx~{xmx_mb}m \ - -jar /usr/gitc/beagle.01Mar24.d36.jar \ + -jar /usr/gitc/beagle.17Dec24.224.jar \ gt=~{dataset_vcf} \ ref=~{ref_panel_bref3} \ map=~{genetic_map_file} \ @@ -122,7 +122,7 @@ task Impute { Int start # not needed if ref file has been chunked and you are using the entire chunk Int end # not needed if ref file has been chunked and you are using the entire chunk - String beagle_docker = "us-central1-docker.pkg.dev/morgan-fieldeng-gcp/imputation-beagle-development/imputation-beagle:0.0.1-01Mar24.d36-wip-temp-20240301" + String beagle_docker = "us.gcr.io/broad-gotc-prod/imputation-beagle:1.0.0-17Dec24.224-1740423035" Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. 
Changing this value may change the output generated by the tool Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter @@ -133,7 +133,7 @@ task Impute { set -e -o pipefail java -ea -Xmx~{xmx_mb}m \ - -jar /usr/gitc/beagle.01Mar24.d36.jar \ + -jar /usr/gitc/beagle.17Dec24.224.jar \ gt=~{dataset_vcf} \ ref=~{ref_panel_bref3} \ map=~{genetic_map_file} \ @@ -172,10 +172,44 @@ task ErrorWithMessageIfErrorCountNotZero { >>> runtime { - docker: "ubuntu.azurecr.io/ubuntu:20.04" + docker: "ubuntu:20.04" preemptible: 3 } output { Boolean done = true } } + +task CreateVcfIndex { + input { + File vcf_input + + Int disk_size_gb = ceil(1.2*size(vcf_input, "GiB")) + 10 + Int cpu = 1 + Int memory_mb = 6000 + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 + + String vcf_basename = basename(vcf_input) + + command { + set -e -o pipefail + + ln -sf ~{vcf_input} ~{vcf_basename} + + bcftools index -t ~{vcf_basename} + } + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} HDD" + memory: "${memory_mb} MiB" + cpu: cpu + preemptible: 3 + } + output { + File vcf = "~{vcf_basename}" + File vcf_index = "~{vcf_basename}.tbi" + } +} diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 5cab7de9ee..f64b902906 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -5,7 +5,7 @@ task CalculateChromosomeLength { File ref_dict String chrom - String ubuntu_docker = "ubuntu.azurecr.io/ubuntu:20.04" + String ubuntu_docker = "ubuntu:20.04" Int memory_mb = 2000 Int cpu = 1 Int disk_size_gb = ceil(2*size(ref_dict, "GiB")) + 5 @@ -33,7 +33,7 @@ task GetMissingContigList { File ref_dict File included_contigs - String ubuntu_docker = 
"ubuntu.azurecr.io/ubuntu:20.04" + String ubuntu_docker = "ubuntu:20.04" Int memory_mb = 2000 Int cpu = 1 Int disk_size_gb = ceil(2*size(ref_dict, "GiB")) + 5 @@ -919,7 +919,7 @@ task FindSitesUniqueToFileTwoOnly { File file1 File file2 - String ubuntu_docker = "ubuntu.azurecr.io/ubuntu:20.04" + String ubuntu_docker = "ubuntu:20.04" Int cpu = 1 Int memory_mb = 4000 Int disk_size_gb = ceil(size(file1, "GiB") + 2*size(file2, "GiB")) + 10 @@ -973,37 +973,3 @@ task SplitMultiSampleVcf { Array[File] single_sample_vcf_indices = glob("out_dir/*.vcf.gz.tbi") } } - -task CreateVcfIndex { - input { - File vcf_input - - Int disk_size_gb = ceil(1.2*size(vcf_input, "GiB")) + 10 - Int cpu = 1 - Int memory_mb = 6000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - } - Int command_mem = memory_mb - 1500 - Int max_heap = memory_mb - 1000 - - String vcf_basename = basename(vcf_input) - - command { - set -e -o pipefail - - ln -sf ~{vcf_input} ~{vcf_basename} - - bcftools index -t ~{vcf_basename} - } - runtime { - docker: gatk_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - preemptible: 3 - } - output { - File vcf = "~{vcf_basename}" - File vcf_index = "~{vcf_basename}.tbi" - } -} diff --git a/verification/VerifyImputationBeagle.wdl b/verification/VerifyImputationBeagle.wdl new file mode 100644 index 0000000000..e99f1767c1 --- /dev/null +++ b/verification/VerifyImputationBeagle.wdl @@ -0,0 +1,79 @@ +version 1.0 + +import "../verification/VerifyTasks.wdl" as Tasks + +## Copyright Broad Institute, 2018 +## +## This WDL script is designed to verify (compare) the outputs of an ArrayWf wdl. +## +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). 
Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +workflow VerifyImputationBeagle { + input { + Array[File] truth_metrics + Array[File] test_metrics + + File truth_vcf + File test_vcf + File test_vcf_index + File truth_vcf_index + + Boolean? done + } + + String bcftools_docker_tag = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" + + scatter (idx in range(length(truth_metrics))) { + call CompareImputationMetrics { + input: + test_metrics = test_metrics[idx], + truth_metrics = truth_metrics[idx] + } + } + + call Tasks.CompareVcfs as CompareOutputVcfs { + input: + file1 = truth_vcf, + file2 = test_vcf, + patternForLinesToExcludeFromComparison = "##" # ignore headers + } + + output { + } + meta { + allowNestedInputs: true + } +} + +task CompareImputationMetrics { + input { + File test_metrics + File truth_metrics + } + command <<< + set -eo pipefail + diff "~{test_metrics}" "~{truth_metrics}" + + if [ $? -ne 0 ]; + then + echo "Error: ${test_metrics} and ${truth_metrics} differ" + fi + >>> + + runtime { + docker: "ubuntu:20.04" + cpu: 1 + memory: "3.75 GiB" + disks: "local-disk 10 HDD" + } +} diff --git a/verification/VerifyTasks.wdl b/verification/VerifyTasks.wdl index 43bb9b4340..46ed24373c 100644 --- a/verification/VerifyTasks.wdl +++ b/verification/VerifyTasks.wdl @@ -10,7 +10,7 @@ task CompareVcfs { command { set -eo pipefail - if [ -z ~{patternForLinesToExcludeFromComparison} ]; then + if [ -z '~{patternForLinesToExcludeFromComparison}' ]; then diff <(gunzip -c -f ~{file1}) <(gunzip -c -f ~{file2}) else echo "It's defined!" 
diff --git a/verification/test-wdls/TestImputationBeagle.wdl b/verification/test-wdls/TestImputationBeagle.wdl new file mode 100644 index 0000000000..2d56d858aa --- /dev/null +++ b/verification/test-wdls/TestImputationBeagle.wdl @@ -0,0 +1,111 @@ +version 1.0 + + +import "../../pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl" as ImputationBeagle +import "../../verification/VerifyImputationBeagle.wdl" as VerifyImputationBeagle +import "../../tasks/broad/Utilities.wdl" as Utilities +import "../../tasks/broad/TerraCopyFilesFromCloudToCloud.wdl" as Copy + +workflow TestImputationBeagle { + + input { + Int chunkLength = 25000000 + Int chunkOverlaps = 5000000 # this is the padding that will be added to the beginning and end of each chunk to reduce edge effects + + File multi_sample_vcf + + File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths + Array[String] contigs + String reference_panel_path_prefix # path + file prefix to the bucket where the reference panel files are stored for all contigs + String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs + String output_basename # the basename for intermediate and output files + + # These values will be determined and injected into the inputs by the scala test framework + String truth_path + String results_path + Boolean update_truth + } + + meta { + allowNestedInputs: true + } + + call ImputationBeagle.ImputationBeagle { + input: + chunkLength = chunkLength, + chunkOverlaps = chunkOverlaps, + multi_sample_vcf = multi_sample_vcf, + ref_dict = ref_dict, + contigs = contigs, + reference_panel_path_prefix = reference_panel_path_prefix, + genetic_maps_path = genetic_maps_path, + output_basename = output_basename, + } + + + # Collect all of the pipeline outputs into single Array[String] + Array[String] pipeline_outputs = flatten([ + [ # File outputs + ImputationBeagle.imputed_multi_sample_vcf, + 
ImputationBeagle.imputed_multi_sample_vcf_index, + ] + ]) + + + # Collect all of the pipeline metrics into single Array[String] + Array[String] pipeline_metrics = flatten([ + [ # File outputs + ImputationBeagle.chunks_info, + ] + ]) + + # Copy results of pipeline to test results bucket + call Copy.TerraCopyFilesFromCloudToCloud as CopyToTestResults { + input: + files_to_copy = flatten([pipeline_outputs, pipeline_metrics]), + destination_cloud_path = results_path + } + + # If updating truth then copy output to truth bucket + if (update_truth){ + call Copy.TerraCopyFilesFromCloudToCloud as CopyToTruth { + input: + files_to_copy = flatten([pipeline_outputs, pipeline_metrics]), + destination_cloud_path = truth_path + } + } + + # This is achieved by passing each desired file/array[files] to GetValidationInputs + if (!update_truth){ + call Utilities.GetValidationInputs as GetMetrics { + input: + input_files = pipeline_metrics, + results_path = results_path, + truth_path = truth_path + } + call Utilities.GetValidationInputs as GetVcf { + input: + input_file = ImputationBeagle.imputed_multi_sample_vcf, + results_path = results_path, + truth_path = truth_path + } + call Utilities.GetValidationInputs as GetVcfIndex { + input: + input_file = ImputationBeagle.imputed_multi_sample_vcf_index, + results_path = results_path, + truth_path = truth_path + } + + + call VerifyImputationBeagle.VerifyImputationBeagle as Verify { + input: + truth_metrics = GetMetrics.truth_files, + test_metrics = GetMetrics.results_files, + truth_vcf = GetVcf.truth_file, + test_vcf = GetVcf.results_file, + truth_vcf_index = GetVcfIndex.truth_file, + test_vcf_index = GetVcfIndex.results_file, + done = CopyToTestResults.done + } + } +} From bd75c0c0aeb8593198290088b32e977b91527cf0 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 24 Feb 2025 19:00:07 +0000 Subject: [PATCH 71/92] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 47ba4982aa..b41c876333 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -14,7 +14,7 @@ AnnotationFiltration 1.2.7 2024-11-04 RNAWithUMIsPipeline 1.0.18 2024-11-04 Imputation 1.1.15 2024-11-04 ArrayImputationQuotaConsumed 0.0.1 2024-11-13 -ImputationBeagle 0.0.1 2024-11-13 +ImputationBeagle 1.0.0 2025-02-20 LiftoverVcfs 0.0.1 2024-11-13 Arrays 2.6.30 2024-11-04 MultiSampleArrays 1.6.2 2024-08-02 From 8aa7f54f21d838073a040130f935473315d90550 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 24 Feb 2025 14:52:39 -0500 Subject: [PATCH 72/92] remove newline at end of Utilities.wdl --- tasks/broad/Utilities.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/broad/Utilities.wdl b/tasks/broad/Utilities.wdl index 52121a74a9..e6a1aeec17 100644 --- a/tasks/broad/Utilities.wdl +++ b/tasks/broad/Utilities.wdl @@ -300,4 +300,4 @@ task GetValidationInputs { Array[String] results_files = read_lines("results_files.txt") } -} +} \ No newline at end of file From b98463ac86fec24a5beb9cc10c000470084fb48f Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 24 Feb 2025 15:55:56 -0500 Subject: [PATCH 73/92] remove LiftoverVcfs, add README for imputation_beagle --- .dockstore.yml | 4 - .../LiftoverVcfs.changelog.md | 4 - .../arrays/imputation_beagle/LiftoverVcfs.wdl | 101 ------------------ 3 files changed, 109 deletions(-) delete mode 100644 pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.changelog.md delete mode 100644 pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl diff --git a/.dockstore.yml b/.dockstore.yml index ca04909139..08449df04d 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -87,10 +87,6 @@ workflows: subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl - - name: LiftoverVcfs - subclass: WDL - primaryDescriptorPath: 
/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl - - name: ArrayImputationQuotaConsumed subclass: WDL primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.changelog.md b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.changelog.md deleted file mode 100644 index 336273806b..0000000000 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.changelog.md +++ /dev/null @@ -1,4 +0,0 @@ -# 0.0.1 -2024-11-13 (Date of Last Commit) - -* Pipeline still in developmental state diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl deleted file mode 100644 index fa57933f9b..0000000000 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ /dev/null @@ -1,101 +0,0 @@ -version 1.0 - -# Liftover VCFs from hg19 to hg38 -workflow LiftoverVcfs { - - String pipeline_version = "0.0.1" - - input { - File vcf_path - File vcf_index_path - - File liftover_chain - - String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.1" - Int min_disk_size = 100 - Int mem_gb = 16 - - File hg38_reference_fasta - File hg38_reference_fasta_index - File hg38_reference_dict - - Int max_retries = 3 - Int preemptible_tries = 3 - } - - String vcf_basename = basename(vcf_path, ".vcf.gz") - - # Lift over the array to hg38. 
- call LiftOverArrays { - input: - input_vcf = vcf_path, - input_vcf_index = vcf_index_path, - liftover_chain = liftover_chain, - reference_fasta = hg38_reference_fasta, - reference_dict = hg38_reference_dict, - output_basename = vcf_basename, - docker = docker, - max_retries = max_retries, - preemptible_tries = preemptible_tries, - min_disk_size = min_disk_size, - mem_gb = mem_gb - } - - output { - File hg38_vcf = LiftOverArrays.lifted_over_vcf - File hg38_vcf_index = LiftOverArrays.lifted_over_vcf_index - } -} - -task LiftOverArrays { - input { - File input_vcf - File input_vcf_index - File liftover_chain - File reference_fasta - File reference_dict - String output_basename - String docker - Int max_retries - Int preemptible_tries - Int min_disk_size - Int mem_gb - } - - Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 - Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size - Int max_mem_gb = mem_gb - 1 - - command <<< - set -euo pipefail - - gatk --java-options "-Xms4g -Xmx~{max_mem_gb}g" \ - LiftoverVcf \ - --INPUT ~{input_vcf} \ - --OUTPUT ~{output_basename}.liftedover.vcf \ - --CHAIN ~{liftover_chain} \ - --REJECT ~{output_basename}.rejected_variants.vcf \ - --REFERENCE_SEQUENCE ~{reference_fasta} \ - --MAX_RECORDS_IN_RAM 100000 - - # compress vcf - this creates a file with .gz suffix - bgzip ~{output_basename}.liftedover.vcf - - # generate new index - this creates a file with .tbi suffix - tabix ~{output_basename}.liftedover.vcf.gz - >>> - - runtime { - docker: docker - memory: "~{mem_gb} GiB" - cpu: "1" - disks: "local-disk ~{disk_size} HDD" - maxRetries: max_retries - preemptible: preemptible_tries - } - - output { - File lifted_over_vcf = "~{output_basename}.liftedover.vcf.gz" - File lifted_over_vcf_index = "~{output_basename}.liftedover.vcf.gz.tbi" - } -} From 0d21384d57b8a758d41a0c7ddec44c3ca3ef5c2d Mon Sep 17 00:00:00 2001 
From: Morgan Taylor Date: Mon, 24 Feb 2025 15:56:14 -0500 Subject: [PATCH 74/92] oops this commit adds the README for imputation_beagle --- pipelines/broad/arrays/imputation_beagle/README.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 pipelines/broad/arrays/imputation_beagle/README.md diff --git a/pipelines/broad/arrays/imputation_beagle/README.md b/pipelines/broad/arrays/imputation_beagle/README.md new file mode 100644 index 0000000000..67f5421d93 --- /dev/null +++ b/pipelines/broad/arrays/imputation_beagle/README.md @@ -0,0 +1,7 @@ +### ImputationBeagle summary + +The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using the [Beagle imputation tool](https://faculty.washington.edu/browning/beagle/beagle.html) and a large genomic reference panel. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. + +### ArrayImputationQuotaConsumed summary + +The ArrayImputationQuotaConsumed pipeline is used by the All of Us/AnVIL Imputation Service and calculates the number of samples in the input multi-sample VCF, which is the metric used by the service for ImputationBeagle pipeline quota. 
From 38121aca3aca2bb5b18279b8794bcc2c97a4b44a Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 24 Feb 2025 16:13:45 -0500 Subject: [PATCH 75/92] rename test inputs files to reflect contents --- .../{test_data_NA12878_x50.json => test_data_NA12878_x10.json} | 2 +- .../{test_data_NA12878_x50.json => test_data_NA12878_x10.json} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/{test_data_NA12878_x50.json => test_data_NA12878_x10.json} (87%) rename pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/{test_data_NA12878_x50.json => test_data_NA12878_x10.json} (89%) diff --git a/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x50.json b/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x10.json similarity index 87% rename from pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x50.json rename to pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x10.json index e23a1cfa8e..bdf5a00597 100644 --- a/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x50.json +++ b/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x10.json @@ -1,5 +1,5 @@ { - "ImputationBeagle.multi_sample_vcf": "gs://broad-gotc-test-storage/imputation_beagle/scientific/vcfs/NA12878_50_duplicate.clean.vcf.gz", + "ImputationBeagle.multi_sample_vcf": "gs://broad-gotc-test-storage/imputation_beagle/scientific/vcfs/NA12878_10_duplicate.merged.cleaned.vcf.gz", "ImputationBeagle.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", "ImputationBeagle.reference_panel_path_prefix": "gs://broad-gotc-test-storage/imputation_beagle/scientific/1000G_HGDP_no_singletons_reference_panel/hgdp.tgp.gwaspy.AN_added.bcf.ac2", "ImputationBeagle.contigs": ["chr21","chr22"], diff --git 
a/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x50.json b/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x10.json similarity index 89% rename from pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x50.json rename to pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x10.json index bb23a985b2..4263609e29 100644 --- a/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x50.json +++ b/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x10.json @@ -1,5 +1,5 @@ { - "ImputationBeagle.multi_sample_vcf": "gs://broad-gotc-test-storage/imputation_beagle/scientific/vcfs/NA12878_50_duplicate.clean.vcf.gz", + "ImputationBeagle.multi_sample_vcf": "gs://broad-gotc-test-storage/imputation_beagle/scientific/vcfs/NA12878_10_duplicate.merged.cleaned.vcf.gz", "ImputationBeagle.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", "ImputationBeagle.reference_panel_path_prefix": "gs://broad-gotc-test-storage/imputation_beagle/scientific/1000G_HGDP_no_singletons_reference_panel/hgdp.tgp.gwaspy.AN_added.bcf.ac2", "ImputationBeagle.contigs": ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"], From 1135a0878c92bee2a04b0456c92d8501e3266f27 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 24 Feb 2025 16:23:21 -0500 Subject: [PATCH 76/92] PR comments round 1 --- .../arrays/imputation/Imputation.changelog.md | 5 +++ .../broad/arrays/imputation/Imputation.wdl | 2 +- .../ArrayImputationQuotaConsumed.changelog.md | 6 ++-- .../ArrayImputationQuotaConsumed.wdl | 7 +--- .../ImputationBeagle.changelog.md | 10 ++---- .../imputation_beagle/ImputationBeagle.wdl | 2 +- tasks/broad/ImputationTasks.wdl | 35 ------------------- 7 files changed, 13 
insertions(+), 54 deletions(-) diff --git a/pipelines/broad/arrays/imputation/Imputation.changelog.md b/pipelines/broad/arrays/imputation/Imputation.changelog.md index 52765e4ec1..5030cf3f05 100644 --- a/pipelines/broad/arrays/imputation/Imputation.changelog.md +++ b/pipelines/broad/arrays/imputation/Imputation.changelog.md @@ -1,3 +1,8 @@ +# 1.1.16 +2025-02-24 (Date of Last Commit) + +* Updated runtime parameters in some ImputationTasks, and added an explicit definition of a vcf_index. + # 1.1.15 2024-11-04 (Date of Last Commit) diff --git a/pipelines/broad/arrays/imputation/Imputation.wdl b/pipelines/broad/arrays/imputation/Imputation.wdl index 058f0066de..3466169b64 100644 --- a/pipelines/broad/arrays/imputation/Imputation.wdl +++ b/pipelines/broad/arrays/imputation/Imputation.wdl @@ -6,7 +6,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow Imputation { - String pipeline_version = "1.1.15" + String pipeline_version = "1.1.16" input { Int chunkLength = 25000000 diff --git a/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.changelog.md b/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.changelog.md index 336273806b..978888b711 100644 --- a/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.changelog.md +++ b/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.changelog.md @@ -1,4 +1,4 @@ -# 0.0.1 -2024-11-13 (Date of Last Commit) +# 1.0.0 +2025-02-24 (Date of Last Commit) -* Pipeline still in developmental state +* Initial release of pipeline to calculate the number of samples, i.e. quota used by an imputation service that uses ImputationBeagle.wdl. 
diff --git a/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl b/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl index c14f16158f..a4cd6e8d09 100644 --- a/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ArrayImputationQuotaConsumed.wdl @@ -3,7 +3,7 @@ version 1.0 import "../../../../tasks/broad/ImputationTasks.wdl" as tasks workflow QuotaConsumed { - String pipeline_version = "0.0.1" + String pipeline_version = "1.0.0" input { Int chunkLength = 25000000 @@ -16,11 +16,6 @@ workflow QuotaConsumed { String reference_panel_path_prefix String genetic_maps_path String output_basename - Boolean split_output_to_single_sample = false - - # file extensions used to find reference panel files - String interval_list_suffix = ".interval_list" - String bref3_suffix = ".bref3" } call tasks.CountSamples { diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md index 927e821893..a383a0cf10 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md @@ -1,12 +1,6 @@ # 1.0.0 -2025-02-20 (Date of Last Commit) +2025-02-24 (Date of Last Commit) * * Initial public release of the ImputationBeagle pipeline. - * The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using a large genomic reference panel. It is based on the Michigan Imputation Server pipeline but uses the Beagle imputation tool instead of minimac. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. It outputs the imputed VCF along with key imputation metrics. 
- - -# 0.0.1 -2024-11-13 (Date of Last Commit) - -* Pipeline still in developmental state + * The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using a large genomic reference panel. It is based on the Michigan Imputation Server pipeline but uses the Beagle imputation tool instead of minimac. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. It outputs the imputed VCF. diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 6d312f6bda..342f1949f5 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -127,7 +127,7 @@ workflow ImputationBeagle { String chunk_basename_imputed = referencePanelContig.contig + "_chunk_" + i + "_imputed" # max amount of cpus you can ask for is 96 so at a max of 10k samples we can only ask for 9 cpu a sample. - # these values are based on trying to optimize for pre-emptibility using a 400k sample referene panel + # these values are based on trying to optimize for pre-emptibility using a 400k sample reference panel # and up to a 10k sample input vcf Int beagle_cpu = if (CountSamples.nSamples <= 1000) then 8 else floor(CountSamples.nSamples / 1000) * 9 Int beagle_phase_memory_in_gb = if (CountSamples.nSamples <= 1000) then 22 else ceil(beagle_cpu * 1.5) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index f64b902906..de729bf33d 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -58,41 +58,6 @@ task GetMissingContigList { } } -task CreateRefPanelIntervalLists { - input { - File ref_panel_vcf - File ref_panel_vcf_index - - Int disk_size_gb = ceil(2*size(ref_panel_vcf, "GiB")) + 10 - Int cpu = 1 - Int memory_mb = 6000 - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - } - - Int command_mem = memory_mb - 1500 - Int max_heap = 
memory_mb - 1000 - - String basename = basename(ref_panel_vcf, '.vcf.gz') - - command { - gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ - VcfToIntervalList \ - -I ~{ref_panel_vcf} \ - -O ~{basename}.interval_list - } - - output { - File interval_list = "~{basename}.interval_list" - } - - runtime { - docker: gatk_docker - disks: "local-disk ${disk_size_gb} HDD" - memory: "${memory_mb} MiB" - cpu: cpu - } -} - task GenerateChunk { input { Int start From bab5a2a4f2c96d9d8a28c94cfb21f09f9eeb69a8 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 24 Feb 2025 21:23:47 +0000 Subject: [PATCH 77/92] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index b41c876333..ac0abaa7ff 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -12,10 +12,9 @@ SlideSeq 3.4.9 2025-02-12 MultiSampleSmartSeq2 2.2.22 2024-09-11 AnnotationFiltration 1.2.7 2024-11-04 RNAWithUMIsPipeline 1.0.18 2024-11-04 -Imputation 1.1.15 2024-11-04 -ArrayImputationQuotaConsumed 0.0.1 2024-11-13 -ImputationBeagle 1.0.0 2025-02-20 -LiftoverVcfs 0.0.1 2024-11-13 +Imputation 1.1.16 2025-02-24 +ArrayImputationQuotaConsumed 1.0.0 2025-02-24 +ImputationBeagle 1.0.0 2025-02-24 Arrays 2.6.30 2024-11-04 MultiSampleArrays 1.6.2 2024-08-02 ValidateChip 1.16.7 2024-11-04 From 4d97dc490a0dbcbf10a07be1d972812a38fa2650 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 24 Feb 2025 16:29:15 -0500 Subject: [PATCH 78/92] update changelog for BroadInternalImputation --- .../arrays/imputation/BroadInternalImputation.changelog.md | 5 +++++ .../internal/arrays/imputation/BroadInternalImputation.wdl | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md index 
a0930046d7..e4f328a7fb 100644 --- a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md +++ b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md @@ -1,3 +1,8 @@ +# 1.1.15 +2025-02-24 (Date of Last Commit) + +* Updated runtime parameters in some ImputationTasks, and added an explicit definition of a vcf_index. + # 1.1.14 2024-11-04 (Date of Last Commit) diff --git a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl index 525ce85e00..27e16fa28e 100644 --- a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl +++ b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl @@ -9,7 +9,7 @@ workflow BroadInternalImputation { description: "Push outputs of Imputation.wdl to TDR dataset table ImputationOutputsTable and split out Imputation arrays into ImputationWideOutputsTable." allowNestedInputs: true } - String pipeline_version = "1.1.14" + String pipeline_version = "1.1.15" input { # inputs to wrapper task From b30a1dbbf9622307d2a47420695a6a1939358164 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 24 Feb 2025 21:29:47 +0000 Subject: [PATCH 79/92] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index ac0abaa7ff..294b90c791 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -31,7 +31,7 @@ GDCWholeGenomeSomaticSingleSample 1.3.4 2024-11-04 UltimaGenomicsWholeGenomeCramOnly 1.0.23 2024-11-04 CheckFingerprint 1.0.22 2024-10-28 BroadInternalRNAWithUMIs 1.0.36 2024-11-04 -BroadInternalImputation 1.1.14 2024-11-04 +BroadInternalImputation 1.1.15 2025-02-24 BroadInternalArrays 1.1.14 2024-11-04 BroadInternalUltimaGenomics 1.1.3 2024-12-05 IlluminaGenotypingArray 1.12.24 2024-11-04 From 
64d66aca56200bb9a0b39f2d82fe4b114634e61f Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 24 Feb 2025 16:37:19 -0500 Subject: [PATCH 80/92] add back newline to Utilities.wdl with -w flag on changed file check --- scripts/validate_release.sh | 3 ++- tasks/broad/Utilities.wdl | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/validate_release.sh b/scripts/validate_release.sh index b9277c8498..036ea41445 100755 --- a/scripts/validate_release.sh +++ b/scripts/validate_release.sh @@ -13,7 +13,8 @@ source $( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )/common. function check_if_file_changed() { local -r file=${1} commit=${2} # the result is flipped because git diff-index returns 0 if the file is not changed - ! git diff-index --quiet ${commit} ${file} + # ignore whitespace changes (-w) + ! git diff-index -w --quiet ${commit} ${file} } function get_version_from_workflow() { diff --git a/tasks/broad/Utilities.wdl b/tasks/broad/Utilities.wdl index e6a1aeec17..52121a74a9 100644 --- a/tasks/broad/Utilities.wdl +++ b/tasks/broad/Utilities.wdl @@ -300,4 +300,4 @@ task GetValidationInputs { Array[String] results_files = read_lines("results_files.txt") } -} \ No newline at end of file +} From 3e81fed15ec00dfd1286381525084a8b4042172c Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 24 Feb 2025 16:43:43 -0500 Subject: [PATCH 81/92] remove change to Minimac4 task --- tasks/broad/ImputationTasks.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index de729bf33d..2c19713924 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -233,7 +233,7 @@ task Minimac4 { String minimac4_docker = "us.gcr.io/broad-gotc-prod/imputation-minimac4:1.0.6-1.0.2-1663948783" Int cpu = 1 Int memory_mb = 4000 - Int disk_size_gb = ceil(size(ref_panel, "GiB") + 2*size(phased_vcf, "GiB")) + 10 + Int disk_size_gb = ceil(size(ref_panel, 
"GiB") + 2*size(phased_vcf, "GiB")) + 50 } command <<< set -e -o pipefail From 295a3ada0a1aae224e969bbade181052e018f4d5 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Mon, 24 Feb 2025 16:46:30 -0500 Subject: [PATCH 82/92] revert change to tool command in OptionalQCSites --- tasks/broad/ImputationTasks.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 2c19713924..1fed4d6972 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -460,7 +460,7 @@ task OptionalQCSites { ln -sf ~{input_vcf_index} input.vcf.gz.tbi # site missing rate < 5% ; hwe p > 1e-6 - tools --gzvcf input.vcf.gz --max-missing ~{max_missing} --hwe ~{hwe} --recode -c | bgzip -c > ~{output_vcf_basename}.vcf.gz + vcftools --gzvcf input.vcf.gz --max-missing ~{max_missing} --hwe ~{hwe} --recode -c | bgzip -c > ~{output_vcf_basename}.vcf.gz bcftools index -t ~{output_vcf_basename}.vcf.gz # Note: this is necessary because vcftools doesn't have a way to output a zipped vcf, nor a way to index one (hence needing to use bcf). 
>>> runtime { From 408e9e2cf4c678333e6f4e0ec0fddd0a33a27c82 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 25 Feb 2025 10:56:55 -0500 Subject: [PATCH 83/92] fix fail task dependency, revert attempt to ignore newline in diff, other pr comments --- pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl | 3 ++- scripts/validate_release.sh | 3 +-- tasks/broad/ImputationBeagleTasks.wdl | 2 ++ tasks/broad/ImputationTasks.wdl | 1 - tasks/broad/Utilities.wdl | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl index 342f1949f5..64d058b965 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl @@ -143,7 +143,8 @@ workflow ImputationBeagle { start = start[i], end = end[i], cpu = beagle_cpu, - memory_mb = beagle_phase_memory_in_gb * 1024 + memory_mb = beagle_phase_memory_in_gb * 1024, + for_dependency = FailQCNChunks.done } call beagleTasks.Impute { diff --git a/scripts/validate_release.sh b/scripts/validate_release.sh index 036ea41445..b9277c8498 100755 --- a/scripts/validate_release.sh +++ b/scripts/validate_release.sh @@ -13,8 +13,7 @@ source $( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )/common. function check_if_file_changed() { local -r file=${1} commit=${2} # the result is flipped because git diff-index returns 0 if the file is not changed - # ignore whitespace changes (-w) - ! git diff-index -w --quiet ${commit} ${file} + ! 
git diff-index --quiet ${commit} ${file} } function get_version_from_workflow() { diff --git a/tasks/broad/ImputationBeagleTasks.wdl b/tasks/broad/ImputationBeagleTasks.wdl index 123090f768..daec3fea80 100644 --- a/tasks/broad/ImputationBeagleTasks.wdl +++ b/tasks/broad/ImputationBeagleTasks.wdl @@ -82,6 +82,8 @@ task Phase { Int memory_mb = 32000 # value depends on chunk size, the number of samples in ref and target panel, and whether imputation is performed Int xmx_mb = memory_mb - 5000 # I suggest setting this parameter to be 85-90% of the memory_mb parameter Int disk_size_gb = ceil(3 * size([dataset_vcf, ref_panel_bref3], "GiB")) + 10 # value may need to be adjusted + + Boolean for_dependency # used for task dependency management } command <<< diff --git a/tasks/broad/ImputationTasks.wdl b/tasks/broad/ImputationTasks.wdl index 1fed4d6972..1a89954535 100644 --- a/tasks/broad/ImputationTasks.wdl +++ b/tasks/broad/ImputationTasks.wdl @@ -756,7 +756,6 @@ task ExtractIDs { String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889" Int cpu = 1 Int memory_mb = 4000 - Boolean for_dependency = true } command <<< bcftools query -f "%ID\n" ~{vcf} -o ~{output_basename}.ids.txt diff --git a/tasks/broad/Utilities.wdl b/tasks/broad/Utilities.wdl index 52121a74a9..e6a1aeec17 100644 --- a/tasks/broad/Utilities.wdl +++ b/tasks/broad/Utilities.wdl @@ -300,4 +300,4 @@ task GetValidationInputs { Array[String] results_files = read_lines("results_files.txt") } -} +} \ No newline at end of file From 1546ad77bcceed74b24846b74bda4619a86567af Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Tue, 25 Feb 2025 12:56:57 -0500 Subject: [PATCH 84/92] update README for ImputationBeagle --- pipelines/broad/arrays/imputation_beagle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/README.md b/pipelines/broad/arrays/imputation_beagle/README.md index 67f5421d93..6fb1f1c4b3 100644 --- 
a/pipelines/broad/arrays/imputation_beagle/README.md +++ b/pipelines/broad/arrays/imputation_beagle/README.md @@ -1,6 +1,6 @@ ### ImputationBeagle summary -The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using the [Beagle imputation tool](https://faculty.washington.edu/browning/beagle/beagle.html) and a large genomic reference panel. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. +The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using the [Beagle imputation tool](https://faculty.washington.edu/browning/beagle/beagle.html) and a large genomic reference panel. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. This pipeline was created for use by the All of Us/AnVIL Imputation Service ### ArrayImputationQuotaConsumed summary From dbb059f0b676d3e1f700560b4cc52fd7b78b9209 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 26 Feb 2025 08:07:15 -0500 Subject: [PATCH 85/92] rename test files --- .../{test_data_NA12878_x10.json => NA12878_x10_hg38_arrays.json} | 0 .../{test_data_NA12878_x10.json => NA12878_x10_hg38_arrays.json} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/{test_data_NA12878_x10.json => NA12878_x10_hg38_arrays.json} (100%) rename pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/{test_data_NA12878_x10.json => NA12878_x10_hg38_arrays.json} (100%) diff --git a/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x10.json b/pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/NA12878_x10_hg38_arrays.json similarity index 100% rename from pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/test_data_NA12878_x10.json rename to pipelines/broad/arrays/imputation_beagle/test_inputs/Plumbing/NA12878_x10_hg38_arrays.json diff --git 
a/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x10.json b/pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/NA12878_x10_hg38_arrays.json similarity index 100% rename from pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/test_data_NA12878_x10.json rename to pipelines/broad/arrays/imputation_beagle/test_inputs/Scientific/NA12878_x10_hg38_arrays.json From d980d0084047d9f121cf984b94f11ebb4a5535f8 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 26 Feb 2025 13:07:41 +0000 Subject: [PATCH 86/92] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 68 +++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 294b90c791..6ea703b2de 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -1,42 +1,42 @@ Pipeline Name Version Date of Last Commit -snm3C 4.0.4 2024-08-06 -BuildIndices 4.0.0 2025-01-17 -scATAC 1.3.2 2023-08-03 -MultiSampleSmartSeq2SingleNucleus 2.0.8 2025-02-12 -atac 2.7.1 2025-02-12 -Optimus 7.9.2 2025-02-12 -SmartSeq2SingleSample 5.1.21 2024-09-11 -Multiome 5.11.0 2025-02-05 -PairedTag 1.10.2 2025-02-06 -SlideSeq 3.4.9 2025-02-12 -MultiSampleSmartSeq2 2.2.22 2024-09-11 -AnnotationFiltration 1.2.7 2024-11-04 -RNAWithUMIsPipeline 1.0.18 2024-11-04 -Imputation 1.1.16 2025-02-24 +Arrays 2.6.30 2024-11-04 +ValidateChip 1.16.7 2024-11-04 ArrayImputationQuotaConsumed 1.0.0 2025-02-24 ImputationBeagle 1.0.0 2025-02-24 -Arrays 2.6.30 2024-11-04 +Imputation 1.1.16 2025-02-24 MultiSampleArrays 1.6.2 2024-08-02 -ValidateChip 1.16.7 2024-11-04 -JointGenotyping 1.7.2 2024-11-04 +WholeGenomeReprocessing 3.3.3 2024-11-04 +ExomeReprocessing 3.3.3 2024-11-04 +CramToUnmappedBams 1.1.3 2024-08-02 +ExternalWholeGenomeReprocessing 2.3.3 2024-11-04 +ExternalExomeReprocessing 3.3.3 2024-11-04 +BroadInternalArrays 1.1.14 2024-11-04 +BroadInternalImputation 
1.1.15 2025-02-24 +BroadInternalRNAWithUMIs 1.0.36 2024-11-04 +BroadInternalUltimaGenomics 1.1.3 2024-12-05 +RNAWithUMIsPipeline 1.0.18 2024-11-04 +IlluminaGenotypingArray 1.12.24 2024-11-04 +AnnotationFiltration 1.2.7 2024-11-04 +UltimaGenomicsWholeGenomeCramOnly 1.0.23 2024-11-04 +GDCWholeGenomeSomaticSingleSample 1.3.4 2024-11-04 +UltimaGenomicsWholeGenomeGermline 1.1.3 2024-12-05 +WholeGenomeGermlineSingleSample 3.3.3 2024-11-04 +ExomeGermlineSingleSample 3.2.3 2024-11-04 +VariantCalling 2.2.4 2024-11-04 ReblockGVCF 2.4.0 2024-12-05 UltimaGenomicsJointGenotyping 1.2.2 2024-11-04 -JointGenotypingByChromosomePartTwo 1.5.2 2024-11-04 JointGenotypingByChromosomePartOne 1.5.2 2024-11-04 -ExomeGermlineSingleSample 3.2.3 2024-11-04 -WholeGenomeGermlineSingleSample 3.3.3 2024-11-04 -UltimaGenomicsWholeGenomeGermline 1.1.3 2024-12-05 -VariantCalling 2.2.4 2024-11-04 -GDCWholeGenomeSomaticSingleSample 1.3.4 2024-11-04 -UltimaGenomicsWholeGenomeCramOnly 1.0.23 2024-11-04 +JointGenotypingByChromosomePartTwo 1.5.2 2024-11-04 +JointGenotyping 1.7.2 2024-11-04 CheckFingerprint 1.0.22 2024-10-28 -BroadInternalRNAWithUMIs 1.0.36 2024-11-04 -BroadInternalImputation 1.1.15 2025-02-24 -BroadInternalArrays 1.1.14 2024-11-04 -BroadInternalUltimaGenomics 1.1.3 2024-12-05 -IlluminaGenotypingArray 1.12.24 2024-11-04 -ExternalExomeReprocessing 3.3.3 2024-11-04 -ExternalWholeGenomeReprocessing 2.3.3 2024-11-04 -ExomeReprocessing 3.3.3 2024-11-04 -CramToUnmappedBams 1.1.3 2024-08-02 -WholeGenomeReprocessing 3.3.3 2024-11-04 +scATAC 1.3.2 2023-08-03 +MultiSampleSmartSeq2 2.2.22 2024-09-11 +BuildIndices 4.0.0 2025-01-17 +SlideSeq 3.4.9 2025-02-12 +PairedTag 1.10.2 2025-02-06 +MultiSampleSmartSeq2SingleNucleus 2.0.8 2025-02-12 +atac 2.7.1 2025-02-12 +snm3C 4.0.4 2024-08-06 +SmartSeq2SingleSample 5.1.21 2024-09-11 +Optimus 7.9.2 2025-02-12 +Multiome 5.11.0 2025-02-05 From 06b3d1fae6cb470aa92ebdf0218ddc63c320eedb Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 26 Feb 2025 08:18:57 
-0500 Subject: [PATCH 87/92] another commit for hashes --- pipelines/broad/arrays/imputation_beagle/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/README.md b/pipelines/broad/arrays/imputation_beagle/README.md index 6fb1f1c4b3..754e416b5a 100644 --- a/pipelines/broad/arrays/imputation_beagle/README.md +++ b/pipelines/broad/arrays/imputation_beagle/README.md @@ -1,6 +1,6 @@ ### ImputationBeagle summary -The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using the [Beagle imputation tool](https://faculty.washington.edu/browning/beagle/beagle.html) and a large genomic reference panel. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. This pipeline was created for use by the All of Us/AnVIL Imputation Service +The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using the [Beagle imputation tool](https://faculty.washington.edu/browning/beagle/beagle.html) and a large genomic reference panel. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. This pipeline was created for use by the All of Us/AnVIL Imputation Service. 
### ArrayImputationQuotaConsumed summary From fd78fd3b5b6582dbd048a39c52bdea6d621ed7ac Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 26 Feb 2025 19:53:25 +0000 Subject: [PATCH 88/92] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index f226fcd77e..87117aa799 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -1,18 +1,4 @@ Pipeline Name Version Date of Last Commit -snm3C 4.0.4 2024-08-06 -BuildIndices 4.0.0 2025-01-17 -scATAC 1.3.2 2023-08-03 -MultiSampleSmartSeq2SingleNucleus 2.0.8 2025-02-25 -atac 2.7.1 2025-02-25 -Optimus 7.9.2 2025-02-25 -SmartSeq2SingleSample 5.1.21 2024-09-11 -Multiome 5.11.0 2025-02-25 -PairedTag 1.10.2 2025-02-25 -SlideSeq 3.4.9 2025-02-25 -MultiSampleSmartSeq2 2.2.22 2024-09-11 -AnnotationFiltration 1.2.7 2024-11-04 -RNAWithUMIsPipeline 1.0.18 2024-11-04 -Imputation 1.1.15 2024-11-04 Arrays 2.6.30 2024-11-04 ValidateChip 1.16.7 2024-11-04 ArrayImputationQuotaConsumed 1.0.0 2025-02-24 @@ -46,11 +32,11 @@ CheckFingerprint 1.0.22 2024-10-28 scATAC 1.3.2 2023-08-03 MultiSampleSmartSeq2 2.2.22 2024-09-11 BuildIndices 4.0.0 2025-01-17 -SlideSeq 3.4.9 2025-02-12 -PairedTag 1.10.2 2025-02-06 -MultiSampleSmartSeq2SingleNucleus 2.0.8 2025-02-12 -atac 2.7.1 2025-02-12 +SlideSeq 3.4.9 2025-02-25 +PairedTag 1.10.2 2025-02-25 +MultiSampleSmartSeq2SingleNucleus 2.0.8 2025-02-25 +atac 2.7.1 2025-02-25 snm3C 4.0.4 2024-08-06 SmartSeq2SingleSample 5.1.21 2024-09-11 -Optimus 7.9.2 2025-02-12 -Multiome 5.11.0 2025-02-05 +Optimus 7.9.2 2025-02-25 +Multiome 5.11.0 2025-02-25 From e795792dfc66e1d660f5a6547694a255b2c7efb2 Mon Sep 17 00:00:00 2001 From: npetrill Date: Wed, 26 Feb 2025 14:54:23 -0500 Subject: [PATCH 89/92] dummy commit --- .github/workflows/warp_test_workflow.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/warp_test_workflow.yml b/.github/workflows/warp_test_workflow.yml index 7bbded37aa..95ca7542fe 100644 --- a/.github/workflows/warp_test_workflow.yml +++ b/.github/workflows/warp_test_workflow.yml @@ -188,6 +188,7 @@ jobs: echo "Starting hash comparison with retry mechanism..." + while [ $TOTAL_WAITED -lt $MAX_WAIT_TIME ]; do echo "Fetching Dockstore Commit Hash..." DOCKSTORE_COMMIT_HASH=$(python scripts/dockstore_api/fetch_dockstore_commit.py \ From 1fd999ff53936765cd02411a286b40ed52829d37 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 26 Feb 2025 16:05:53 -0500 Subject: [PATCH 90/92] pr comments --- .../imputation_beagle/ImputationBeagle.changelog.md | 5 ++--- tasks/broad/ImputationBeagleTasks.wdl | 12 ++++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md index a383a0cf10..ddc7604697 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeagle.changelog.md @@ -1,6 +1,5 @@ # 1.0.0 -2025-02-24 (Date of Last Commit) - -* * Initial public release of the ImputationBeagle pipeline. +2025-02-26 (Date of Last Commit) +* Initial public release of the ImputationBeagle pipeline. * The ImputationBeagle pipeline imputes missing genotypes from a multi-sample VCF using a large genomic reference panel. It is based on the Michigan Imputation Server pipeline but uses the Beagle imputation tool instead of minimac. Overall, the pipeline filters, phases, and performs imputation on a multi-sample VCF. It outputs the imputed VCF. 
diff --git a/tasks/broad/ImputationBeagleTasks.wdl b/tasks/broad/ImputationBeagleTasks.wdl index daec3fea80..af4363e600 100644 --- a/tasks/broad/ImputationBeagleTasks.wdl +++ b/tasks/broad/ImputationBeagleTasks.wdl @@ -73,9 +73,9 @@ task Phase { File ref_panel_bref3 File genetic_map_file String basename - String chrom # not needed if ref file has been chunked and you are using the entire chunk - Int start # not needed if ref file has been chunked and you are using the entire chunk - Int end # not needed if ref file has been chunked and you are using the entire chunk + String chrom + Int start + Int end String beagle_docker = "us.gcr.io/broad-gotc-prod/imputation-beagle:1.0.0-17Dec24.224-1740423035" Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. Changing this value may change the output generated by the tool @@ -120,9 +120,9 @@ task Impute { File ref_panel_bref3 File genetic_map_file String basename - String chrom # not needed if ref file has been chunked and you are using the entire chunk - Int start # not needed if ref file has been chunked and you are using the entire chunk - Int end # not needed if ref file has been chunked and you are using the entire chunk + String chrom + Int start + Int end String beagle_docker = "us.gcr.io/broad-gotc-prod/imputation-beagle:1.0.0-17Dec24.224-1740423035" Int cpu = 8 # This parameter is used as the nthreads input to Beagle which is part of how we make it determinstic. 
Changing this value may change the output generated by the tool From b216840dfeb2489c6fe6d85bb4846a2c671f6858 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 26 Feb 2025 22:04:13 +0000 Subject: [PATCH 91/92] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 87117aa799..bc8fa78585 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -2,7 +2,7 @@ Pipeline Name Version Date of Last Commit Arrays 2.6.30 2024-11-04 ValidateChip 1.16.7 2024-11-04 ArrayImputationQuotaConsumed 1.0.0 2025-02-24 -ImputationBeagle 1.0.0 2025-02-24 +ImputationBeagle 1.0.0 2025-02-26 Imputation 1.1.16 2025-02-24 MultiSampleArrays 1.6.2 2024-08-02 WholeGenomeReprocessing 3.3.3 2024-11-04 From 43b48ed9b807546da0a05a8e293797199eb0a723 Mon Sep 17 00:00:00 2001 From: Morgan Taylor Date: Wed, 26 Feb 2025 17:49:15 -0500 Subject: [PATCH 92/92] dummy commit --- .github/workflows/warp_test_workflow.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/warp_test_workflow.yml b/.github/workflows/warp_test_workflow.yml index 95ca7542fe..7bbded37aa 100644 --- a/.github/workflows/warp_test_workflow.yml +++ b/.github/workflows/warp_test_workflow.yml @@ -188,7 +188,6 @@ jobs: echo "Starting hash comparison with retry mechanism..." - while [ $TOTAL_WAITED -lt $MAX_WAIT_TIME ]; do echo "Fetching Dockstore Commit Hash..." DOCKSTORE_COMMIT_HASH=$(python scripts/dockstore_api/fetch_dockstore_commit.py \