diff --git a/.dockstore.yml b/.dockstore.yml index 4ce4927..8925a13 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -129,3 +129,8 @@ workflows: primaryDescriptorPath: /PECGS-QUICviz/QUICviz.wdl testParameterFiles: - /PECGS-QUICviz/QUICviz.inputs.json + - name: BroadInternalRNAWithUMIs + subclass: WDL + primaryDescriptorPath: /TCapRNAPipeline/BroadInternalRNAWithUMIs.wdl + testParameterFiles: + - /TCapRNAPipeline/BroadInternalRNAWithUMIs.inputs.json \ No newline at end of file diff --git a/TCapRNAPipeline/BroadInternalRNAWithUMIs.inputs.json b/TCapRNAPipeline/BroadInternalRNAWithUMIs.inputs.json new file mode 100644 index 0000000..d1fd0fa --- /dev/null +++ b/TCapRNAPipeline/BroadInternalRNAWithUMIs.inputs.json @@ -0,0 +1 @@ +{"BroadInternalRNAWithUMIs.environment":"prod","BroadInternalRNAWithUMIs.library_name":"${this.library_name}","BroadInternalRNAWithUMIs.output_basename":"${this.collaborator_sample_id}","BroadInternalRNAWithUMIs.platform":"${this.platform}","BroadInternalRNAWithUMIs.platform_unit":"${this.platform_unit}","BroadInternalRNAWithUMIs.r1_fastq":"${this.fastq1}","BroadInternalRNAWithUMIs.r2_fastq":"${this.fastq2}","BroadInternalRNAWithUMIs.read1Structure":"${this.read1Structure}","BroadInternalRNAWithUMIs.read2Structure":"${this.read2Structure}","BroadInternalRNAWithUMIs.read_group_name":"${this.read_group_name}","BroadInternalRNAWithUMIs.reference_build":"${this.reference_build}","BroadInternalRNAWithUMIs.sample_lsid":"${this.sample_lsid}","BroadInternalRNAWithUMIs.sequencing_center":"${this.sequencing_center}","BroadInternalRNAWithUMIs.tdr_dataset_uuid":"${}","BroadInternalRNAWithUMIs.tdr_sample_id":"${}","BroadInternalRNAWithUMIs.vault_token_path":"gs://broad-dsp-gotc-arrays-prod-tokens/arrayswdl.token"} \ No newline at end of file diff --git a/TCapRNAPipeline/BroadInternalRNAWithUMIs.wdl b/TCapRNAPipeline/BroadInternalRNAWithUMIs.wdl new file mode 100644 index 0000000..1447d43 --- /dev/null +++ b/TCapRNAPipeline/BroadInternalRNAWithUMIs.wdl @@ -0,0 +1,205 @@ +version 1.0 + +import "./subworkflows/RNAWithUMIsPipeline.wdl" as RNAWithUMIs +import "./subworkflows/CheckFingerprint.wdl" as FP +import "./subworkflows/RNAWithUMIsTasks.wdl" as tasks +import "./subworkflows/Utilities.wdl" as utils + +workflow BroadInternalRNAWithUMIs { + + String pipeline_version = "1.0.33" + + input { + # input needs to be either "hg19" or "hg38" + String reference_build + + String sample_lsid + + # RNAWithUMIs inputs + File r1_fastq + File r2_fastq + String read1Structure + String read2Structure + String output_basename + + String platform + String library_name + String platform_unit + String read_group_name + String sequencing_center = "BI" + + # Terra Data Repo dataset information + String? tdr_dataset_uuid + String? 
tdr_sample_id + + String environment + File vault_token_path + } + + File ref = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta" + File refIndex = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta.fai" + File refDict = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.dict" + File haplotype_database_file = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.haplotype_database.txt" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.haplotype_database.txt" + File refFlat = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.refFlat.txt" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/hg38_GENCODE_v34_refFlat.txt" + File starIndex = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz" + File gtf = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf" + File ribosomalIntervals = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.rRNA.interval_list" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode_v34_rRNA.interval_list" + File exonBedFile = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.hg19.insert_size_intervals_geq1000bp.bed" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.GRCh38.insert_size_intervals_geq1000bp.bed" + File population_vcf = if (reference_build == "hg19") then "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf" else "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz" + File population_vcf_index = if (reference_build == "hg19") then "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf.idx" else "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz.tbi" + + parameter_meta { + reference_build: "String used to define the reference genome build; should be set to 'hg19' or 'hg38'" + sample_lsid: "The sample lsid (an identifier used to retrieve fingerrints from Mercury)" + r1_fastq: "Read 1 FASTQ file" + r2_fastq: "Read 2 FASTQ file" + read1Structure: "String describing how the bases in a sequencing run should be 
allocated into logical reads for read 1" + read2Structure: "String describing how the bases in a sequencing run should be allocated into logical reads for read 2" + output_basename: "String used as a prefix in workflow output files" + platform: "String used to describe the sequencing platform" + library_name: "String used to describe the library" + platform_unit: "String used to describe the platform unit" + read_group_name: "String used to describe the read group name" + sequencing_center: "String used to describe the sequencing center; default is set to 'BI'" + environment: "The environment (dev or prod) used for determining which service to use to retrieve Mercury fingerprints" + vault_token_path: "The path to the vault token used for accessing the Mercury Fingerprint Store" + tdr_dataset_uuid: "Optional string used to define the Terra Data Repo (TDR) dataset to which outputs will be ingested" + tdr_sample_id: "Optional string used to identify the sample being processed; this must be the primary key in the TDR dataset" + } + + # make sure either hg19 or hg38 is supplied as reference_build input + if ((reference_build != "hg19") && (reference_build != "hg38")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "reference_build must be supplied with either 'hg19' or 'hg38'." + } + } + + call RNAWithUMIs.RNAWithUMIsPipeline as RNAWithUMIs { + input: + r1_fastq = r1_fastq, + r2_fastq = r2_fastq, + read1Structure = read1Structure, + read2Structure = read2Structure, + starIndex = starIndex, + output_basename = output_basename, + gtf = gtf, + platform = platform, + library_name = library_name, + platform_unit = platform_unit, + read_group_name = read_group_name, + sequencing_center = sequencing_center, + ref = ref, + refIndex = refIndex, + refDict = refDict, + refFlat = refFlat, + ribosomalIntervals = ribosomalIntervals, + exonBedFile = exonBedFile, + population_vcf = population_vcf, + population_vcf_index = population_vcf_index + } + + call FP.CheckFingerprint as CheckFingerprint { + input: + input_bam = RNAWithUMIs.output_bam, + input_bam_index = RNAWithUMIs.output_bam_index, + sample_alias = RNAWithUMIs.sample_name, + sample_lsid = sample_lsid, + output_basename = output_basename, + ref_fasta = ref, + ref_fasta_index = refIndex, + ref_dict = refDict, + read_fingerprint_from_mercury = true, + haplotype_database_file = haplotype_database_file, + environment = environment, + vault_token_path = vault_token_path, + allow_lod_zero = true + } + + call tasks.MergeMetrics { + input: + alignment_summary_metrics = RNAWithUMIs.picard_alignment_summary_metrics, + insert_size_metrics = RNAWithUMIs.picard_insert_size_metrics, + picard_rna_metrics = RNAWithUMIs.picard_rna_metrics, + duplicate_metrics = RNAWithUMIs.duplicate_metrics, + rnaseqc2_metrics = RNAWithUMIs.rnaseqc2_metrics, + fingerprint_summary_metrics = CheckFingerprint.fingerprint_summary_metrics_file, + output_basename = RNAWithUMIs.sample_name + } + + if (defined(tdr_dataset_uuid) && defined(tdr_sample_id)) { + call tasks.formatPipelineOutputs { + input: + sample_id = select_first([tdr_sample_id, ""]), + transcriptome_bam = RNAWithUMIs.transcriptome_bam, + transcriptome_duplicate_metrics = RNAWithUMIs.transcriptome_duplicate_metrics, + output_bam = RNAWithUMIs.output_bam, + output_bam_index = RNAWithUMIs.output_bam_index, + duplicate_metrics = RNAWithUMIs.duplicate_metrics, + rnaseqc2_gene_tpm = RNAWithUMIs.rnaseqc2_gene_tpm, + rnaseqc2_gene_counts = RNAWithUMIs.rnaseqc2_gene_counts, + rnaseqc2_exon_counts = 
RNAWithUMIs.rnaseqc2_exon_counts, + rnaseqc2_fragment_size_histogram = RNAWithUMIs.rnaseqc2_fragment_size_histogram, + rnaseqc2_metrics = RNAWithUMIs.rnaseqc2_metrics, + picard_rna_metrics = RNAWithUMIs.picard_rna_metrics, + picard_alignment_summary_metrics = RNAWithUMIs.picard_alignment_summary_metrics, + picard_insert_size_metrics = RNAWithUMIs.picard_insert_size_metrics, + picard_insert_size_histogram = RNAWithUMIs.picard_insert_size_histogram, + picard_base_distribution_by_cycle_metrics = RNAWithUMIs.picard_base_distribution_by_cycle_metrics, + picard_base_distribution_by_cycle_pdf = RNAWithUMIs.picard_base_distribution_by_cycle_pdf, + picard_quality_by_cycle_metrics = RNAWithUMIs.picard_quality_by_cycle_metrics, + picard_quality_by_cycle_pdf = RNAWithUMIs.picard_quality_by_cycle_pdf, + picard_quality_distribution_metrics = RNAWithUMIs.picard_quality_distribution_metrics, + picard_quality_distribution_pdf = RNAWithUMIs.picard_quality_distribution_pdf, + picard_fingerprint_summary_metrics = CheckFingerprint.fingerprint_summary_metrics_file, + picard_fingerprint_detail_metrics = CheckFingerprint.fingerprint_detail_metrics_file, + unified_metrics = MergeMetrics.unified_metrics, + contamination = RNAWithUMIs.contamination, + contamination_error = RNAWithUMIs.contamination_error, + fastqc_html_report = RNAWithUMIs.fastqc_html_report, + fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter + } + + call tasks.updateOutputsInTDR { + input: + tdr_dataset_uuid = select_first([tdr_dataset_uuid, ""]), + outputs_json = formatPipelineOutputs.pipeline_outputs_json + } + } + + output { + File transcriptome_bam = RNAWithUMIs.transcriptome_bam + File output_bam = RNAWithUMIs.output_bam + File output_bam_index = RNAWithUMIs.output_bam_index + + File duplicate_metrics = RNAWithUMIs.duplicate_metrics + File transcriptome_duplicate_metrics = RNAWithUMIs.transcriptome_duplicate_metrics + + File rnaseqc2_gene_tpm = RNAWithUMIs.rnaseqc2_gene_tpm + File rnaseqc2_gene_counts = RNAWithUMIs.rnaseqc2_gene_counts + File rnaseqc2_exon_counts = RNAWithUMIs.rnaseqc2_exon_counts + File rnaseqc2_fragment_size_histogram = RNAWithUMIs.rnaseqc2_fragment_size_histogram + File rnaseqc2_metrics = RNAWithUMIs.rnaseqc2_metrics + File picard_rna_metrics = RNAWithUMIs.picard_rna_metrics + File picard_alignment_summary_metrics = RNAWithUMIs.picard_alignment_summary_metrics + File picard_insert_size_metrics = RNAWithUMIs.picard_insert_size_metrics + File picard_insert_size_histogram = RNAWithUMIs.picard_insert_size_histogram + File picard_base_distribution_by_cycle_metrics = RNAWithUMIs.picard_base_distribution_by_cycle_metrics + File picard_base_distribution_by_cycle_pdf = RNAWithUMIs.picard_base_distribution_by_cycle_pdf + File picard_quality_by_cycle_metrics = RNAWithUMIs.picard_quality_by_cycle_metrics + File picard_quality_by_cycle_pdf = RNAWithUMIs.picard_quality_by_cycle_pdf + File picard_quality_distribution_metrics = RNAWithUMIs.picard_quality_distribution_metrics + File picard_quality_distribution_pdf = RNAWithUMIs.picard_quality_distribution_pdf + File? picard_fingerprint_summary_metrics = CheckFingerprint.fingerprint_summary_metrics_file + File? 
picard_fingerprint_detail_metrics = CheckFingerprint.fingerprint_detail_metrics_file + File unified_metrics = MergeMetrics.unified_metrics + Float contamination = RNAWithUMIs.contamination + Float contamination_error = RNAWithUMIs.contamination_error + File fastqc_html_report = RNAWithUMIs.fastqc_html_report + Float fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter + } + + meta { + allowNestedInputs: true + } +} diff --git a/TCapRNAPipeline/subworkflows/CheckFingerprint.wdl b/TCapRNAPipeline/subworkflows/CheckFingerprint.wdl new file mode 100644 index 0000000..53526f2 --- /dev/null +++ b/TCapRNAPipeline/subworkflows/CheckFingerprint.wdl @@ -0,0 +1,130 @@ +version 1.0 + +import "./Utilities.wdl" as utils +import "./InternalTasks.wdl" as InternalTasks +import "./Qc.wdl" as Qc + + +## Copyright Broad Institute, 2022 +## +## This WDL pipeline implements A CheckFingerprint Task +## It runs the Picard tool 'CheckFingerprint' against a supplied input file (VCF, CRAM, BAM or SAM) using a set of 'fingerprint' genotypes. +## These genotypes can either be generated by pulling them from the (Broad-internal) Mercury Fingerprint Store or be supplied as inputs to the pipeline. + ## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +workflow CheckFingerprint { + + String pipeline_version = "1.0.20" + + input { + File? input_vcf + File? input_vcf_index + File? input_bam + File? input_bam_index + + # The name of the sample in the input_vcf. Not required if there is only one sample in the VCF + String? input_sample_alias + + # If this is true, we will read fingerprints from Mercury + # Otherwise, we will use the optional input fingerprint VCFs below + Boolean read_fingerprint_from_mercury = false + File? fingerprint_genotypes_vcf + File? fingerprint_genotypes_vcf_index + + String? sample_lsid + String sample_alias + + String output_basename + + File ref_fasta + File ref_fasta_index + File ref_dict + + File haplotype_database_file + Boolean allow_lod_zero = false + + String? environment + File? 
vault_token_path + } + + if (defined(input_vcf) && defined(input_bam)) { + call utils.ErrorWithMessage as ErrorMessageDoubleInput { + input: + message = "input_vcf and input_bam cannot both be defined as input" + } + } + + if (read_fingerprint_from_mercury && (!defined(sample_lsid) || !defined(environment) || !defined(vault_token_path))) { + call utils.ErrorWithMessage as ErrorMessageIncompleteForReadingFromMercury { + input: + message = "sample_lsid, environment, and vault_token_path must defined when reading from Mercury" + } + } + + # sample_alias may contain spaces, so make a filename-safe version for the downloaded fingerprint file + call InternalTasks.MakeSafeFilename { + input: + name = sample_alias + } + + if (read_fingerprint_from_mercury) { + call InternalTasks.DownloadGenotypes { + input: + sample_alias = sample_alias, + sample_lsid = select_first([sample_lsid]), + output_vcf_base_name = MakeSafeFilename.output_safe_name + ".reference.fingerprint", + haplotype_database_file = haplotype_database_file, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + environment = select_first([environment]), + vault_token_path = select_first([vault_token_path]) + } + } + + Boolean fingerprint_downloaded_from_mercury = select_first([DownloadGenotypes.fingerprint_retrieved, false]) + + File? fingerprint_vcf_to_use = if (fingerprint_downloaded_from_mercury) then DownloadGenotypes.reference_fingerprint_vcf else fingerprint_genotypes_vcf + File? fingerprint_vcf_index_to_use = if (fingerprint_downloaded_from_mercury) then DownloadGenotypes.reference_fingerprint_vcf_index else fingerprint_genotypes_vcf_index + + if ((defined(fingerprint_vcf_to_use)) && (defined(input_vcf) || defined(input_bam))) { + call Qc.CheckFingerprintTask { + input: + input_bam = input_bam, + input_bam_index = input_bam_index, + input_vcf = input_vcf, + input_vcf_index = input_vcf_index, + input_sample_alias = input_sample_alias, + genotypes = select_first([fingerprint_vcf_to_use]), + genotypes_index = fingerprint_vcf_index_to_use, + expected_sample_alias = sample_alias, + output_basename = output_basename, + haplotype_database_file = haplotype_database_file, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + allow_lod_zero = allow_lod_zero + } + } + + output { + Boolean fingerprint_read_from_mercury = fingerprint_downloaded_from_mercury + File? reference_fingerprint_vcf = fingerprint_vcf_to_use + File? reference_fingerprint_vcf_index = fingerprint_vcf_index_to_use + File? fingerprint_summary_metrics_file = CheckFingerprintTask.summary_metrics + File? fingerprint_detail_metrics_file = CheckFingerprintTask.detail_metrics + Float? lod_score = CheckFingerprintTask.lod + } + meta { + allowNestedInputs: true + } +} diff --git a/TCapRNAPipeline/subworkflows/InternalTasks.wdl b/TCapRNAPipeline/subworkflows/InternalTasks.wdl new file mode 100644 index 0000000..b3bef73 --- /dev/null +++ b/TCapRNAPipeline/subworkflows/InternalTasks.wdl @@ -0,0 +1,202 @@ +version 1.0 + +# Tasks that are expected to be used only on Broad Infrastructure + +# This task is to convert a string into another string which is 'safe' as a filename. +# That is, it does not contain any number of odd characters. +# Note that we are not testing for the occurrence of a single quote (') in the string. 
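+# The command in this task pipes the name through tr, replacing spaces and the listed shell
+# metacharacters with underscores so the result can be embedded safely in downstream file names.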
+task MakeSafeFilename { + input { + String name + } + + String safe_name = name + + command <<< + echo '~{name}' | tr ' "#$%&*/:;<=>?@[]^{}|~\\()' '_' > safe_name.txt + >>> + + runtime { + docker: "gcr.io/gcp-runtimes/ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" + disks: "local-disk 10 HDD" + memory: "1 GiB" + preemptible: 3 + } + output { + String output_safe_name = read_string('safe_name.txt') + } +} + +task DownloadGenotypes { + input { + String sample_alias + String sample_lsid + String output_vcf_base_name + Boolean compress = true + + String? ignoreSpecificGenotypesLsid + String? ignoreSpecificGenotypesPlatform + + File haplotype_database_file + + File ref_fasta + File ref_fasta_index + File ref_dict + + String environment + File vault_token_path + + Int? max_retries + Int? preemptible_tries + } + + parameter_meta { + ignoreSpecificGenotypesLsid: "Optional, MUST be specified with ignoreSpecificGenotypesPlatform. If both are defined will not download fingerprints for an LSID run on a specific genotyping platform. The intent is to ignore its own fingerprint" + ignoreSpecificGenotypesPlatform: "Optional, MUST be specified with ignoreSpecificGenotypesLsid. If both are defined will not download fingerprints for an LSID run on a specific genotyping platform. The intent is to ignore its own fingerprint" + } + + meta { + volatile: true + } + + String fp_retrieved_file = "fp_retrieved.txt" + + String output_vcf = output_vcf_base_name + if compress then ".vcf.gz" else".vcf" + String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" + + command <<< + + export VAULT_ADDR=https://clotho.broadinstitute.org:8200 + export VAULT_TOKEN=$(cat ~{vault_token_path}) + if [ ~{environment} == prod ]; then + export MERCURY_AUTH_KEY=secret/dsde/gotc/prod/wdl/secrets + export MERCURY_FP_STORE_URI=https://portals.broadinstitute.org/portal/mercury-ws/fingerprint + else + export MERCURY_AUTH_KEY=secret/dsde/gotc/dev/wdl/secrets + export MERCURY_FP_STORE_URI=https://portals.broadinstitute.org/portal-test/mercury-ws/fingerprint + fi + + exit_code=0 + + # TODO - there is a bug in DownloadGenotypes - we should NOT have to set EXPECTED_GENOTYPING_PLATFORMS here + # it * should * default to all of them. + + java -Xms2000m -Xmx3000m -Dpicard.useLegacyParser=false -jar /usr/gitc/picard-private.jar \ + DownloadGenotypes \ + --SAMPLE_ALIAS "~{sample_alias}" \ + --SAMPLE_LSID "~{sample_lsid}" \ + --OUTPUT "~{output_vcf}" \ + --CREATE_INDEX true \ + --REFERENCE_SEQUENCE ~{ref_fasta} \ + --HAPLOTYPE_MAP ~{haplotype_database_file} \ + --EXPECTED_GENOTYPING_PLATFORMS FLUIDIGM \ + --EXPECTED_GENOTYPING_PLATFORMS GENERAL_ARRAY \ + ~{if defined(ignoreSpecificGenotypesLsid) then "--IGNORE_SPECIFIC_GENOTYPES_LSID \"" + ignoreSpecificGenotypesLsid + "\"" else ""} \ + ~{if defined(ignoreSpecificGenotypesPlatform) then "--IGNORE_SPECIFIC_GENOTYPES_PLATFORM \"" + ignoreSpecificGenotypesPlatform + "\"" else ""} \ + --MERCURY_FP_STORE_URI $MERCURY_FP_STORE_URI \ + --CREDENTIALS_VAULT_PATH $MERCURY_AUTH_KEY \ + --ERR_NO_GENOTYPES_AVAILABLE 7 + exit_code=$? + + if [ $exit_code -eq 0 ]; then + echo "true" > ~{fp_retrieved_file} + elif [ $exit_code -eq 7 ]; then + # Exit code from DownloadGenotypes if no fingerprints were found. + # Treat this as a normal condition, but set a variable to indicate no fingerprints available. + # Create empty file so that it exists. 
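+      # Resetting exit_code to 0 lets the task exit successfully, and the touch commands below create
+      # empty placeholder VCF/index files so the task's declared File outputs exist even though no
+      # fingerprint was downloaded.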
+ exit_code=0 + echo "Found no fingerprints for ~{sample_lsid}" + echo "false" > ~{fp_retrieved_file} + touch ~{output_vcf} + touch ~{output_vcf_index} + else + echo "false" > ~{fp_retrieved_file} + fi + + exit $exit_code + >>> + + runtime { + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.1.3-1652895718" + memory: "3500 MiB" + maxRetries: select_first([max_retries, 2]) + preemptible: select_first([preemptible_tries, 3]) + } + + output { + Boolean fingerprint_retrieved = read_boolean(fp_retrieved_file) + File reference_fingerprint_vcf = output_vcf + File reference_fingerprint_vcf_index = output_vcf_index + } +} + +task UploadFingerprintToMercury { + input { + File fingerprint_json_file + File gtc_file + + String environment + File vault_token_path + + Int? max_retries + Int? preemptible_tries + } + + command <<< + set -eo pipefail + + export VAULT_ADDR=https://clotho.broadinstitute.org:8200 + export VAULT_TOKEN=$(cat ~{vault_token_path}) + if [ ~{environment} == prod ]; then + export MERCURY_AUTH_KEY=secret/dsde/gotc/prod/wdl/secrets + export MERCURY_FP_STORE_URI=https://portals.broadinstitute.org/portal/mercury-ws/fingerprint + else + export MERCURY_AUTH_KEY=secret/dsde/gotc/dev/wdl/secrets + export MERCURY_FP_STORE_URI=https://portals.broadinstitute.org/portal-test/mercury-ws/fingerprint + fi + + du -k ~{gtc_file} | cut -f 1 > size.txt + + # TODO -Fix UploadFingerprintToMercury so I don't need to pass a file size + + java -Xms2000m -Xmx3000m -Dpicard.useLegacyParser=false -jar /usr/gitc/picard-private.jar \ + UploadFingerprintToMercury \ + --INPUT "~{fingerprint_json_file}" \ + --GTC_FILE_SIZE size.txt \ + --MERCURY_FP_STORE_URI $MERCURY_FP_STORE_URI \ + --CREDENTIALS_VAULT_PATH $MERCURY_AUTH_KEY \ + >>> + + runtime { + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.1.3-1652895718" + memory: "3500 MiB" + maxRetries: select_first([max_retries, 2]) + preemptible: select_first([preemptible_tries, 3]) + } +} + +task IngestOutputsToTDR { + input { + String tdr_dataset_id + String tdr_target_table_name + String? in_load_tag + File outputs_tsv + } + + command { + + python3 /scripts/emerge/ingest_to_tdr.py -d ~{tdr_dataset_id} \ + -t ~{tdr_target_table_name} \ + -f ~{outputs_tsv} \ + ~{"-l=" + in_load_tag} + } + + runtime { + docker: "gcr.io/emerge-production/emerge_wdls:v.1.0" + } + + output { + File ingest_logs = stdout() + String load_tag = read_string("load_tag.txt") + } +} \ No newline at end of file diff --git a/TCapRNAPipeline/subworkflows/Qc.wdl b/TCapRNAPipeline/subworkflows/Qc.wdl new file mode 100644 index 0000000..c84819c --- /dev/null +++ b/TCapRNAPipeline/subworkflows/Qc.wdl @@ -0,0 +1,708 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL defines tasks used for QC of human whole-genome or exome sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
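+
+# Convention used throughout this file: disk is sized from the input sizes plus fixed headroom, and the
+# Java heap (-Xms/-Xmx) is kept at least ~500 MiB below the requested VM memory to leave room for
+# non-heap JVM overhead.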
+ +# Collect sequencing yield quality metrics +task CollectQualityYieldMetrics { + input { + File input_bam + String metrics_filename + + Int preemptible_tries = 3 + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + + command { + java -Xms2000m -Xmx3000m -jar /usr/picard/picard.jar \ + CollectQualityYieldMetrics \ + INPUT=~{input_bam} \ + OQ=true \ + OUTPUT=~{metrics_filename} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "3500 MiB" + preemptible: preemptible_tries + } + output { + File quality_yield_metrics = "~{metrics_filename}" + } +} + +# Collect base quality and insert size metrics +task CollectUnsortedReadgroupBamQualityMetrics { + input { + File input_bam + String output_bam_prefix + Int preemptible_tries + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + + command { + java -Xms5000m -Xmx6500m -jar /usr/picard/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectBaseDistributionByCycle \ + PROGRAM=CollectInsertSizeMetrics \ + PROGRAM=MeanQualityByCycle \ + PROGRAM=QualityScoreDistribution \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=ALL_READS + + touch ~{output_bam_prefix}.insert_size_metrics + touch ~{output_bam_prefix}.insert_size_histogram.pdf + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: "7000 MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File base_distribution_by_cycle_pdf = "~{output_bam_prefix}.base_distribution_by_cycle.pdf" + File base_distribution_by_cycle_metrics = "~{output_bam_prefix}.base_distribution_by_cycle_metrics" + File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" + File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" + File quality_by_cycle_pdf = "~{output_bam_prefix}.quality_by_cycle.pdf" + File quality_by_cycle_metrics = "~{output_bam_prefix}.quality_by_cycle_metrics" + File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" + File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" + } +} + +# Collect alignment summary and GC bias quality metrics +task CollectReadgroupBamQualityMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + Boolean collect_gc_bias_metrics = true + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + # These are optionally generated, but need to exist for Cromwell's sake + touch ~{output_bam_prefix}.gc_bias.detail_metrics \ + ~{output_bam_prefix}.gc_bias.pdf \ + ~{output_bam_prefix}.gc_bias.summary_metrics + + java -Xms5000m -Xmx6500m -jar /usr/picard/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectAlignmentSummaryMetrics \ + ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=READ_GROUP + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: "7000 MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + 
output { + File alignment_summary_metrics = "~{output_bam_prefix}.alignment_summary_metrics" + File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" + File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" + File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" + } +} + +# Collect quality metrics from the aggregated bam +task CollectAggregationMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + Boolean collect_gc_bias_metrics = true + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + # These are optionally generated, but need to exist for Cromwell's sake + touch ~{output_bam_prefix}.gc_bias.detail_metrics \ + ~{output_bam_prefix}.gc_bias.pdf \ + ~{output_bam_prefix}.gc_bias.summary_metrics \ + ~{output_bam_prefix}.insert_size_metrics \ + ~{output_bam_prefix}.insert_size_histogram.pdf + + java -Xms5000m -Xmx6500m -jar /usr/picard/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectAlignmentSummaryMetrics \ + PROGRAM=CollectInsertSizeMetrics \ + PROGRAM=CollectSequencingArtifactMetrics \ + PROGRAM=QualityScoreDistribution \ + ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=SAMPLE \ + METRIC_ACCUMULATION_LEVEL=LIBRARY + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: "7000 MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File alignment_summary_metrics = "~{output_bam_prefix}.alignment_summary_metrics" + File bait_bias_detail_metrics = "~{output_bam_prefix}.bait_bias_detail_metrics" + File bait_bias_summary_metrics = "~{output_bam_prefix}.bait_bias_summary_metrics" + File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" + File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" + File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" + File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" + File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" + File pre_adapter_detail_metrics = "~{output_bam_prefix}.pre_adapter_detail_metrics" + File pre_adapter_summary_metrics = "~{output_bam_prefix}.pre_adapter_summary_metrics" + File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" + File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" + File error_summary_metrics = "~{output_bam_prefix}.error_summary_metrics" + } +} + +task ConvertSequencingArtifactToOxoG { + input { + File pre_adapter_detail_metrics + File bait_bias_detail_metrics + String base_name + File ref_dict + File ref_fasta + File ref_fasta_index + Int preemptible_tries + Int memory_multiplier = 1 + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(pre_adapter_detail_metrics, "GiB") + size(bait_bias_detail_metrics, "GiB") + ref_size) + 20 + + Int memory_size = ceil(4000 * memory_multiplier) + Int java_memory_size = memory_size - 1000 + Int max_heap = memory_size - 500 + + command { + input_base=$(dirname 
~{pre_adapter_detail_metrics})/~{base_name} + java -Xms~{java_memory_size}m -Xmx~{max_heap}m \ + -jar /usr/picard/picard.jar \ + ConvertSequencingArtifactToOxoG \ + --INPUT_BASE $input_base \ + --OUTPUT_BASE ~{base_name} \ + --REFERENCE_SEQUENCE ~{ref_fasta} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: "~{memory_size} MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File oxog_metrics = "~{base_name}.oxog_metrics" + } +} + +# Check that the fingerprints of separate readgroups all match +task CrossCheckFingerprints { + input { + Array[File] input_bams + Array[File] input_bam_indexes + File haplotype_database_file + String metrics_filename + Float total_input_size + Int preemptible_tries + Float lod_threshold + String cross_check_by + } + + Int disk_size = ceil(total_input_size) + 20 + + command <<< + java -Dsamjdk.buffer_size=131072 \ + -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Xmx3000m \ + -jar /usr/picard/picard.jar \ + CrosscheckFingerprints \ + OUTPUT=~{metrics_filename} \ + HAPLOTYPE_MAP=~{haplotype_database_file} \ + EXPECT_ALL_GROUPS_TO_MATCH=true \ + INPUT=~{sep=' INPUT=' input_bams} \ + LOD_THRESHOLD=~{lod_threshold} \ + CROSSCHECK_BY=~{cross_check_by} + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "3500 MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File cross_check_fingerprints_metrics = "~{metrics_filename}" + } +} + +task CheckFingerprintTask { + input { + File? input_bam + File? input_bam_index + File? input_vcf + File? input_vcf_index + String? input_sample_alias + + File genotypes + File? genotypes_index + String expected_sample_alias + + String output_basename + Float genotype_lod_threshold = 5.0 + + File haplotype_database_file + File? ref_fasta + File? ref_fasta_index + + Int memory_size = 2500 + Int preemptible_tries = 3 + + Boolean allow_lod_zero = false + } + + Int java_memory_size = memory_size - 1000 + Int max_heap = memory_size - 500 + + Int disk_size = ceil(size(input_bam, "GiB") + size(input_vcf, "GiB")) + 20 + # Picard has different behavior depending on whether or not the OUTPUT parameter ends with a '.', so we are explicitly + # passing in where we want the two metrics files to go to avoid any potential confusion. 
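+  # Note: the LOD value is extracted below by taking column 5 of line 8 of the summary metrics file
+  # (sed '8q;d'); the CONTENT_LINE variable computed from the metrics header is not used by that sed call.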
+ String summary_metrics_location = "~{output_basename}.fingerprinting_summary_metrics" + String detail_metrics_location = "~{output_basename}.fingerprinting_detail_metrics" + + File input_file = select_first([input_vcf, input_bam]) + + command <<< + set -e + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -Dpicard.useLegacyParser=false -jar /usr/picard/picard.jar \ + CheckFingerprint \ + --INPUT ~{input_file} \ + ~{if defined(input_vcf) then "--OBSERVED_SAMPLE_ALIAS \"" + input_sample_alias + "\"" else ""} \ + --GENOTYPES ~{genotypes} \ + --EXPECTED_SAMPLE_ALIAS "~{expected_sample_alias}" \ + ~{if defined(input_bam) then "--IGNORE_READ_GROUPS true" else ""} \ + --HAPLOTYPE_MAP ~{haplotype_database_file} \ + --GENOTYPE_LOD_THRESHOLD ~{genotype_lod_threshold} \ + --SUMMARY_OUTPUT ~{summary_metrics_location} \ + --DETAIL_OUTPUT ~{detail_metrics_location} \ + ~{"--REFERENCE_SEQUENCE " + ref_fasta} \ + ~{true='--EXIT_CODE_WHEN_NO_VALID_CHECKS 0' false='' allow_lod_zero} + + CONTENT_LINE=$(cat ~{summary_metrics_location} | + grep -n "## METRICS CLASS\tpicard.analysis.FingerprintingSummaryMetrics" | + cut -f1 -d:) + CONTENT_LINE=$(($CONTENT_LINE+2)) + sed '8q;d' ~{summary_metrics_location} | cut -f5 > lod + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_size} MiB" + preemptible: preemptible_tries + } + + output { + File summary_metrics = summary_metrics_location + File detail_metrics = detail_metrics_location + Float lod = read_float("lod") + } +} + +task CheckPreValidation { + input { + File duplication_metrics + File chimerism_metrics + Float max_duplication_in_reasonable_sample + Float max_chimerism_in_reasonable_sample + + Int preemptible_tries = 3 + } + + command <<< + set -o pipefail + set -e + + grep -A 1 PERCENT_DUPLICATION ~{duplication_metrics} > duplication.csv + grep -A 3 PCT_CHIMERAS ~{chimerism_metrics} | grep -v OF_PAIR > chimerism.csv + + python3 <>> + runtime { + docker: "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + preemptible: preemptible_tries + memory: "2 GiB" + } + output { + Float duplication_rate = read_float("duplication_value.txt") + Float chimerism_rate = read_float("chimerism_value.txt") + Boolean is_outlier_data = duplication_rate > max_duplication_in_reasonable_sample || chimerism_rate > max_chimerism_in_reasonable_sample + } +} + +task ValidateSamFile { + input { + File input_bam + File? input_bam_index + String report_filename + File ref_dict + File ref_fasta + File ref_fasta_index + Int? max_output + Array[String]? ignore + Boolean? 
is_outlier_data + Int preemptible_tries = 0 + Int memory_multiplier = 1 + Int additional_disk = 20 + + Int disk_size = ceil(size(input_bam, "GiB") + + size(ref_fasta, "GiB") + + size(ref_fasta_index, "GiB") + + size(ref_dict, "GiB")) + additional_disk + } + + Int memory_size = ceil(16000 * memory_multiplier) + Int java_memory_size = memory_size - 1000 + Int max_heap = memory_size - 500 + + command { + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + ValidateSamFile \ + INPUT=~{input_bam} \ + OUTPUT=~{report_filename} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + ~{"MAX_OUTPUT=" + max_output} \ + IGNORE=~{default="null" sep=" IGNORE=" ignore} \ + MODE=VERBOSE \ + ~{default='SKIP_MATE_VALIDATION=false' true='SKIP_MATE_VALIDATION=true' false='SKIP_MATE_VALIDATION=false' is_outlier_data} \ + IS_BISULFITE_SEQUENCED=false + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "~{memory_size} MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File report = "~{report_filename}" + } +} + +task CollectWgsMetrics { + input { + File input_bam + File input_bam_index + String metrics_filename + File wgs_coverage_interval_list + File ref_fasta + File ref_fasta_index + Int read_length = 250 + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + java -Xms2000m -Xmx2500m -jar /usr/picard/picard.jar \ + CollectWgsMetrics \ + INPUT=~{input_bam} \ + VALIDATION_STRINGENCY=SILENT \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + INCLUDE_BQ_HISTOGRAM=true \ + INTERVALS=~{wgs_coverage_interval_list} \ + OUTPUT=~{metrics_filename} \ + USE_FAST_ALGORITHM=true \ + READ_LENGTH=~{read_length} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "3000 MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File metrics = "~{metrics_filename}" + } +} + +# Collect raw WGS metrics (commonly used QC thresholds) +task CollectRawWgsMetrics { + input { + File input_bam + File input_bam_index + String metrics_filename + File wgs_coverage_interval_list + File ref_fasta + File ref_fasta_index + Int read_length = 250 + Int preemptible_tries + Int memory_multiplier = 1 + Int additional_disk = 20 + } + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + additional_disk + + Int memory_size = ceil((if (disk_size < 110) then 5 else 7) * memory_multiplier) + String java_memory_size = (memory_size - 1) * 1000 + + command { + java -Xms~{java_memory_size}m -jar /usr/picard/picard.jar \ + CollectRawWgsMetrics \ + INPUT=~{input_bam} \ + VALIDATION_STRINGENCY=SILENT \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + INCLUDE_BQ_HISTOGRAM=true \ + INTERVALS=~{wgs_coverage_interval_list} \ + OUTPUT=~{metrics_filename} \ + USE_FAST_ALGORITHM=true \ + READ_LENGTH=~{read_length} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "~{memory_size} GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File metrics = "~{metrics_filename}" + } +} + +task CollectHsMetrics { + input { + File input_bam + File input_bam_index + File ref_fasta + File ref_fasta_index + String metrics_filename + File target_interval_list + File bait_interval_list + Int preemptible_tries + Int memory_multiplier = 1 + Int additional_disk = 20 + } 
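+
+  # bait_interval_list and target_interval_list above are Picard interval_list files describing the
+  # capture baits and targets; they are expected to match the sequence dictionary of ref_fasta.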
+ + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + additional_disk + # Try to fit the input bam into memory, within reason. + Int rounded_bam_size = ceil(size(input_bam, "GiB") + 0.5) + Int rounded_memory_size = ceil((if (rounded_bam_size > 10) then 10 else rounded_bam_size) * memory_multiplier) + Int memory_size = if rounded_memory_size < 7 then 7000 else (rounded_memory_size * 1000) + Int java_memory_size = memory_size - 1000 + Int max_heap = memory_size - 500 + + # There are probably more metrics we want to generate with this tool + command { + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + CollectHsMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + VALIDATION_STRINGENCY=SILENT \ + TARGET_INTERVALS=~{target_interval_list} \ + BAIT_INTERVALS=~{bait_interval_list} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=SAMPLE \ + METRIC_ACCUMULATION_LEVEL=LIBRARY \ + OUTPUT=~{metrics_filename} + } + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "~{memory_size} MiB" + disks: "local-disk " + disk_size + " HDD" + } + + output { + File metrics = metrics_filename + } +} + +# Generate a checksum per readgroup +task CalculateReadGroupChecksum { + input { + File input_bam + File input_bam_index + String read_group_md5_filename + Int preemptible_tries + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 80 + + command { + java -Xms1000m -Xmx3500m -jar /usr/picard/picard.jar \ + CalculateReadGroupChecksum \ + INPUT=~{input_bam} \ + OUTPUT=~{read_group_md5_filename} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "6000 MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File md5_file = "~{read_group_md5_filename}" + } +} + +# Validate a (g)VCF with -gvcf specific validation +task ValidateVCF { + input { + File input_vcf + File input_vcf_index + File ref_fasta + File ref_fasta_index + File ref_dict + File? dbsnp_vcf + File? dbsnp_vcf_index + File calling_interval_list + File? calling_interval_list_index # if the interval list is a VCF, than an index file makes VcfToIntervalList run faster + Boolean calling_intervals_defined = true + Int preemptible_tries = 3 + Boolean is_gvcf = true + String? extra_args + #Setting default docker value for workflows that haven't yet been azurized. 
+ String docker_path = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int machine_mem_mb = 7000 + } + + String calling_interval_list_basename = basename(calling_interval_list) + String calling_interval_list_index_basename = if calling_intervals_defined then "" else basename(select_first([calling_interval_list_index])) + + Int command_mem_mb = machine_mem_mb - 2000 + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_vcf, "GiB") + size(calling_interval_list, "GiB") + size(dbsnp_vcf, "GiB") + ref_size) + 20 + + command { + set -e + + if [ ~{calling_intervals_defined} == "false" ]; then + # We can't always assume the index was located with the vcf, so make a link so that the paths look the same + ln -s ~{calling_interval_list} ~{calling_interval_list_basename} + ln -s ~{calling_interval_list_index} ~{calling_interval_list_index_basename} + gatk --java-options "-Xms~{command_mem_mb}m -Xmx~{command_mem_mb}m" \ + VcfToIntervalList -I ~{calling_interval_list_basename} -O intervals_from_gvcf.interval_list + INTERVALS="intervals_from_gvcf.interval_list" + else + INTERVALS="~{calling_interval_list}" + fi + + gatk --java-options "-Xms~{command_mem_mb}m -Xmx~{command_mem_mb}m" \ + ValidateVariants \ + -V ~{input_vcf} \ + -R ~{ref_fasta} \ + -L $INTERVALS \ + ~{true="-gvcf" false="" is_gvcf} \ + --validation-type-to-exclude ALLELES \ + ~{"--dbsnp " + dbsnp_vcf} \ + ~{extra_args} + } + runtime { + docker: docker_path + preemptible: preemptible_tries + memory: machine_mem_mb + " MiB" + bootDiskSizeGb: 15 + disks: "local-disk " + disk_size + " HDD" + } +} + +# Collect variant calling metrics from GVCF output +task CollectVariantCallingMetrics { + input { + File input_vcf + File input_vcf_index + String metrics_basename + File dbsnp_vcf + File dbsnp_vcf_index + File ref_dict + File evaluation_interval_list + Boolean is_gvcf = true + Int preemptible_tries + #Setting default docker value for workflows that haven't yet been azurized. + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + } + + Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB")) + 20 + + command { + java -Xms2000m -Xmx2500m -jar /usr/picard/picard.jar \ + CollectVariantCallingMetrics \ + INPUT=~{input_vcf} \ + OUTPUT=~{metrics_basename} \ + DBSNP=~{dbsnp_vcf} \ + SEQUENCE_DICTIONARY=~{ref_dict} \ + TARGET_INTERVALS=~{evaluation_interval_list} \ + ~{true="GVCF_INPUT=true" false="" is_gvcf} + } + runtime { + docker: docker + preemptible: preemptible_tries + memory: "3000 MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File summary_metrics = "~{metrics_basename}.variant_calling_summary_metrics" + File detail_metrics = "~{metrics_basename}.variant_calling_detail_metrics" + } +} diff --git a/TCapRNAPipeline/subworkflows/RNAWithUMIsPipeline.wdl b/TCapRNAPipeline/subworkflows/RNAWithUMIsPipeline.wdl new file mode 100644 index 0000000..2e8a973 --- /dev/null +++ b/TCapRNAPipeline/subworkflows/RNAWithUMIsPipeline.wdl @@ -0,0 +1,259 @@ +version 1.0 + +import "./UMIAwareDuplicateMarking.wdl" as UmiMD +import "./RNAWithUMIsTasks.wdl" as tasks + +## Copyright Broad Institute, 2021 +## +## This WDL pipeline implements data processing for RNA with UMIs +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). 
Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +workflow RNAWithUMIsPipeline { + + String pipeline_version = "1.0.16" + + input { + File? bam + File? r1_fastq + File? r2_fastq + String read1Structure + String read2Structure + String output_basename + + String platform + String library_name + String platform_unit + String read_group_name + String sequencing_center = "BI" + + File starIndex + File gtf + + File ref + File refIndex + File refDict + File refFlat + File ribosomalIntervals + File exonBedFile + + File population_vcf + File population_vcf_index + } + + parameter_meta { + bam: "Read group-specific unmapped BAM file; alternatively, paired-end FASTQ files (the `r1_fastq` and `r2_fastq` inputs) may be used" + r1_fastq: "Read 1 FASTQ file; alternatively, the unmapped bam file (`bam` input) may be used as input" + r2_fastq: "Read 2 FASTQ file; alternatively, the unmapped bam file (`bam` input) may be used as input" + read1Structure: "String describing how the bases in a sequencing run should be allocated into logical reads for read 1" + read2Structure: "String describing how the bases in a sequencing run should be allocated into logical reads for read 2" + starIndex: "TAR file containing genome indices used for the STAR aligner" + output_basename: "String used as a prefix in workflow output files" + gtf: "Gene annotation file (GTF) used for the rnaseqc tool" + platform: "String used to describe the sequencing platform" + library_name: "String used to describe the library" + platform_unit: "String used to describe the platform unit" + read_group_name: "String used to describe the read group name" + sequencing_center: "String used to describe the sequencing center; default is set to 'BI'" + ref: "FASTA file used for metric collection with Picard tools" + refIndex: "FASTA index file used for metric collection with Picard tools" + refDict: "Dictionary file used for metric collection with Picard tools" + refFlat: "refFlat file used for metric collection with Picard tools" + ribosomalIntervals: "Intervals file used for RNA metric collection with Picard tools" + exonBedFile: "Bed file used for fragment size calculations in the rnaseqc tool; contains non-overlapping exons" + population_vcf: "VCF file for contamination estimation; contains common SNP sites from population-wide studies like ExAC or gnomAD" + population_vcf_index: "Population VCF index file used for contamination estimation" + } + + call tasks.VerifyPipelineInputs { + input: + bam = bam, + r1_fastq = r1_fastq, + r2_fastq = r2_fastq + } + + if (VerifyPipelineInputs.fastq_run) { + call tasks.FastqToUbam { + input: + r1_fastq = select_first([r1_fastq]), + r2_fastq = select_first([r2_fastq]), + bam_filename = output_basename, + library_name = library_name, + platform = platform, + platform_unit = platform_unit, + read_group_name = read_group_name, + sequencing_center = sequencing_center + } + } + + File bam_to_use = select_first([bam, FastqToUbam.unmapped_bam]) + + call tasks.ExtractUMIs { + input: + bam = bam_to_use, + read1Structure = read1Structure, + read2Structure = read2Structure + } + + # Convert SAM to fastq for adapter clipping + # This step also removes reads that fail platform/vendor quality checks + call 
tasks.SamToFastq { + input: + bam = ExtractUMIs.bam_umis_extracted, + output_prefix = output_basename + } + + # Adapter clipping + call tasks.Fastp { + input: + fastq1 = SamToFastq.fastq1, + fastq2 = SamToFastq.fastq2, + output_prefix = output_basename + ".adapter_clipped" + } + + # Back to SAM before alignment + call tasks.FastqToUbam as FastqToUbamAfterClipping { + input: + r1_fastq = Fastp.fastq1_clipped, + r2_fastq = Fastp.fastq2_clipped, + bam_filename = output_basename + ".adapter_clipped", + library_name = library_name, + platform = platform, + platform_unit = platform_unit, + read_group_name = read_group_name, + sequencing_center = sequencing_center + } + + call tasks.FastQC { + input: + unmapped_bam = FastqToUbamAfterClipping.unmapped_bam + } + + call tasks.STAR { + input: + bam = FastqToUbamAfterClipping.unmapped_bam, + starIndex = starIndex + } + + call tasks.CopyReadGroupsToHeader { + input: + bam_with_readgroups = STAR.aligned_bam, + bam_without_readgroups = STAR.transcriptome_bam + } + + # Mark duplicate reads in the genome-aligned bam. The output is coordinate-sorted. + call UmiMD.UMIAwareDuplicateMarking { + input: + aligned_bam = STAR.aligned_bam, + unaligned_bam = ExtractUMIs.bam_umis_extracted, + output_basename = output_basename, + remove_duplicates = false, + coordinate_sort_output = true + } + + # Remove, rather than mark, duplicates reads from the transcriptome-aligned bam + # The output is query-name-sorted. + call UmiMD.UMIAwareDuplicateMarking as UMIAwareDuplicateMarkingTranscriptome { + input: + aligned_bam = CopyReadGroupsToHeader.output_bam, + unaligned_bam = ExtractUMIs.bam_umis_extracted, + output_basename = output_basename + ".transcriptome", + remove_duplicates = true, + coordinate_sort_output = false + } + + call tasks.PostprocessTranscriptomeForRSEM { + input: + prefix = output_basename + ".transcriptome", + input_bam = UMIAwareDuplicateMarkingTranscriptome.duplicate_marked_bam + } + + call tasks.GetSampleName { + input: + bam = bam_to_use + } + + call tasks.rnaseqc2 { + input: + bam_file = UMIAwareDuplicateMarking.duplicate_marked_bam, + genes_gtf = gtf, + sample_id = GetSampleName.sample_name, + exon_bed = exonBedFile + } + + call tasks.CollectRNASeqMetrics { + input: + input_bam = UMIAwareDuplicateMarking.duplicate_marked_bam, + input_bam_index = UMIAwareDuplicateMarking.duplicate_marked_bam_index, + output_bam_prefix = GetSampleName.sample_name, + ref_dict = refDict, + ref_fasta = ref, + ref_fasta_index = refIndex, + ref_flat = refFlat, + ribosomal_intervals = ribosomalIntervals, + } + + call tasks.CollectMultipleMetrics { + input: + input_bam = UMIAwareDuplicateMarking.duplicate_marked_bam, + input_bam_index = UMIAwareDuplicateMarking.duplicate_marked_bam_index, + output_bam_prefix = GetSampleName.sample_name, + ref_dict = refDict, + ref_fasta = ref, + ref_fasta_index = refIndex + } + + call tasks.CalculateContamination { + input: + bam = UMIAwareDuplicateMarking.duplicate_marked_bam, + bam_index = UMIAwareDuplicateMarking.duplicate_marked_bam_index, + base_name = GetSampleName.sample_name, + population_vcf = population_vcf, + population_vcf_index = population_vcf_index, + ref_dict = refDict, + ref_fasta = ref, + ref_fasta_index = refIndex + } + + output { + String sample_name = GetSampleName.sample_name + File transcriptome_bam = PostprocessTranscriptomeForRSEM.output_bam + File transcriptome_duplicate_metrics = UMIAwareDuplicateMarkingTranscriptome.duplicate_metrics + File output_bam = UMIAwareDuplicateMarking.duplicate_marked_bam + File 
output_bam_index = UMIAwareDuplicateMarking.duplicate_marked_bam_index + File duplicate_metrics = UMIAwareDuplicateMarking.duplicate_metrics + File rnaseqc2_gene_tpm = rnaseqc2.gene_tpm + File rnaseqc2_gene_counts = rnaseqc2.gene_counts + File rnaseqc2_exon_counts = rnaseqc2.exon_counts + File rnaseqc2_fragment_size_histogram = rnaseqc2.fragment_size_histogram + File rnaseqc2_metrics = rnaseqc2.metrics + File picard_rna_metrics = CollectRNASeqMetrics.rna_metrics + File picard_alignment_summary_metrics = CollectMultipleMetrics.alignment_summary_metrics + File picard_insert_size_metrics = CollectMultipleMetrics.insert_size_metrics + File picard_insert_size_histogram = CollectMultipleMetrics.insert_size_histogram + File picard_base_distribution_by_cycle_metrics = CollectMultipleMetrics.base_distribution_by_cycle_metrics + File picard_base_distribution_by_cycle_pdf = CollectMultipleMetrics.base_distribution_by_cycle_pdf + File picard_quality_by_cycle_metrics = CollectMultipleMetrics.quality_by_cycle_metrics + File picard_quality_by_cycle_pdf = CollectMultipleMetrics.quality_by_cycle_pdf + File picard_quality_distribution_metrics = CollectMultipleMetrics.quality_distribution_metrics + File picard_quality_distribution_pdf = CollectMultipleMetrics.quality_distribution_pdf + Float contamination = CalculateContamination.contamination + Float contamination_error = CalculateContamination.contamination_error + File fastqc_html_report = FastQC.fastqc_html + Float fastqc_percent_reads_with_adapter = FastQC.fastqc_percent_reads_with_adapter + } + + meta { + allowNestedInputs: true + } +} + diff --git a/TCapRNAPipeline/subworkflows/RNAWithUMIsTasks.wdl b/TCapRNAPipeline/subworkflows/RNAWithUMIsTasks.wdl new file mode 100644 index 0000000..0c5ee13 --- /dev/null +++ b/TCapRNAPipeline/subworkflows/RNAWithUMIsTasks.wdl @@ -0,0 +1,1039 @@ +version 1.0 + +task VerifyPipelineInputs { + meta { + description: "Verify that the pipeline input is either a ubam or pair of fastqs with additional info" + } + + input { + File? bam + File? r1_fastq + File? 
r2_fastq + + String docker = "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + Int cpu = 1 + Int memory_mb = 2000 + Int disk_size_gb = ceil(size(bam, "GiB") + size(r1_fastq,"GiB") + size(r2_fastq, "GiB")) + 10 + } + + command <<< + set -e + python3 <>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + Boolean fastq_run = read_boolean("output.txt") + } +} + +task ExtractUMIs { + input { + File bam + String read1Structure + String read2Structure + + String docker = "us.gcr.io/broad-gotc-prod/fgbio:1.0.0-1.4.0-1638817487" + Int cpu = 4 + Int memory_mb = 5000 + Int disk_size_gb = ceil(2.2 * size(bam, "GiB")) + 20 + } + + command <<< + java -jar /usr/gitc/fgbio.jar ExtractUmisFromBam \ + --input ~{bam} \ + --read-structure ~{read1Structure} \ + --read-structure ~{read2Structure} \ + --molecular-index-tags RX \ + --output extractUMIs.out.bam + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + preemptible: 0 + } + + output { + File bam_umis_extracted = "extractUMIs.out.bam" + } +} + +# Adapter clipping +# https://github.com/OpenGene/fastp +task Fastp { + input { + File fastq1 + File fastq2 + String output_prefix + File adapter_fasta = "gs://gcp-public-data--broad-references/RNA/resources/Illumina_adapters.fasta" + + String docker = "us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500" + Int memory_mb = ceil(1.5*size(fastq1, "MiB")) + 8192 # Experimentally determined formula for memory allocation + Int disk_size_gb = 5*ceil(size(fastq1, "GiB")) + 128 + File monitoring_script = "gs://broad-dsde-methods-monitoring/cromwell_monitoring_script.sh" + Int cpu=4 + } + + command { + bash ~{monitoring_script} > monitoring.log & + + fastp --in1 ~{fastq1} --in2 ~{fastq2} --out1 ~{output_prefix}_read1.fastq.gz --out2 ~{output_prefix}_read2.fastq.gz \ + --disable_quality_filtering \ + --disable_length_filtering \ + --adapter_fasta ~{adapter_fasta} \ + --thread ~{cpu} + } + + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + cpu: cpu + disks: "local-disk ~{disk_size_gb} HDD" + preemptible: 0 + maxRetries: 2 + } + + output { + File monitoring_log = "monitoring.log" + File fastq1_clipped = output_prefix + "_read1.fastq.gz" + File fastq2_clipped = output_prefix + "_read2.fastq.gz" + } + +} + +task STAR { + input { + File bam + File starIndex + + String docker = "us.gcr.io/broad-gotc-prod/samtools-star:1.0.0-1.11-2.7.10a-1642556627" + Int cpu = 8 + Int memory_mb = ceil((size(starIndex, "GiB")) + 10) * 1500 + Int disk_size_gb = ceil(2.2 * size(bam, "GiB") + size(starIndex, "GiB")) + 150 + } + + command <<< + echo $(date +"[%b %d %H:%M:%S] Extracting STAR index") + mkdir star_index + tar -xvf ~{starIndex} -C star_index --strip-components=1 + + STAR \ + --runMode alignReads \ + --runThreadN ~{cpu} \ + --genomeDir star_index \ + --outSAMtype BAM Unsorted \ + --readFilesIn ~{bam} \ + --readFilesType SAM PE \ + --readFilesCommand samtools view -h \ + --runRNGseed 12345 \ + --outSAMunmapped Within \ + --outFilterType BySJout \ + --outFilterMultimapNmax 20 \ + --outFilterScoreMinOverLread 0.33 \ + --outFilterMatchNminOverLread 0.33 \ + --outFilterMismatchNmax 999 \ + --outFilterMismatchNoverLmax 0.1 \ + --alignIntronMin 20 \ + --alignIntronMax 1000000 \ + --alignMatesGapMax 1000000 \ + --alignSJoverhangMin 8 \ + --alignSJDBoverhangMin 1 \ + --alignSoftClipAtReferenceEnds Yes \ + --chimSegmentMin 15 \ + --chimMainSegmentMultNmax 1 \ + --chimOutType 
WithinBAM SoftClip \ + --chimOutJunctionFormat 0 \ + --twopassMode Basic \ + --quantMode TranscriptomeSAM \ + --quantTranscriptomeBan IndelSoftclipSingleend \ + --alignEndsProtrude 20 ConcordantPair + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + preemptible: 0 + } + + output { + File aligned_bam = "Aligned.out.bam" + File transcriptome_bam = "Aligned.toTranscriptome.out.bam" + } +} + +task FastqToUbam { + input { + File r1_fastq + File r2_fastq + String bam_filename + String library_name + String platform + String platform_unit + String read_group_name + String sequencing_center + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:3.0.0" + Int cpu = 1 + Int memory_mb = 4000 + Int disk_size_gb = ceil(size(r1_fastq, "GiB")*2.2 + size(r2_fastq, "GiB")*2.2) + 50 + } + + String unmapped_bam_output_name = bam_filename + ".u.bam" + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar FastqToSam \ + SORT_ORDER=unsorted \ + F1=~{r1_fastq}\ + F2=~{r2_fastq} \ + SM="~{bam_filename}" \ + LB="~{library_name}" \ + PL="~{platform}" \ + PU="~{platform_unit}" \ + RG="~{read_group_name}" \ + CN="~{sequencing_center}" \ + O="~{unmapped_bam_output_name}" \ + ALLOW_EMPTY_FASTQ=True + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File unmapped_bam = unmapped_bam_output_name + } +} + +task CopyReadGroupsToHeader { + input { + File bam_with_readgroups + File bam_without_readgroups + + String docker = "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + Int cpu = 1 + Int memory_mb = 2500 + Int disk_size_gb = ceil(2.0 * size([bam_with_readgroups, bam_without_readgroups], "GiB")) + 10 + } + + String basename = basename(bam_without_readgroups) + + command <<< + samtools view -H ~{bam_without_readgroups} > header.sam + samtools view -H ~{bam_with_readgroups} | grep "@RG" >> header.sam + samtools reheader header.sam ~{bam_without_readgroups} > ~{basename} + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File output_bam = basename + } +} + +task GetSampleName { + input { + File bam + + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int cpu = 1 + Int memory_mb = 1000 + Int disk_size_gb = ceil(2.0 * size(bam, "GiB")) + 10 + } + + parameter_meta { + bam: { + localization_optional: true + } + } + + command <<< + gatk GetSampleName -I ~{bam} -O sample_name.txt + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + String sample_name = read_string("sample_name.txt") + } +} + +task rnaseqc2 { + input { + File bam_file + File genes_gtf + String sample_id + File exon_bed + + String docker = "us.gcr.io/broad-dsde-methods/ckachulis/rnaseqc@sha256:a4bec726bb51df5fb8c8f640f7dec22fa28c8f7803ef9994b8ec39619b4514ca" + Int cpu = 1 + Int memory_mb = 8000 + Int disk_size_gb = ceil(size(bam_file, 'GiB') + size(genes_gtf, 'GiB') + size(exon_bed, 'GiB')) + 50 + } + + command <<< + set -euo pipefail + # force fragmentSizes histogram output file to exist (even if empty) + touch ~{sample_id}.fragmentSizes.txt + echo $(date +"[%b %d %H:%M:%S] Running RNA-SeQC 2") + rnaseqc ~{genes_gtf} ~{bam_file} . 
-s ~{sample_id} -v --bed ~{exon_bed} + echo " * compressing outputs" + gzip *.gct + echo $(date +"[%b %d %H:%M:%S] done") + >>> + + output { + File gene_tpm = "~{sample_id}.gene_tpm.gct.gz" + File gene_counts = "~{sample_id}.gene_reads.gct.gz" + File exon_counts = "~{sample_id}.exon_reads.gct.gz" + File fragment_size_histogram = "~{sample_id}.fragmentSizes.txt" + File metrics = "~{sample_id}.metrics.tsv" + } + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + maxRetries: 2 + } +} + +task CollectRNASeqMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + File ref_flat + File ribosomal_intervals + + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int cpu = 1 + Int memory_mb = 7500 + Int disk_size_gb = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB")) + 20 + } + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar CollectRnaSeqMetrics \ + REF_FLAT=~{ref_flat} \ + RIBOSOMAL_INTERVALS= ~{ribosomal_intervals} \ + STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_prefix}.rna_metrics + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File rna_metrics = output_bam_prefix + ".rna_metrics" + } +} + +task CollectMultipleMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:3.0.0" + Int cpu = 1 + Int memory_mb = 7500 + Int disk_size_gb = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB")) + 20 + } + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + #plots will not be produced if there are no reads + touch ~{output_bam_prefix}.insert_size_histogram.pdf + touch ~{output_bam_prefix}.insert_size_metrics + touch ~{output_bam_prefix}.base_distribution_by_cycle.pdf + touch ~{output_bam_prefix}.quality_by_cycle.pdf + touch ~{output_bam_prefix}.quality_distribution.pdf + + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar CollectMultipleMetrics \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_prefix} \ + REFERENCE_SEQUENCE=~{ref_fasta} + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File alignment_summary_metrics = output_bam_prefix + ".alignment_summary_metrics" + File insert_size_metrics = output_bam_prefix + ".insert_size_metrics" + File insert_size_histogram = output_bam_prefix + ".insert_size_histogram.pdf" + File base_distribution_by_cycle_metrics = output_bam_prefix + ".base_distribution_by_cycle_metrics" + File base_distribution_by_cycle_pdf = output_bam_prefix + ".base_distribution_by_cycle.pdf" + File quality_by_cycle_metrics = output_bam_prefix + ".quality_by_cycle_metrics" + File quality_by_cycle_pdf = output_bam_prefix + ".quality_by_cycle.pdf" + File quality_distribution_metrics = output_bam_prefix + ".quality_distribution_metrics" + File quality_distribution_pdf = output_bam_prefix + ".quality_distribution.pdf" + } +} + +task MergeMetrics { + input { + File 
alignment_summary_metrics + File insert_size_metrics + File picard_rna_metrics + File duplicate_metrics + File rnaseqc2_metrics + File? fingerprint_summary_metrics + String output_basename + + String docker = "python:3.8-slim" + Int cpu = 1 + Int memory_mb = 3000 + Int disk_size_gb = 10 + } + + String out_filename = output_basename + ".unified_metrics.txt" + + command <<< + + # + # Script transpose a two line TSV + # + cat <<-'EOF' > transpose.py + import csv, sys + + rows = list(csv.reader(sys.stdin, delimiter='\t')) + + for col in range(0, len(rows[0])): + key = rows[0][col].lower() + value = rows[1][col] + if value == "?": + value = "NaN" + if key in ["median_insert_size", "median_absolute_deviation", "median_read_length", "mad_read_length", "pf_hq_median_mismatches"]: + value = str(int(float(value))) + print(f"{key}\t{value}") + EOF + + # + # Script clean the keys, replacing space, dash and forward-slash with underscores, + # and removing comma, single quote and periods + # + cat <<-'EOF' > clean.py + import sys + + for line in sys.stdin: + (k,v) = line.strip().lower().split("\t") + transtable = k.maketrans({' ':'_', '-':'_', '/':'_', ',':None, '\'':None, '.' : None}) + print(f"{k.translate(transtable)}\t{v}") + EOF + + # Process each metric file, transposing and cleaning if necessary, and pre-pending a source to the metric name + + echo "Processing Alignment Summary Metrics - Only PAIR line" + cat ~{alignment_summary_metrics} | egrep "(CATEGORY|^PAIR)" | python transpose.py | grep -Eiv "(SAMPLE|LIBRARY|READ_GROUP)" | awk '{print "picard_" $0}' >> ~{out_filename} + + echo "Processing Insert Size Metrics - removing various WIDTH metrics" + cat ~{insert_size_metrics} | grep -A 1 "MEDIAN_INSERT_SIZE" | python transpose.py | grep -Eiv "(SAMPLE|LIBRARY|READ_GROUP|WIDTH)" | awk '{print "picard_" $0}' >> ~{out_filename} + + echo "Processing Picard RNA Metrics" + cat ~{picard_rna_metrics} | grep -A 1 "RIBOSOMAL_BASES" | python transpose.py | grep -Eiv "(SAMPLE|LIBRARY|READ_GROUP)" | awk '{print "picard_rna_metrics_" $0}' >> ~{out_filename} + + echo "Processing Duplicate Metrics" + cat ~{duplicate_metrics} | grep -A 1 "READ_PAIR_DUPLICATES" | python transpose.py | awk '{print "picard_" $0}' >> ~{out_filename} + + echo "Processing RNASeQC2 Metrics" + cat ~{rnaseqc2_metrics} | python clean.py | awk '{print "rnaseqc2_" $0}' >> ~{out_filename} + + if [[ -f "~{fingerprint_summary_metrics}" ]]; + then + echo "Processing Fingerprint Summary Metrics - only extracting LOD_EXPECTED_SAMPLE" + cat ~{fingerprint_summary_metrics} | grep -A 1 "LOD_EXPECTED_SAMPLE" | python transpose.py | grep -i "LOD_EXPECTED_SAMPLE" | awk '{print "fp_"$0}' >> ~{out_filename} + else + echo "No Fingerprint Summary Metrics found." + echo "fp_lod_expected_sample " >> ~{out_filename} + fi >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File unified_metrics = out_filename + } +} + +task SortSamByCoordinate { + input { + File input_bam + String output_bam_basename + + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs + # more disk space. 
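# Illustrative example (not part of the pipeline) of what the transpose.py helper defined in the
# MergeMetrics task above does to a two-line Picard metrics block; the metric names and values
# below are made up. The surrounding awk in that task then prepends a source prefix such as
# "picard_" to each key.
import csv, io

picard_block = "MEDIAN_INSERT_SIZE\tMEAN_INSERT_SIZE\tPCT_CHIMERAS\n250.5\t?\t0.01\n"
rows = list(csv.reader(io.StringIO(picard_block), delimiter="\t"))
for col in range(len(rows[0])):
    key = rows[0][col].lower()
    value = rows[1][col]
    if value == "?":
        value = "NaN"                      # undefined metrics become NaN
    if key in ["median_insert_size", "median_absolute_deviation", "median_read_length",
               "mad_read_length", "pf_hq_median_mismatches"]:
        value = str(int(float(value)))     # median-type metrics are reported as integers
    print(f"{key}\t{value}")
# Prints:
#   median_insert_size    250
#   mean_insert_size      NaN
#   pct_chimeras          0.01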
Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier + Float sort_sam_disk_multiplier = 4.0 + + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int cpu = 1 + Int memory_mb = 7500 + Int disk_size_gb = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 + } + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar SortSam \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_basename}.bam \ + SORT_ORDER="coordinate" \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true \ + MAX_RECORDS_IN_RAM=300000 + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File output_bam = "~{output_bam_basename}.bam" + File output_bam_index = "~{output_bam_basename}.bai" + File output_bam_md5 = "~{output_bam_basename}.bam.md5" + } +} + +task SortSamByQueryName { + input { + File input_bam + String output_bam_basename + + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs + # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier + Float sort_sam_disk_multiplier = 6.0 + + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int cpu = 1 + Int memory_mb = 7500 + Int disk_size_gb = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 + } + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar SortSam \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_basename}.bam \ + SORT_ORDER="queryname" \ + CREATE_MD5_FILE=true \ + MAX_RECORDS_IN_RAM=300000 + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File output_bam = "~{output_bam_basename}.bam" + File output_bam_md5 = "~{output_bam_basename}.bam.md5" + } +} + +task GroupByUMIs { + input { + File bam + File bam_index + String output_bam_basename + + String docker = "us.gcr.io/broad-gotc-prod/umi_tools:1.0.0-1.1.1-1690198330" + Int cpu = 2 + Int memory_mb = 64000 + Int disk_size_gb = ceil(2.2 * size([bam, bam_index], "GiB")) + 100 + + File monitoring_script = "gs://broad-dsde-methods-monitoring/cromwell_monitoring_script.sh" + } + + command <<< + bash ~{monitoring_script} > monitoring.log & + + # umi_tools has a bug which lead to using the order of elements in a set to determine tie-breakers in + # rare edge-cases. Sets in python are unordered, so this leads to non-determinism. Setting PYTHONHASHSEED + # to 0 means that hashes will be unsalted. 
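# Self-contained illustration (not pipeline code) of the non-determinism discussed in the comment
# above: iteration order over a set of strings depends on CPython's hash salting, so identical
# invocations can enumerate UMIs differently unless PYTHONHASHSEED is pinned.
import os, subprocess, sys

snippet = "print(list({'ACGT-TTA', 'ACGT-TTC', 'ACGA-TTA'}))"

def run_once(extra_env):
    env = dict(os.environ, **extra_env)
    return subprocess.run([sys.executable, "-c", snippet],
                          env=env, capture_output=True, text=True).stdout.strip()

print(run_once({}), run_once({}))  # order may differ between these two runs
print(run_once({"PYTHONHASHSEED": "0"}), run_once({"PYTHONHASHSEED": "0"}))  # repeatable in practice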
While it is not in any way gauranteed by the language that this + # will remove the non-determinism, in practice, due to implementation details of sets in cpython, this makes seemingly + # deterministic behavior much more likely + + export PYTHONHASHSEED=0 + + umi_tools group -I ~{bam} --paired --no-sort-output --output-bam --stdout ~{output_bam_basename}.bam --umi-tag-delimiter "-" \ + --extract-umi-method tag --umi-tag RX --unmapped-reads use + >>> + + output { + File grouped_bam = "~{output_bam_basename}.bam" + File monitoring_log = "monitoring.log" + } + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + maxRetries: 1 + } +} + +task MarkDuplicatesUMIAware { + input { + File bam + String output_basename + Boolean remove_duplicates + Boolean use_umi + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int cpu = 1 + Int memory_mb = 16000 + Int disk_size_gb = ceil(3 * size(bam, "GiB")) + 60 + } + + String output_bam_basename = output_basename + ".duplicate_marked" + + command <<< + java -jar /usr/picard/picard.jar MarkDuplicates \ + INPUT=~{bam} \ + OUTPUT=~{output_bam_basename}.bam \ + METRICS_FILE=~{output_basename}.duplicate.metrics \ + REMOVE_DUPLICATES=~{remove_duplicates} \ + ~{true='READ_ONE_BARCODE_TAG=BX' false='' use_umi} \ + + >>> + + output { + File duplicate_marked_bam = "~{output_bam_basename}.bam" + File duplicate_metrics = "~{output_basename}.duplicate.metrics" + } + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } +} + +task formatPipelineOutputs { + input { + String sample_id + String transcriptome_bam + String transcriptome_duplicate_metrics + String output_bam + String output_bam_index + String duplicate_metrics + String rnaseqc2_gene_tpm + String rnaseqc2_gene_counts + String rnaseqc2_exon_counts + String rnaseqc2_fragment_size_histogram + String rnaseqc2_metrics + String picard_rna_metrics + String picard_alignment_summary_metrics + String picard_insert_size_metrics + String picard_insert_size_histogram + String picard_base_distribution_by_cycle_metrics + String picard_base_distribution_by_cycle_pdf + String picard_quality_by_cycle_metrics + String picard_quality_by_cycle_pdf + String picard_quality_distribution_metrics + String picard_quality_distribution_pdf + String? picard_fingerprint_summary_metrics + String? 
picard_fingerprint_detail_metrics + File unified_metrics + Float contamination + Float contamination_error + String fastqc_html_report + String fastqc_percent_reads_with_adapter + + Int cpu = 1 + Int memory_mb = 2000 + Int disk_size_gb = 10 + } + + String outputs_json_file_name = "outputs_to_TDR_~{sample_id}.json" + + command <<< + python3 << CODE + import json + + outputs_dict = {} + + # NOTE: we rename some field names to match the TDR schema + outputs_dict["sample_id"]="~{sample_id}" # primary key + outputs_dict["transcriptome_bam"]="~{transcriptome_bam}" + outputs_dict["transcriptome_duplicate_metrics_file"]="~{transcriptome_duplicate_metrics}" + outputs_dict["genome_bam"]="~{output_bam}" + outputs_dict["genome_bam_index"]="~{output_bam_index}" + outputs_dict["picard_duplicate_metrics_file"]="~{duplicate_metrics}" + outputs_dict["rnaseqc2_gene_tpm_file"]="~{rnaseqc2_gene_tpm}" + outputs_dict["rnaseqc2_gene_counts_file"]="~{rnaseqc2_gene_counts}" + outputs_dict["rnaseqc2_exon_counts_file"]="~{rnaseqc2_exon_counts}" + outputs_dict["rnaseqc2_fragment_size_histogram_file"]="~{rnaseqc2_fragment_size_histogram}" + outputs_dict["rnaseqc2_metrics_file"]="~{rnaseqc2_metrics}" + outputs_dict["picard_rna_metrics_file"]="~{picard_rna_metrics}" + outputs_dict["picard_alignment_summary_metrics_file"]="~{picard_alignment_summary_metrics}" + outputs_dict["picard_insert_size_metrics_file"]="~{picard_insert_size_metrics}" + outputs_dict["picard_insert_size_histogram_file"]="~{picard_insert_size_histogram}" + outputs_dict["picard_base_distribution_by_cycle_metrics_file"]="~{picard_base_distribution_by_cycle_metrics}" + outputs_dict["picard_base_distribution_by_cycle_pdf_file"]="~{picard_base_distribution_by_cycle_pdf}" + outputs_dict["picard_quality_by_cycle_metrics_file"]="~{picard_quality_by_cycle_metrics}" + outputs_dict["picard_quality_by_cycle_pdf_file"]="~{picard_quality_by_cycle_pdf}" + outputs_dict["picard_quality_distribution_metrics_file"]="~{picard_quality_distribution_metrics}" + outputs_dict["picard_quality_distribution_pdf_file"]="~{picard_quality_distribution_pdf}" + outputs_dict["fp_summary_metrics_file"]="~{picard_fingerprint_summary_metrics}" + outputs_dict["fp_detail_metrics_file"]="~{picard_fingerprint_detail_metrics}" + outputs_dict["fastqc_html_report"]="~{fastqc_html_report}" + outputs_dict["fastqc_percent_reads_with_adapter"]="~{fastqc_percent_reads_with_adapter}" + outputs_dict["contamination"]="~{contamination}" + outputs_dict["contamination_error"]="~{contamination_error}" + + # explode unified metrics file + with open("~{unified_metrics}", "r") as infile: + for row in infile: + key, value = row.rstrip("\n").split("\t") + outputs_dict[key] = value + + # write full outputs to file + with open("~{outputs_json_file_name}", 'w') as outputs_file: + for key, value in outputs_dict.items(): + if value == "None" or value == "": + outputs_dict[key] = None + outputs_file.write(json.dumps(outputs_dict)) + outputs_file.write("\n") + CODE + >>> + + runtime { + docker: "broadinstitute/horsefish:tdr_import_v1.4" + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File pipeline_outputs_json = outputs_json_file_name + } +} + +task updateOutputsInTDR { + input { + String tdr_dataset_uuid + File outputs_json + + Int cpu = 1 + Int memory_mb = 2000 + Int disk_size_gb = 10 + } + + command <<< + # input args: + # -d dataset uuid + # -t target table in dataset + # -o json of data to ingest + # -f field to populate with timestamp at ingest (can have 
multiple) + python -u /scripts/export_pipeline_outputs_to_tdr.py \ + -d "~{tdr_dataset_uuid}" \ + -t "sample" \ + -o "~{outputs_json}" \ + -f "version_timestamp" \ + -f "analysis_end_time" + >>> + + runtime { + docker: "broadinstitute/horsefish:tdr_import_v1.4" + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File ingest_logs = stdout() + } +} + +# GATK CalculateContamination, adapted for RNA-seq data. +# Specifically, we disable two read filters from the default set of read filters +# for a LocusWalker: +# 1. WellformedReadFilter: This filter removes (among others) reads with N's in the CIGAR string, which is +# common in RNA data. +# 2. MappingQualityAvailableReadFilter: This filter removes reads with MQ=255, which by SAM spec +# means mapping quality is missing. But STAR uses 255 to mean unique mapping, the equivalent of MQ60 +# for other aligners. +task CalculateContamination { + input { + File bam + File bam_index + String base_name + File ref_fasta + File ref_dict + File ref_fasta_index + File population_vcf + File population_vcf_index + # runtime + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int cpu = 1 + Int memory_mb = 8192 + Int disk_size_gb = 256 + } + + parameter_meta { + bam: { localization_optional: true } + bam_index: { localization_optional: true } + ref_fasta: { localization_optional: true } + ref_fasta_index: { localization_optional: true } + ref_dict: { localization_optional: true } + } + + command <<< + set -e + gatk --java-options "-Xmx4096m" GetPileupSummaries \ + -R ~{ref_fasta} \ + -I ~{bam} \ + -V ~{population_vcf} \ + -L ~{population_vcf} \ + -O ~{base_name}_pileups.tsv \ + --disable-read-filter WellformedReadFilter \ + --disable-read-filter MappingQualityAvailableReadFilter + + gatk --java-options "-Xmx4096m" CalculateContamination \ + -I ~{base_name}_pileups.tsv \ + -O ~{base_name}_contamination.tsv + + grep -v ^sample ~{base_name}_contamination.tsv | awk 'BEGIN{FS="\t"}{print($2)}' > contamination.txt + grep -v ^sample ~{base_name}_contamination.tsv | awk 'BEGIN{FS="\t"}{print($3)}' > contamination_error.txt + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File pileups = "~{base_name}_pileups.tsv" + File contamination_table = "~{base_name}_contamination.tsv" + Float contamination = read_float("contamination.txt") + Float contamination_error = read_float("contamination_error.txt") + } +} + +task SamToFastq { + input { + File bam + String output_prefix + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int memory_mb = 4096 + Int disk_size_gb = 3*ceil(size(bam, "GiB")) + 128 + } + + command { + java -jar /usr/picard/picard.jar SamToFastq \ + I=~{bam} \ + FASTQ=~{output_prefix}_1.fastq.gz \ + SECOND_END_FASTQ=~{output_prefix}_2.fastq.gz + + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File fastq1 = output_prefix + "_1.fastq.gz" + File fastq2 = output_prefix + "_2.fastq.gz" + } +} + +task FastQC { + input { + File unmapped_bam + Float? 
mem = 4 + String docker = "us.gcr.io/tag-public/tag-tools:1.0.0" # sato: This likely needs to be made public + Int memory_mb = 4096 + Int disk_size_gb = 3*ceil(size(unmapped_bam, "GiB")) + 128 + } + + String bam_basename = basename(unmapped_bam, ".bam") + + command { + perl /usr/tag/scripts/FastQC/fastqc ~{unmapped_bam} --extract -o ./ + mv ~{bam_basename}_fastqc/fastqc_data.txt ~{bam_basename}_fastqc_data.txt + + tail -n 2 ~{bam_basename}_fastqc_data.txt | head -n 1 | cut -f 2 > ~{bam_basename}_adapter_content.txt + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File fastqc_data = "~{bam_basename}_fastqc_data.txt" + File fastqc_html = "~{bam_basename}_fastqc.html" + Float fastqc_percent_reads_with_adapter = if read_string("~{bam_basename}_adapter_content.txt") == "warn" then -1 else read_float("~{bam_basename}_adapter_content.txt") + } +} + +task TransferReadTags { + input { + File aligned_bam + File ubam + String output_basename + String docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0" + Int memory_mb = 16000 + Int disk_size_gb = ceil(2 * size(aligned_bam, "GiB")) + ceil(2 * size(ubam, "GiB")) + 128 + } + + command <<< + gatk TransferReadTags \ + -I ~{aligned_bam} \ + --unmapped-sam ~{ubam} \ + -O ~{output_basename}.bam \ + --read-tags RX + >>> + + output { + File output_bam = "~{output_basename}.bam" + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } +} + +task PostprocessTranscriptomeForRSEM { + input { + String prefix + File input_bam # the input must be queryname sorted + Int disk_size_gb = ceil(3*size(input_bam,"GiB")) + 128 + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" + Int memory_mb = 16000 + } + + command { + gatk PostProcessReadsForRSEM \ + -I ~{input_bam} \ + -O ~{prefix}_RSEM_post_processed.bam \ + --use-jdk-deflater + } + + output { + File output_bam = "~{prefix}_RSEM_post_processed.bam" + } + + runtime { + docker: docker + disks: "local-disk ~{disk_size_gb} HDD" + memory: "${memory_mb} MiB" + } +} + +task CreateEmptyFile { + input { + Int disk_size_gb = 128 + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" + Int memory_mb = 4096 + } + + command { + echo "place holder for a bam index file" > empty.txt + } + + output { + File empty_file = "empty.txt" + } + + runtime { + docker: docker + disks: "local-disk ~{disk_size_gb} HDD" + memory: "${memory_mb} MiB" + } +} \ No newline at end of file diff --git a/TCapRNAPipeline/subworkflows/UMIAwareDuplicateMarking.wdl b/TCapRNAPipeline/subworkflows/UMIAwareDuplicateMarking.wdl new file mode 100644 index 0000000..c4802da --- /dev/null +++ b/TCapRNAPipeline/subworkflows/UMIAwareDuplicateMarking.wdl @@ -0,0 +1,112 @@ +version 1.0 + +import "./RNAWithUMIsTasks.wdl" as tasks + +## Copyright Broad Institute, 2021 +## +## This WDL pipeline implements UMI Aware Duplicate Marking +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. 
Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +workflow UMIAwareDuplicateMarking { + + input { + File aligned_bam + File unaligned_bam + String output_basename + Boolean remove_duplicates + Boolean coordinate_sort_output + } + + parameter_meta { + aligned_bam: "Unsorted aligned bam (the output of STAR in multithread mode is not query-name sorted)" + unaligned_bam: "Query-name sorted unaligned bam; contains UMIs in the RX tag" + output_basename: "Basename for file outputs from this workflow" + remove_duplicates: "If true, remove (rather than mark) duplicate reads from the output" + coordinate_sort: "If true, the output bam will be coordinate sorted. Else it will be query-name sorted." + } + + call tasks.SortSamByQueryName as SortSamByQueryNameAfterAlignment { + input: + input_bam = aligned_bam, + output_bam_basename = output_basename + ".queryname_sorted" + } + + # It appears we cannot assume that the unmapped bam/fastqs will be sorted + call tasks.SortSamByQueryName as SortSamByQueryNameUnmapped { + input: + input_bam = unaligned_bam, + output_bam_basename = output_basename + ".u.queryname_sorted" + } + + call tasks.TransferReadTags { + input: + aligned_bam = SortSamByQueryNameAfterAlignment.output_bam, + ubam = SortSamByQueryNameUnmapped.output_bam, + output_basename = output_basename + ".queryname_sorted_with_RX" + } + + # First sort the aligned bam by coordinate, so we can group duplicate sets using UMIs in the next step. + call tasks.SortSamByCoordinate as SortSamByCoordinateFirstPass { + input: + input_bam = TransferReadTags.output_bam, + output_bam_basename = output_basename + ".STAR_aligned.coordinate_sorted" + } + + # Further divide each duplicate set (a set of reads with the same insert start and end coordinates) + # into subsets that share the same UMIs i.e. differenciate PCR duplicates from biological duplicates. + # (biological duplicates are independent DNA molecules that are sheared such that the inserts are indistinguishable.) + # input: a coordinate sorted bam + # output: a coordinate sorted bam with UMIs (what are the generated tags?) . + + call tasks.GroupByUMIs { + input: + bam = SortSamByCoordinateFirstPass.output_bam, + bam_index = SortSamByCoordinateFirstPass.output_bam_index, + output_bam_basename = output_basename + ".grouped_by_UMI" + } + + call tasks.SortSamByQueryName as SortSamByQueryNameBeforeDuplicateMarking { + input: + input_bam = GroupByUMIs.grouped_bam, + output_bam_basename = output_basename + ".grouped.queryname_sorted" + } + + call tasks.MarkDuplicatesUMIAware as MarkDuplicates { + input: + bam = SortSamByQueryNameBeforeDuplicateMarking.output_bam, + output_basename = output_basename, + remove_duplicates = remove_duplicates, + use_umi = true + } + + if (coordinate_sort_output){ + call tasks.SortSamByCoordinate as SortSamByCoordinateSecondPass { + input: + input_bam = MarkDuplicates.duplicate_marked_bam, + output_bam_basename = output_basename + ".duplicate_marked.coordinate_sorted" + } + } + + # We won't have the index file if the output is query-name sorted. 
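# Conceptual sketch (not the actual fgbio/umi_tools/Picard implementation used by this workflow)
# of the duplicate-marking idea described above: reads sharing insert coordinates form a duplicate
# set, and only reads that also share a UMI are treated as PCR duplicates of one another.
from collections import defaultdict

def mark_duplicates_umi_aware(reads):
    """reads: iterable of dicts with 'name', 'chrom', 'insert_start', 'insert_end' and 'umi'."""
    groups = defaultdict(list)
    for read in reads:
        # Subdivide each positional duplicate set by UMI.
        key = (read["chrom"], read["insert_start"], read["insert_end"], read["umi"])
        groups[key].append(read)
    duplicates = set()
    for members in groups.values():
        # Keep one representative per (coordinates, UMI) subset; the rest are PCR duplicates.
        duplicates.update(read["name"] for read in members[1:])
    return duplicates

reads = [
    {"name": "r1", "chrom": "chr1", "insert_start": 100, "insert_end": 350, "umi": "ACGT-TTA"},
    {"name": "r2", "chrom": "chr1", "insert_start": 100, "insert_end": 350, "umi": "ACGT-TTA"},
    {"name": "r3", "chrom": "chr1", "insert_start": 100, "insert_end": 350, "umi": "GGCA-TTA"},
]
print(mark_duplicates_umi_aware(reads))  # {'r2'}; r3 is a biological duplicate and is kept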
+ # Until we remove the transcriptome index from the TDR schema, + # output a placeholder text file as the bam index as a temporary fix + if (!coordinate_sort_output){ + call tasks.CreateEmptyFile {} + } + + output { + File duplicate_marked_bam = select_first([SortSamByCoordinateSecondPass.output_bam, MarkDuplicates.duplicate_marked_bam]) + File duplicate_marked_bam_index = select_first([SortSamByCoordinateSecondPass.output_bam_index, CreateEmptyFile.empty_file]) + File duplicate_metrics = MarkDuplicates.duplicate_metrics + } +} diff --git a/TCapRNAPipeline/subworkflows/Utilities.wdl b/TCapRNAPipeline/subworkflows/Utilities.wdl new file mode 100644 index 0000000..e6a1aee --- /dev/null +++ b/TCapRNAPipeline/subworkflows/Utilities.wdl @@ -0,0 +1,303 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL defines utility tasks used for processing of sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# Generate sets of intervals for scatter-gathering over chromosomes +task CreateSequenceGroupingTSV { + input { + File ref_dict + Int preemptible_tries + } + # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. + # It outputs to stdout where it is parsed into a wdl Array[Array[String]] + # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] + command <<< + python3 <>> + runtime { + preemptible: preemptible_tries + docker: "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + memory: "2 GiB" + } + output { + Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") + Array[Array[String]] sequence_grouping_with_unmapped = read_tsv("sequence_grouping_with_unmapped.txt") + } +} + +# This task calls picard's IntervalListTools to scatter the input interval list into scatter_count sub interval lists +# Note that the number of sub interval lists may not be exactly equal to scatter_count. There may be slightly more or less. +# Thus we have the block of python to count the number of generated sub interval lists. +task ScatterIntervalList { + input { + File interval_list + Int scatter_count + Int break_bands_at_multiples_of + #Setting default docker value for workflows that haven't yet been azurized. 
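# The inline python of ScatterIntervalList is not shown in this diff; a minimal sketch, assuming
# the counting step works the way the comment above and the read_int(stdout()) output imply:
# count the interval lists that IntervalListTools wrote under out/ and print that count.
import glob

interval_lists = glob.glob("out/*/*.interval_list")  # same pattern as the task's glob() output
print(len(interval_lists))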
+ String docker = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" + } + + command <<< + set -e + mkdir out + java -Xms1000m -Xmx1500m -jar /usr/gitc/picard.jar \ + IntervalListTools \ + SCATTER_COUNT=~{scatter_count} \ + SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ + UNIQUE=true \ + SORT=true \ + BREAK_BANDS_AT_MULTIPLES_OF=~{break_bands_at_multiples_of} \ + INPUT=~{interval_list} \ + OUTPUT=out + + python3 <>> + output { + Array[File] out = glob("out/*/*.interval_list") + Int interval_count = read_int(stdout()) + } + runtime { + docker: docker + memory: "2000 MiB" + } +} + +# Convert BAM file to CRAM format +# Note that reading CRAMs directly with Picard is not yet supported +task ConvertToCram { + input { + File input_bam + File ref_fasta + File ref_fasta_index + String output_basename + Int preemptible_tries = 3 + + Int disk_size = ceil((2 * size(input_bam, "GiB")) + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB")) + 20 + } + + command <<< + set -e + set -o pipefail + + samtools view -C -T ~{ref_fasta} ~{input_bam} | \ + tee ~{output_basename}.cram | \ + md5sum | awk '{print $1}' > ~{output_basename}.cram.md5 + + # Create REF_CACHE. Used when indexing a CRAM + seq_cache_populate.pl -root ./ref/cache ~{ref_fasta} + export REF_PATH=: + export REF_CACHE=./ref/cache/%2s/%2s/%s + + samtools index ~{output_basename}.cram + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + preemptible: preemptible_tries + memory: "3 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_cram = "~{output_basename}.cram" + File output_cram_index = "~{output_basename}.cram.crai" + File output_cram_md5 = "~{output_basename}.cram.md5" + } +} + +# Convert CRAM file to BAM format +task ConvertToBam { + input { + File input_cram + File ref_fasta + File ref_fasta_index + String output_basename + } + + command <<< + set -e + set -o pipefail + + samtools view -b -o ~{output_basename}.bam -T ~{ref_fasta} ~{input_cram} + + samtools index ~{output_basename}.bam + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + preemptible: 3 + memory: "3 GiB" + cpu: "1" + disks: "local-disk 200 HDD" + } + output { + File output_bam = "~{output_basename}.bam" + File output_bam_index = "~{output_basename}.bam.bai" + } +} + +# Calculates sum of a list of floats +task SumFloats { + input { + Array[Float] sizes + Int preemptible_tries + } + + command <<< + python3 -c 'print(~{sep="+" sizes})' + >>> + output { + Float total_size = read_float(stdout()) + } + runtime { + docker: "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + preemptible: preemptible_tries + } +} + +# Print given message to stderr and return an error +task ErrorWithMessage { + input { + String message + } + command <<< + >&2 echo "Error: ~{message}" + exit 1 + >>> + + runtime { + docker: "ubuntu:20.04" + } +} + +# This task is unused for now, going to keep it in here though if we need it in the future +task GetValidationInputs { + input { + String results_path + String truth_path + Array[String]? input_files + String? 
input_file + + String docker = "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + Int cpu = 1 + Int memory_mb = 2000 + Int disk_size_gb = 20 + } + + meta { + description: "Given either a file or list of files, output both the truth and results path" + } + + command <<< + set -e + + touch truth_file.txt + touch truth_files.txt + touch results_file.txt + touch results_files.txt + + python3 <>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + String truth_file = read_string("truth_file.txt") + String results_file = read_string("results_file.txt") + Array[String] truth_files = read_lines("truth_files.txt") + Array[String] results_files = read_lines("results_files.txt") + } + +} \ No newline at end of file
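# The inline python of GetValidationInputs is likewise not shown in this diff. A speculative
# sketch of what its description and outputs suggest: for a single file or a list of files under
# results_path, derive the matching paths under truth_path and write them to the *_file.txt /
# *_files.txt files the task reads back. The mapping rule, helper name, and example paths below
# are assumptions, not the pipeline's actual code.
def write_validation_inputs(results_path, truth_path, input_files=None, input_file=None):
    def to_truth(path):
        # Assumption: truth files mirror the results layout under truth_path.
        return path.replace(results_path.rstrip("/"), truth_path.rstrip("/"), 1)

    if input_file:
        with open("results_file.txt", "w") as f:
            f.write(input_file)
        with open("truth_file.txt", "w") as f:
            f.write(to_truth(input_file))
    if input_files:
        with open("results_files.txt", "w") as f:
            f.write("\n".join(input_files))
        with open("truth_files.txt", "w") as f:
            f.write("\n".join(to_truth(p) for p in input_files))

write_validation_inputs(
    results_path="gs://bucket/pipeline/results/run1",
    truth_path="gs://bucket/pipeline/truth",
    input_file="gs://bucket/pipeline/results/run1/sample.unified_metrics.txt",
)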