diff --git a/.dockstore.yml b/.dockstore.yml index 4ce4927..8925a13 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -129,3 +129,8 @@ workflows: primaryDescriptorPath: /PECGS-QUICviz/QUICviz.wdl testParameterFiles: - /PECGS-QUICviz/QUICviz.inputs.json + - name: BroadInternalRNAWithUMIs + subclass: WDL + primaryDescriptorPath: /TCapRNAPipeline/BroadInternalRNAWithUMIs.wdl + testParameterFiles: + - /TCapRNAPipeline/BroadInternalRNAWithUMIs.inputs.json \ No newline at end of file diff --git a/TCapRNAPipeline/BroadInternalRNAWithUMIs.inputs.json b/TCapRNAPipeline/BroadInternalRNAWithUMIs.inputs.json new file mode 100644 index 0000000..d1fd0fa --- /dev/null +++ b/TCapRNAPipeline/BroadInternalRNAWithUMIs.inputs.json @@ -0,0 +1 @@ +{"BroadInternalRNAWithUMIs.environment":"prod","BroadInternalRNAWithUMIs.library_name":"${this.library_name}","BroadInternalRNAWithUMIs.output_basename":"${this.collaborator_sample_id}","BroadInternalRNAWithUMIs.platform":"${this.platform}","BroadInternalRNAWithUMIs.platform_unit":"${this.platform_unit}","BroadInternalRNAWithUMIs.r1_fastq":"${this.fastq1}","BroadInternalRNAWithUMIs.r2_fastq":"${this.fastq2}","BroadInternalRNAWithUMIs.read1Structure":"${this.read1Structure}","BroadInternalRNAWithUMIs.read2Structure":"${this.read2Structure}","BroadInternalRNAWithUMIs.read_group_name":"${this.read_group_name}","BroadInternalRNAWithUMIs.reference_build":"${this.reference_build}","BroadInternalRNAWithUMIs.sample_lsid":"${this.sample_lsid}","BroadInternalRNAWithUMIs.sequencing_center":"${this.sequencing_center}","BroadInternalRNAWithUMIs.tdr_dataset_uuid":"${}","BroadInternalRNAWithUMIs.tdr_sample_id":"${}","BroadInternalRNAWithUMIs.vault_token_path":"gs://broad-dsp-gotc-arrays-prod-tokens/arrayswdl.token"} \ No newline at end of file diff --git a/TCapRNAPipeline/BroadInternalRNAWithUMIs.wdl b/TCapRNAPipeline/BroadInternalRNAWithUMIs.wdl new file mode 100644 index 0000000..1447d43 --- /dev/null +++ b/TCapRNAPipeline/BroadInternalRNAWithUMIs.wdl @@ -0,0 +1,205 @@ +version 1.0 + +import "./subworkflows/RNAWithUMIsPipeline.wdl" as RNAWithUMIs +import "./subworkflows/CheckFingerprint.wdl" as FP +import "./subworkflows/RNAWithUMIsTasks.wdl" as tasks +import "./subworkflows/Utilities.wdl" as utils + +workflow BroadInternalRNAWithUMIs { + + String pipeline_version = "1.0.33" + + input { + # input needs to be either "hg19" or "hg38" + String reference_build + + String sample_lsid + + # RNAWithUMIs inputs + File r1_fastq + File r2_fastq + String read1Structure + String read2Structure + String output_basename + + String platform + String library_name + String platform_unit + String read_group_name + String sequencing_center = "BI" + + # Terra Data Repo dataset information + String? tdr_dataset_uuid + String? 
tdr_sample_id + + String environment + File vault_token_path + } + + File ref = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta" + File refIndex = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta.fai" + File refDict = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.dict" + File haplotype_database_file = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.haplotype_database.txt" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.haplotype_database.txt" + File refFlat = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.refFlat.txt" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/hg38_GENCODE_v34_refFlat.txt" + File starIndex = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/star/STAR2.7.10a_genome_hg19_noALT_noHLA_noDecoy_v19_oh145.tar.gz" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/star/STAR2.7.10a_genome_GRCh38_noALT_noHLA_noDecoy_v34_oh145.tar.gz" + File gtf = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.genes.v7.collapsed_only.patched_contigs.gtf" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.annotation_collapsed_only.gtf" + File ribosomalIntervals = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/annotation/Homo_sapiens_assembly19.rRNA.interval_list" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode_v34_rRNA.interval_list" + File exonBedFile = if (reference_build == "hg19") then "gs://gcp-public-data--broad-references/hg19/v0/annotation/gencode.v19.hg19.insert_size_intervals_geq1000bp.bed" else "gs://gcp-public-data--broad-references/Homo_sapiens_assembly38_noALT_noHLA_noDecoy/v0/annotation/gencode.v34.GRCh38.insert_size_intervals_geq1000bp.bed" + File population_vcf = if (reference_build == "hg19") then "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf" else "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz" + File population_vcf_index = if (reference_build == "hg19") then "gs://gatk-best-practices/somatic-b37/small_exac_common_3.vcf.idx" else "gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz.tbi" + + parameter_meta { + reference_build: "String used to define the reference genome build; should be set to 'hg19' or 'hg38'" + sample_lsid: "The sample lsid (an identifier used to retrieve fingerrints from Mercury)" + r1_fastq: "Read 1 FASTQ file" + r2_fastq: "Read 2 FASTQ file" + read1Structure: "String describing how the bases in a sequencing run should be 
allocated into logical reads for read 1" + read2Structure: "String describing how the bases in a sequencing run should be allocated into logical reads for read 2" + output_basename: "String used as a prefix in workflow output files" + platform: "String used to describe the sequencing platform" + library_name: "String used to describe the library" + platform_unit: "String used to describe the platform unit" + read_group_name: "String used to describe the read group name" + sequencing_center: "String used to describe the sequencing center; default is set to 'BI'" + environment: "The environment (dev or prod) used for determining which service to use to retrieve Mercury fingerprints" + vault_token_path: "The path to the vault token used for accessing the Mercury Fingerprint Store" + tdr_dataset_uuid: "Optional string used to define the Terra Data Repo (TDR) dataset to which outputs will be ingested" + tdr_sample_id: "Optional string used to identify the sample being processed; this must be the primary key in the TDR dataset" + } + + # make sure either hg19 or hg38 is supplied as reference_build input + if ((reference_build != "hg19") && (reference_build != "hg38")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "reference_build must be supplied with either 'hg19' or 'hg38'." + } + } + + call RNAWithUMIs.RNAWithUMIsPipeline as RNAWithUMIs { + input: + r1_fastq = r1_fastq, + r2_fastq = r2_fastq, + read1Structure = read1Structure, + read2Structure = read2Structure, + starIndex = starIndex, + output_basename = output_basename, + gtf = gtf, + platform = platform, + library_name = library_name, + platform_unit = platform_unit, + read_group_name = read_group_name, + sequencing_center = sequencing_center, + ref = ref, + refIndex = refIndex, + refDict = refDict, + refFlat = refFlat, + ribosomalIntervals = ribosomalIntervals, + exonBedFile = exonBedFile, + population_vcf = population_vcf, + population_vcf_index = population_vcf_index + } + + call FP.CheckFingerprint as CheckFingerprint { + input: + input_bam = RNAWithUMIs.output_bam, + input_bam_index = RNAWithUMIs.output_bam_index, + sample_alias = RNAWithUMIs.sample_name, + sample_lsid = sample_lsid, + output_basename = output_basename, + ref_fasta = ref, + ref_fasta_index = refIndex, + ref_dict = refDict, + read_fingerprint_from_mercury = true, + haplotype_database_file = haplotype_database_file, + environment = environment, + vault_token_path = vault_token_path, + allow_lod_zero = true + } + + call tasks.MergeMetrics { + input: + alignment_summary_metrics = RNAWithUMIs.picard_alignment_summary_metrics, + insert_size_metrics = RNAWithUMIs.picard_insert_size_metrics, + picard_rna_metrics = RNAWithUMIs.picard_rna_metrics, + duplicate_metrics = RNAWithUMIs.duplicate_metrics, + rnaseqc2_metrics = RNAWithUMIs.rnaseqc2_metrics, + fingerprint_summary_metrics = CheckFingerprint.fingerprint_summary_metrics_file, + output_basename = RNAWithUMIs.sample_name + } + + if (defined(tdr_dataset_uuid) && defined(tdr_sample_id)) { + call tasks.formatPipelineOutputs { + input: + sample_id = select_first([tdr_sample_id, ""]), + transcriptome_bam = RNAWithUMIs.transcriptome_bam, + transcriptome_duplicate_metrics = RNAWithUMIs.transcriptome_duplicate_metrics, + output_bam = RNAWithUMIs.output_bam, + output_bam_index = RNAWithUMIs.output_bam_index, + duplicate_metrics = RNAWithUMIs.duplicate_metrics, + rnaseqc2_gene_tpm = RNAWithUMIs.rnaseqc2_gene_tpm, + rnaseqc2_gene_counts = RNAWithUMIs.rnaseqc2_gene_counts, + rnaseqc2_exon_counts = 
RNAWithUMIs.rnaseqc2_exon_counts, + rnaseqc2_fragment_size_histogram = RNAWithUMIs.rnaseqc2_fragment_size_histogram, + rnaseqc2_metrics = RNAWithUMIs.rnaseqc2_metrics, + picard_rna_metrics = RNAWithUMIs.picard_rna_metrics, + picard_alignment_summary_metrics = RNAWithUMIs.picard_alignment_summary_metrics, + picard_insert_size_metrics = RNAWithUMIs.picard_insert_size_metrics, + picard_insert_size_histogram = RNAWithUMIs.picard_insert_size_histogram, + picard_base_distribution_by_cycle_metrics = RNAWithUMIs.picard_base_distribution_by_cycle_metrics, + picard_base_distribution_by_cycle_pdf = RNAWithUMIs.picard_base_distribution_by_cycle_pdf, + picard_quality_by_cycle_metrics = RNAWithUMIs.picard_quality_by_cycle_metrics, + picard_quality_by_cycle_pdf = RNAWithUMIs.picard_quality_by_cycle_pdf, + picard_quality_distribution_metrics = RNAWithUMIs.picard_quality_distribution_metrics, + picard_quality_distribution_pdf = RNAWithUMIs.picard_quality_distribution_pdf, + picard_fingerprint_summary_metrics = CheckFingerprint.fingerprint_summary_metrics_file, + picard_fingerprint_detail_metrics = CheckFingerprint.fingerprint_detail_metrics_file, + unified_metrics = MergeMetrics.unified_metrics, + contamination = RNAWithUMIs.contamination, + contamination_error = RNAWithUMIs.contamination_error, + fastqc_html_report = RNAWithUMIs.fastqc_html_report, + fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter + } + + call tasks.updateOutputsInTDR { + input: + tdr_dataset_uuid = select_first([tdr_dataset_uuid, ""]), + outputs_json = formatPipelineOutputs.pipeline_outputs_json + } + } + + output { + File transcriptome_bam = RNAWithUMIs.transcriptome_bam + File output_bam = RNAWithUMIs.output_bam + File output_bam_index = RNAWithUMIs.output_bam_index + + File duplicate_metrics = RNAWithUMIs.duplicate_metrics + File transcriptome_duplicate_metrics = RNAWithUMIs.transcriptome_duplicate_metrics + + File rnaseqc2_gene_tpm = RNAWithUMIs.rnaseqc2_gene_tpm + File rnaseqc2_gene_counts = RNAWithUMIs.rnaseqc2_gene_counts + File rnaseqc2_exon_counts = RNAWithUMIs.rnaseqc2_exon_counts + File rnaseqc2_fragment_size_histogram = RNAWithUMIs.rnaseqc2_fragment_size_histogram + File rnaseqc2_metrics = RNAWithUMIs.rnaseqc2_metrics + File picard_rna_metrics = RNAWithUMIs.picard_rna_metrics + File picard_alignment_summary_metrics = RNAWithUMIs.picard_alignment_summary_metrics + File picard_insert_size_metrics = RNAWithUMIs.picard_insert_size_metrics + File picard_insert_size_histogram = RNAWithUMIs.picard_insert_size_histogram + File picard_base_distribution_by_cycle_metrics = RNAWithUMIs.picard_base_distribution_by_cycle_metrics + File picard_base_distribution_by_cycle_pdf = RNAWithUMIs.picard_base_distribution_by_cycle_pdf + File picard_quality_by_cycle_metrics = RNAWithUMIs.picard_quality_by_cycle_metrics + File picard_quality_by_cycle_pdf = RNAWithUMIs.picard_quality_by_cycle_pdf + File picard_quality_distribution_metrics = RNAWithUMIs.picard_quality_distribution_metrics + File picard_quality_distribution_pdf = RNAWithUMIs.picard_quality_distribution_pdf + File? picard_fingerprint_summary_metrics = CheckFingerprint.fingerprint_summary_metrics_file + File? 
picard_fingerprint_detail_metrics = CheckFingerprint.fingerprint_detail_metrics_file + File unified_metrics = MergeMetrics.unified_metrics + Float contamination = RNAWithUMIs.contamination + Float contamination_error = RNAWithUMIs.contamination_error + File fastqc_html_report = RNAWithUMIs.fastqc_html_report + Float fastqc_percent_reads_with_adapter = RNAWithUMIs.fastqc_percent_reads_with_adapter + } + + meta { + allowNestedInputs: true + } +} diff --git a/TCapRNAPipeline/subworkflows/CheckFingerprint.wdl b/TCapRNAPipeline/subworkflows/CheckFingerprint.wdl new file mode 100644 index 0000000..53526f2 --- /dev/null +++ b/TCapRNAPipeline/subworkflows/CheckFingerprint.wdl @@ -0,0 +1,130 @@ +version 1.0 + +import "./Utilities.wdl" as utils +import "./InternalTasks.wdl" as InternalTasks +import "./Qc.wdl" as Qc + + +## Copyright Broad Institute, 2022 +## +## This WDL pipeline implements A CheckFingerprint Task +## It runs the Picard tool 'CheckFingerprint' against a supplied input file (VCF, CRAM, BAM or SAM) using a set of 'fingerprint' genotypes. +## These genotypes can either be generated by pulling them from the (Broad-internal) Mercury Fingerprint Store or be supplied as inputs to the pipeline. + ## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +workflow CheckFingerprint { + + String pipeline_version = "1.0.20" + + input { + File? input_vcf + File? input_vcf_index + File? input_bam + File? input_bam_index + + # The name of the sample in the input_vcf. Not required if there is only one sample in the VCF + String? input_sample_alias + + # If this is true, we will read fingerprints from Mercury + # Otherwise, we will use the optional input fingerprint VCFs below + Boolean read_fingerprint_from_mercury = false + File? fingerprint_genotypes_vcf + File? fingerprint_genotypes_vcf_index + + String? sample_lsid + String sample_alias + + String output_basename + + File ref_fasta + File ref_fasta_index + File ref_dict + + File haplotype_database_file + Boolean allow_lod_zero = false + + String? environment + File? 
vault_token_path + } + + if (defined(input_vcf) && defined(input_bam)) { + call utils.ErrorWithMessage as ErrorMessageDoubleInput { + input: + message = "input_vcf and input_bam cannot both be defined as input" + } + } + + if (read_fingerprint_from_mercury && (!defined(sample_lsid) || !defined(environment) || !defined(vault_token_path))) { + call utils.ErrorWithMessage as ErrorMessageIncompleteForReadingFromMercury { + input: + message = "sample_lsid, environment, and vault_token_path must defined when reading from Mercury" + } + } + + # sample_alias may contain spaces, so make a filename-safe version for the downloaded fingerprint file + call InternalTasks.MakeSafeFilename { + input: + name = sample_alias + } + + if (read_fingerprint_from_mercury) { + call InternalTasks.DownloadGenotypes { + input: + sample_alias = sample_alias, + sample_lsid = select_first([sample_lsid]), + output_vcf_base_name = MakeSafeFilename.output_safe_name + ".reference.fingerprint", + haplotype_database_file = haplotype_database_file, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + environment = select_first([environment]), + vault_token_path = select_first([vault_token_path]) + } + } + + Boolean fingerprint_downloaded_from_mercury = select_first([DownloadGenotypes.fingerprint_retrieved, false]) + + File? fingerprint_vcf_to_use = if (fingerprint_downloaded_from_mercury) then DownloadGenotypes.reference_fingerprint_vcf else fingerprint_genotypes_vcf + File? fingerprint_vcf_index_to_use = if (fingerprint_downloaded_from_mercury) then DownloadGenotypes.reference_fingerprint_vcf_index else fingerprint_genotypes_vcf_index + + if ((defined(fingerprint_vcf_to_use)) && (defined(input_vcf) || defined(input_bam))) { + call Qc.CheckFingerprintTask { + input: + input_bam = input_bam, + input_bam_index = input_bam_index, + input_vcf = input_vcf, + input_vcf_index = input_vcf_index, + input_sample_alias = input_sample_alias, + genotypes = select_first([fingerprint_vcf_to_use]), + genotypes_index = fingerprint_vcf_index_to_use, + expected_sample_alias = sample_alias, + output_basename = output_basename, + haplotype_database_file = haplotype_database_file, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + allow_lod_zero = allow_lod_zero + } + } + + output { + Boolean fingerprint_read_from_mercury = fingerprint_downloaded_from_mercury + File? reference_fingerprint_vcf = fingerprint_vcf_to_use + File? reference_fingerprint_vcf_index = fingerprint_vcf_index_to_use + File? fingerprint_summary_metrics_file = CheckFingerprintTask.summary_metrics + File? fingerprint_detail_metrics_file = CheckFingerprintTask.detail_metrics + Float? lod_score = CheckFingerprintTask.lod + } + meta { + allowNestedInputs: true + } +} diff --git a/TCapRNAPipeline/subworkflows/InternalTasks.wdl b/TCapRNAPipeline/subworkflows/InternalTasks.wdl new file mode 100644 index 0000000..b3bef73 --- /dev/null +++ b/TCapRNAPipeline/subworkflows/InternalTasks.wdl @@ -0,0 +1,202 @@ +version 1.0 + +# Tasks that are expected to be used only on Broad Infrastructure + +# This task is to convert a string into another string which is 'safe' as a filename. +# That is, it does not contain any number of odd characters. +# Note that we are not testing for the occurrence of a single quote (') in the string. 
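+# The command in this task pipes the name through tr, replacing spaces and the listed shell
+# metacharacters with underscores so the result can be embedded safely in downstream file names.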
+task MakeSafeFilename { + input { + String name + } + + String safe_name = name + + command <<< + echo '~{name}' | tr ' "#$%&*/:;<=>?@[]^{}|~\\()' '_' > safe_name.txt + >>> + + runtime { + docker: "gcr.io/gcp-runtimes/ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" + disks: "local-disk 10 HDD" + memory: "1 GiB" + preemptible: 3 + } + output { + String output_safe_name = read_string('safe_name.txt') + } +} + +task DownloadGenotypes { + input { + String sample_alias + String sample_lsid + String output_vcf_base_name + Boolean compress = true + + String? ignoreSpecificGenotypesLsid + String? ignoreSpecificGenotypesPlatform + + File haplotype_database_file + + File ref_fasta + File ref_fasta_index + File ref_dict + + String environment + File vault_token_path + + Int? max_retries + Int? preemptible_tries + } + + parameter_meta { + ignoreSpecificGenotypesLsid: "Optional, MUST be specified with ignoreSpecificGenotypesPlatform. If both are defined will not download fingerprints for an LSID run on a specific genotyping platform. The intent is to ignore its own fingerprint" + ignoreSpecificGenotypesPlatform: "Optional, MUST be specified with ignoreSpecificGenotypesLsid. If both are defined will not download fingerprints for an LSID run on a specific genotyping platform. The intent is to ignore its own fingerprint" + } + + meta { + volatile: true + } + + String fp_retrieved_file = "fp_retrieved.txt" + + String output_vcf = output_vcf_base_name + if compress then ".vcf.gz" else".vcf" + String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" + + command <<< + + export VAULT_ADDR=https://clotho.broadinstitute.org:8200 + export VAULT_TOKEN=$(cat ~{vault_token_path}) + if [ ~{environment} == prod ]; then + export MERCURY_AUTH_KEY=secret/dsde/gotc/prod/wdl/secrets + export MERCURY_FP_STORE_URI=https://portals.broadinstitute.org/portal/mercury-ws/fingerprint + else + export MERCURY_AUTH_KEY=secret/dsde/gotc/dev/wdl/secrets + export MERCURY_FP_STORE_URI=https://portals.broadinstitute.org/portal-test/mercury-ws/fingerprint + fi + + exit_code=0 + + # TODO - there is a bug in DownloadGenotypes - we should NOT have to set EXPECTED_GENOTYPING_PLATFORMS here + # it * should * default to all of them. + + java -Xms2000m -Xmx3000m -Dpicard.useLegacyParser=false -jar /usr/gitc/picard-private.jar \ + DownloadGenotypes \ + --SAMPLE_ALIAS "~{sample_alias}" \ + --SAMPLE_LSID "~{sample_lsid}" \ + --OUTPUT "~{output_vcf}" \ + --CREATE_INDEX true \ + --REFERENCE_SEQUENCE ~{ref_fasta} \ + --HAPLOTYPE_MAP ~{haplotype_database_file} \ + --EXPECTED_GENOTYPING_PLATFORMS FLUIDIGM \ + --EXPECTED_GENOTYPING_PLATFORMS GENERAL_ARRAY \ + ~{if defined(ignoreSpecificGenotypesLsid) then "--IGNORE_SPECIFIC_GENOTYPES_LSID \"" + ignoreSpecificGenotypesLsid + "\"" else ""} \ + ~{if defined(ignoreSpecificGenotypesPlatform) then "--IGNORE_SPECIFIC_GENOTYPES_PLATFORM \"" + ignoreSpecificGenotypesPlatform + "\"" else ""} \ + --MERCURY_FP_STORE_URI $MERCURY_FP_STORE_URI \ + --CREDENTIALS_VAULT_PATH $MERCURY_AUTH_KEY \ + --ERR_NO_GENOTYPES_AVAILABLE 7 + exit_code=$? + + if [ $exit_code -eq 0 ]; then + echo "true" > ~{fp_retrieved_file} + elif [ $exit_code -eq 7 ]; then + # Exit code from DownloadGenotypes if no fingerprints were found. + # Treat this as a normal condition, but set a variable to indicate no fingerprints available. + # Create empty file so that it exists. 
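+      # Resetting exit_code to 0 lets the task exit successfully, and the touch commands below create
+      # empty placeholder VCF/index files so the task's declared File outputs exist even though no
+      # fingerprint was downloaded.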
+ exit_code=0 + echo "Found no fingerprints for ~{sample_lsid}" + echo "false" > ~{fp_retrieved_file} + touch ~{output_vcf} + touch ~{output_vcf_index} + else + echo "false" > ~{fp_retrieved_file} + fi + + exit $exit_code + >>> + + runtime { + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.1.3-1652895718" + memory: "3500 MiB" + maxRetries: select_first([max_retries, 2]) + preemptible: select_first([preemptible_tries, 3]) + } + + output { + Boolean fingerprint_retrieved = read_boolean(fp_retrieved_file) + File reference_fingerprint_vcf = output_vcf + File reference_fingerprint_vcf_index = output_vcf_index + } +} + +task UploadFingerprintToMercury { + input { + File fingerprint_json_file + File gtc_file + + String environment + File vault_token_path + + Int? max_retries + Int? preemptible_tries + } + + command <<< + set -eo pipefail + + export VAULT_ADDR=https://clotho.broadinstitute.org:8200 + export VAULT_TOKEN=$(cat ~{vault_token_path}) + if [ ~{environment} == prod ]; then + export MERCURY_AUTH_KEY=secret/dsde/gotc/prod/wdl/secrets + export MERCURY_FP_STORE_URI=https://portals.broadinstitute.org/portal/mercury-ws/fingerprint + else + export MERCURY_AUTH_KEY=secret/dsde/gotc/dev/wdl/secrets + export MERCURY_FP_STORE_URI=https://portals.broadinstitute.org/portal-test/mercury-ws/fingerprint + fi + + du -k ~{gtc_file} | cut -f 1 > size.txt + + # TODO -Fix UploadFingerprintToMercury so I don't need to pass a file size + + java -Xms2000m -Xmx3000m -Dpicard.useLegacyParser=false -jar /usr/gitc/picard-private.jar \ + UploadFingerprintToMercury \ + --INPUT "~{fingerprint_json_file}" \ + --GTC_FILE_SIZE size.txt \ + --MERCURY_FP_STORE_URI $MERCURY_FP_STORE_URI \ + --CREDENTIALS_VAULT_PATH $MERCURY_AUTH_KEY \ + >>> + + runtime { + docker: "us.gcr.io/broad-arrays-prod/arrays-picard-private:4.1.3-1652895718" + memory: "3500 MiB" + maxRetries: select_first([max_retries, 2]) + preemptible: select_first([preemptible_tries, 3]) + } +} + +task IngestOutputsToTDR { + input { + String tdr_dataset_id + String tdr_target_table_name + String? in_load_tag + File outputs_tsv + } + + command { + + python3 /scripts/emerge/ingest_to_tdr.py -d ~{tdr_dataset_id} \ + -t ~{tdr_target_table_name} \ + -f ~{outputs_tsv} \ + ~{"-l=" + in_load_tag} + } + + runtime { + docker: "gcr.io/emerge-production/emerge_wdls:v.1.0" + } + + output { + File ingest_logs = stdout() + String load_tag = read_string("load_tag.txt") + } +} \ No newline at end of file diff --git a/TCapRNAPipeline/subworkflows/Qc.wdl b/TCapRNAPipeline/subworkflows/Qc.wdl new file mode 100644 index 0000000..c84819c --- /dev/null +++ b/TCapRNAPipeline/subworkflows/Qc.wdl @@ -0,0 +1,708 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL defines tasks used for QC of human whole-genome or exome sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
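+
+# Convention used throughout this file: disk is sized from the input sizes plus fixed headroom, and the
+# Java heap (-Xms/-Xmx) is kept at least ~500 MiB below the requested VM memory to leave room for
+# non-heap JVM overhead.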
+ +# Collect sequencing yield quality metrics +task CollectQualityYieldMetrics { + input { + File input_bam + String metrics_filename + + Int preemptible_tries = 3 + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + + command { + java -Xms2000m -Xmx3000m -jar /usr/picard/picard.jar \ + CollectQualityYieldMetrics \ + INPUT=~{input_bam} \ + OQ=true \ + OUTPUT=~{metrics_filename} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "3500 MiB" + preemptible: preemptible_tries + } + output { + File quality_yield_metrics = "~{metrics_filename}" + } +} + +# Collect base quality and insert size metrics +task CollectUnsortedReadgroupBamQualityMetrics { + input { + File input_bam + String output_bam_prefix + Int preemptible_tries + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + + command { + java -Xms5000m -Xmx6500m -jar /usr/picard/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectBaseDistributionByCycle \ + PROGRAM=CollectInsertSizeMetrics \ + PROGRAM=MeanQualityByCycle \ + PROGRAM=QualityScoreDistribution \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=ALL_READS + + touch ~{output_bam_prefix}.insert_size_metrics + touch ~{output_bam_prefix}.insert_size_histogram.pdf + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: "7000 MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File base_distribution_by_cycle_pdf = "~{output_bam_prefix}.base_distribution_by_cycle.pdf" + File base_distribution_by_cycle_metrics = "~{output_bam_prefix}.base_distribution_by_cycle_metrics" + File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" + File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" + File quality_by_cycle_pdf = "~{output_bam_prefix}.quality_by_cycle.pdf" + File quality_by_cycle_metrics = "~{output_bam_prefix}.quality_by_cycle_metrics" + File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" + File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" + } +} + +# Collect alignment summary and GC bias quality metrics +task CollectReadgroupBamQualityMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + Boolean collect_gc_bias_metrics = true + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + # These are optionally generated, but need to exist for Cromwell's sake + touch ~{output_bam_prefix}.gc_bias.detail_metrics \ + ~{output_bam_prefix}.gc_bias.pdf \ + ~{output_bam_prefix}.gc_bias.summary_metrics + + java -Xms5000m -Xmx6500m -jar /usr/picard/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectAlignmentSummaryMetrics \ + ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=READ_GROUP + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: "7000 MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + 
output { + File alignment_summary_metrics = "~{output_bam_prefix}.alignment_summary_metrics" + File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" + File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" + File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" + } +} + +# Collect quality metrics from the aggregated bam +task CollectAggregationMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + Boolean collect_gc_bias_metrics = true + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + # These are optionally generated, but need to exist for Cromwell's sake + touch ~{output_bam_prefix}.gc_bias.detail_metrics \ + ~{output_bam_prefix}.gc_bias.pdf \ + ~{output_bam_prefix}.gc_bias.summary_metrics \ + ~{output_bam_prefix}.insert_size_metrics \ + ~{output_bam_prefix}.insert_size_histogram.pdf + + java -Xms5000m -Xmx6500m -jar /usr/picard/picard.jar \ + CollectMultipleMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + OUTPUT=~{output_bam_prefix} \ + ASSUME_SORTED=true \ + PROGRAM=null \ + PROGRAM=CollectAlignmentSummaryMetrics \ + PROGRAM=CollectInsertSizeMetrics \ + PROGRAM=CollectSequencingArtifactMetrics \ + PROGRAM=QualityScoreDistribution \ + ~{true='PROGRAM="CollectGcBiasMetrics"' false="" collect_gc_bias_metrics} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=SAMPLE \ + METRIC_ACCUMULATION_LEVEL=LIBRARY + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: "7000 MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File alignment_summary_metrics = "~{output_bam_prefix}.alignment_summary_metrics" + File bait_bias_detail_metrics = "~{output_bam_prefix}.bait_bias_detail_metrics" + File bait_bias_summary_metrics = "~{output_bam_prefix}.bait_bias_summary_metrics" + File gc_bias_detail_metrics = "~{output_bam_prefix}.gc_bias.detail_metrics" + File gc_bias_pdf = "~{output_bam_prefix}.gc_bias.pdf" + File gc_bias_summary_metrics = "~{output_bam_prefix}.gc_bias.summary_metrics" + File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" + File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" + File pre_adapter_detail_metrics = "~{output_bam_prefix}.pre_adapter_detail_metrics" + File pre_adapter_summary_metrics = "~{output_bam_prefix}.pre_adapter_summary_metrics" + File quality_distribution_pdf = "~{output_bam_prefix}.quality_distribution.pdf" + File quality_distribution_metrics = "~{output_bam_prefix}.quality_distribution_metrics" + File error_summary_metrics = "~{output_bam_prefix}.error_summary_metrics" + } +} + +task ConvertSequencingArtifactToOxoG { + input { + File pre_adapter_detail_metrics + File bait_bias_detail_metrics + String base_name + File ref_dict + File ref_fasta + File ref_fasta_index + Int preemptible_tries + Int memory_multiplier = 1 + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(pre_adapter_detail_metrics, "GiB") + size(bait_bias_detail_metrics, "GiB") + ref_size) + 20 + + Int memory_size = ceil(4000 * memory_multiplier) + Int java_memory_size = memory_size - 1000 + Int max_heap = memory_size - 500 + + command { + input_base=$(dirname 
~{pre_adapter_detail_metrics})/~{base_name} + java -Xms~{java_memory_size}m -Xmx~{max_heap}m \ + -jar /usr/picard/picard.jar \ + ConvertSequencingArtifactToOxoG \ + --INPUT_BASE $input_base \ + --OUTPUT_BASE ~{base_name} \ + --REFERENCE_SEQUENCE ~{ref_fasta} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: "~{memory_size} MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File oxog_metrics = "~{base_name}.oxog_metrics" + } +} + +# Check that the fingerprints of separate readgroups all match +task CrossCheckFingerprints { + input { + Array[File] input_bams + Array[File] input_bam_indexes + File haplotype_database_file + String metrics_filename + Float total_input_size + Int preemptible_tries + Float lod_threshold + String cross_check_by + } + + Int disk_size = ceil(total_input_size) + 20 + + command <<< + java -Dsamjdk.buffer_size=131072 \ + -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xms3000m -Xmx3000m \ + -jar /usr/picard/picard.jar \ + CrosscheckFingerprints \ + OUTPUT=~{metrics_filename} \ + HAPLOTYPE_MAP=~{haplotype_database_file} \ + EXPECT_ALL_GROUPS_TO_MATCH=true \ + INPUT=~{sep=' INPUT=' input_bams} \ + LOD_THRESHOLD=~{lod_threshold} \ + CROSSCHECK_BY=~{cross_check_by} + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "3500 MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File cross_check_fingerprints_metrics = "~{metrics_filename}" + } +} + +task CheckFingerprintTask { + input { + File? input_bam + File? input_bam_index + File? input_vcf + File? input_vcf_index + String? input_sample_alias + + File genotypes + File? genotypes_index + String expected_sample_alias + + String output_basename + Float genotype_lod_threshold = 5.0 + + File haplotype_database_file + File? ref_fasta + File? ref_fasta_index + + Int memory_size = 2500 + Int preemptible_tries = 3 + + Boolean allow_lod_zero = false + } + + Int java_memory_size = memory_size - 1000 + Int max_heap = memory_size - 500 + + Int disk_size = ceil(size(input_bam, "GiB") + size(input_vcf, "GiB")) + 20 + # Picard has different behavior depending on whether or not the OUTPUT parameter ends with a '.', so we are explicitly + # passing in where we want the two metrics files to go to avoid any potential confusion. 
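+  # Note: the LOD value is extracted below by taking column 5 of line 8 of the summary metrics file
+  # (sed '8q;d'); the CONTENT_LINE variable computed from the metrics header is not used by that sed call.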
+ String summary_metrics_location = "~{output_basename}.fingerprinting_summary_metrics" + String detail_metrics_location = "~{output_basename}.fingerprinting_detail_metrics" + + File input_file = select_first([input_vcf, input_bam]) + + command <<< + set -e + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -Dpicard.useLegacyParser=false -jar /usr/picard/picard.jar \ + CheckFingerprint \ + --INPUT ~{input_file} \ + ~{if defined(input_vcf) then "--OBSERVED_SAMPLE_ALIAS \"" + input_sample_alias + "\"" else ""} \ + --GENOTYPES ~{genotypes} \ + --EXPECTED_SAMPLE_ALIAS "~{expected_sample_alias}" \ + ~{if defined(input_bam) then "--IGNORE_READ_GROUPS true" else ""} \ + --HAPLOTYPE_MAP ~{haplotype_database_file} \ + --GENOTYPE_LOD_THRESHOLD ~{genotype_lod_threshold} \ + --SUMMARY_OUTPUT ~{summary_metrics_location} \ + --DETAIL_OUTPUT ~{detail_metrics_location} \ + ~{"--REFERENCE_SEQUENCE " + ref_fasta} \ + ~{true='--EXIT_CODE_WHEN_NO_VALID_CHECKS 0' false='' allow_lod_zero} + + CONTENT_LINE=$(cat ~{summary_metrics_location} | + grep -n "## METRICS CLASS\tpicard.analysis.FingerprintingSummaryMetrics" | + cut -f1 -d:) + CONTENT_LINE=$(($CONTENT_LINE+2)) + sed '8q;d' ~{summary_metrics_location} | cut -f5 > lod + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_size} MiB" + preemptible: preemptible_tries + } + + output { + File summary_metrics = summary_metrics_location + File detail_metrics = detail_metrics_location + Float lod = read_float("lod") + } +} + +task CheckPreValidation { + input { + File duplication_metrics + File chimerism_metrics + Float max_duplication_in_reasonable_sample + Float max_chimerism_in_reasonable_sample + + Int preemptible_tries = 3 + } + + command <<< + set -o pipefail + set -e + + grep -A 1 PERCENT_DUPLICATION ~{duplication_metrics} > duplication.csv + grep -A 3 PCT_CHIMERAS ~{chimerism_metrics} | grep -v OF_PAIR > chimerism.csv + + python3 <>> + runtime { + docker: "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + preemptible: preemptible_tries + memory: "2 GiB" + } + output { + Float duplication_rate = read_float("duplication_value.txt") + Float chimerism_rate = read_float("chimerism_value.txt") + Boolean is_outlier_data = duplication_rate > max_duplication_in_reasonable_sample || chimerism_rate > max_chimerism_in_reasonable_sample + } +} + +task ValidateSamFile { + input { + File input_bam + File? input_bam_index + String report_filename + File ref_dict + File ref_fasta + File ref_fasta_index + Int? max_output + Array[String]? ignore + Boolean? 
is_outlier_data + Int preemptible_tries = 0 + Int memory_multiplier = 1 + Int additional_disk = 20 + + Int disk_size = ceil(size(input_bam, "GiB") + + size(ref_fasta, "GiB") + + size(ref_fasta_index, "GiB") + + size(ref_dict, "GiB")) + additional_disk + } + + Int memory_size = ceil(16000 * memory_multiplier) + Int java_memory_size = memory_size - 1000 + Int max_heap = memory_size - 500 + + command { + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + ValidateSamFile \ + INPUT=~{input_bam} \ + OUTPUT=~{report_filename} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + ~{"MAX_OUTPUT=" + max_output} \ + IGNORE=~{default="null" sep=" IGNORE=" ignore} \ + MODE=VERBOSE \ + ~{default='SKIP_MATE_VALIDATION=false' true='SKIP_MATE_VALIDATION=true' false='SKIP_MATE_VALIDATION=false' is_outlier_data} \ + IS_BISULFITE_SEQUENCED=false + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "~{memory_size} MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File report = "~{report_filename}" + } +} + +task CollectWgsMetrics { + input { + File input_bam + File input_bam_index + String metrics_filename + File wgs_coverage_interval_list + File ref_fasta + File ref_fasta_index + Int read_length = 250 + Int preemptible_tries + } + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + 20 + + command { + java -Xms2000m -Xmx2500m -jar /usr/picard/picard.jar \ + CollectWgsMetrics \ + INPUT=~{input_bam} \ + VALIDATION_STRINGENCY=SILENT \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + INCLUDE_BQ_HISTOGRAM=true \ + INTERVALS=~{wgs_coverage_interval_list} \ + OUTPUT=~{metrics_filename} \ + USE_FAST_ALGORITHM=true \ + READ_LENGTH=~{read_length} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "3000 MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File metrics = "~{metrics_filename}" + } +} + +# Collect raw WGS metrics (commonly used QC thresholds) +task CollectRawWgsMetrics { + input { + File input_bam + File input_bam_index + String metrics_filename + File wgs_coverage_interval_list + File ref_fasta + File ref_fasta_index + Int read_length = 250 + Int preemptible_tries + Int memory_multiplier = 1 + Int additional_disk = 20 + } + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + additional_disk + + Int memory_size = ceil((if (disk_size < 110) then 5 else 7) * memory_multiplier) + String java_memory_size = (memory_size - 1) * 1000 + + command { + java -Xms~{java_memory_size}m -jar /usr/picard/picard.jar \ + CollectRawWgsMetrics \ + INPUT=~{input_bam} \ + VALIDATION_STRINGENCY=SILENT \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + INCLUDE_BQ_HISTOGRAM=true \ + INTERVALS=~{wgs_coverage_interval_list} \ + OUTPUT=~{metrics_filename} \ + USE_FAST_ALGORITHM=true \ + READ_LENGTH=~{read_length} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "~{memory_size} GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File metrics = "~{metrics_filename}" + } +} + +task CollectHsMetrics { + input { + File input_bam + File input_bam_index + File ref_fasta + File ref_fasta_index + String metrics_filename + File target_interval_list + File bait_interval_list + Int preemptible_tries + Int memory_multiplier = 1 + Int additional_disk = 20 + } 
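+
+  # bait_interval_list and target_interval_list above are Picard interval_list files describing the
+  # capture baits and targets; they are expected to match the sequence dictionary of ref_fasta.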
+ + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + Int disk_size = ceil(size(input_bam, "GiB") + ref_size) + additional_disk + # Try to fit the input bam into memory, within reason. + Int rounded_bam_size = ceil(size(input_bam, "GiB") + 0.5) + Int rounded_memory_size = ceil((if (rounded_bam_size > 10) then 10 else rounded_bam_size) * memory_multiplier) + Int memory_size = if rounded_memory_size < 7 then 7000 else (rounded_memory_size * 1000) + Int java_memory_size = memory_size - 1000 + Int max_heap = memory_size - 500 + + # There are probably more metrics we want to generate with this tool + command { + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + CollectHsMetrics \ + INPUT=~{input_bam} \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + VALIDATION_STRINGENCY=SILENT \ + TARGET_INTERVALS=~{target_interval_list} \ + BAIT_INTERVALS=~{bait_interval_list} \ + METRIC_ACCUMULATION_LEVEL=null \ + METRIC_ACCUMULATION_LEVEL=SAMPLE \ + METRIC_ACCUMULATION_LEVEL=LIBRARY \ + OUTPUT=~{metrics_filename} + } + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "~{memory_size} MiB" + disks: "local-disk " + disk_size + " HDD" + } + + output { + File metrics = metrics_filename + } +} + +# Generate a checksum per readgroup +task CalculateReadGroupChecksum { + input { + File input_bam + File input_bam_index + String read_group_md5_filename + Int preemptible_tries + } + + Int disk_size = ceil(size(input_bam, "GiB")) + 80 + + command { + java -Xms1000m -Xmx3500m -jar /usr/picard/picard.jar \ + CalculateReadGroupChecksum \ + INPUT=~{input_bam} \ + OUTPUT=~{read_group_md5_filename} + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "6000 MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File md5_file = "~{read_group_md5_filename}" + } +} + +# Validate a (g)VCF with -gvcf specific validation +task ValidateVCF { + input { + File input_vcf + File input_vcf_index + File ref_fasta + File ref_fasta_index + File ref_dict + File? dbsnp_vcf + File? dbsnp_vcf_index + File calling_interval_list + File? calling_interval_list_index # if the interval list is a VCF, than an index file makes VcfToIntervalList run faster + Boolean calling_intervals_defined = true + Int preemptible_tries = 3 + Boolean is_gvcf = true + String? extra_args + #Setting default docker value for workflows that haven't yet been azurized. 
+ String docker_path = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int machine_mem_mb = 7000 + } + + String calling_interval_list_basename = basename(calling_interval_list) + String calling_interval_list_index_basename = if calling_intervals_defined then "" else basename(select_first([calling_interval_list_index])) + + Int command_mem_mb = machine_mem_mb - 2000 + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = ceil(size(input_vcf, "GiB") + size(calling_interval_list, "GiB") + size(dbsnp_vcf, "GiB") + ref_size) + 20 + + command { + set -e + + if [ ~{calling_intervals_defined} == "false" ]; then + # We can't always assume the index was located with the vcf, so make a link so that the paths look the same + ln -s ~{calling_interval_list} ~{calling_interval_list_basename} + ln -s ~{calling_interval_list_index} ~{calling_interval_list_index_basename} + gatk --java-options "-Xms~{command_mem_mb}m -Xmx~{command_mem_mb}m" \ + VcfToIntervalList -I ~{calling_interval_list_basename} -O intervals_from_gvcf.interval_list + INTERVALS="intervals_from_gvcf.interval_list" + else + INTERVALS="~{calling_interval_list}" + fi + + gatk --java-options "-Xms~{command_mem_mb}m -Xmx~{command_mem_mb}m" \ + ValidateVariants \ + -V ~{input_vcf} \ + -R ~{ref_fasta} \ + -L $INTERVALS \ + ~{true="-gvcf" false="" is_gvcf} \ + --validation-type-to-exclude ALLELES \ + ~{"--dbsnp " + dbsnp_vcf} \ + ~{extra_args} + } + runtime { + docker: docker_path + preemptible: preemptible_tries + memory: machine_mem_mb + " MiB" + bootDiskSizeGb: 15 + disks: "local-disk " + disk_size + " HDD" + } +} + +# Collect variant calling metrics from GVCF output +task CollectVariantCallingMetrics { + input { + File input_vcf + File input_vcf_index + String metrics_basename + File dbsnp_vcf + File dbsnp_vcf_index + File ref_dict + File evaluation_interval_list + Boolean is_gvcf = true + Int preemptible_tries + #Setting default docker value for workflows that haven't yet been azurized. + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + } + + Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB")) + 20 + + command { + java -Xms2000m -Xmx2500m -jar /usr/picard/picard.jar \ + CollectVariantCallingMetrics \ + INPUT=~{input_vcf} \ + OUTPUT=~{metrics_basename} \ + DBSNP=~{dbsnp_vcf} \ + SEQUENCE_DICTIONARY=~{ref_dict} \ + TARGET_INTERVALS=~{evaluation_interval_list} \ + ~{true="GVCF_INPUT=true" false="" is_gvcf} + } + runtime { + docker: docker + preemptible: preemptible_tries + memory: "3000 MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File summary_metrics = "~{metrics_basename}.variant_calling_summary_metrics" + File detail_metrics = "~{metrics_basename}.variant_calling_detail_metrics" + } +} diff --git a/TCapRNAPipeline/subworkflows/RNAWithUMIsPipeline.wdl b/TCapRNAPipeline/subworkflows/RNAWithUMIsPipeline.wdl new file mode 100644 index 0000000..2e8a973 --- /dev/null +++ b/TCapRNAPipeline/subworkflows/RNAWithUMIsPipeline.wdl @@ -0,0 +1,259 @@ +version 1.0 + +import "./UMIAwareDuplicateMarking.wdl" as UmiMD +import "./RNAWithUMIsTasks.wdl" as tasks + +## Copyright Broad Institute, 2021 +## +## This WDL pipeline implements data processing for RNA with UMIs +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). 
Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +workflow RNAWithUMIsPipeline { + + String pipeline_version = "1.0.16" + + input { + File? bam + File? r1_fastq + File? r2_fastq + String read1Structure + String read2Structure + String output_basename + + String platform + String library_name + String platform_unit + String read_group_name + String sequencing_center = "BI" + + File starIndex + File gtf + + File ref + File refIndex + File refDict + File refFlat + File ribosomalIntervals + File exonBedFile + + File population_vcf + File population_vcf_index + } + + parameter_meta { + bam: "Read group-specific unmapped BAM file; alternatively, paired-end FASTQ files (the `r1_fastq` and `r2_fastq` inputs) may be used" + r1_fastq: "Read 1 FASTQ file; alternatively, the unmapped bam file (`bam` input) may be used as input" + r2_fastq: "Read 2 FASTQ file; alternatively, the unmapped bam file (`bam` input) may be used as input" + read1Structure: "String describing how the bases in a sequencing run should be allocated into logical reads for read 1" + read2Structure: "String describing how the bases in a sequencing run should be allocated into logical reads for read 2" + starIndex: "TAR file containing genome indices used for the STAR aligner" + output_basename: "String used as a prefix in workflow output files" + gtf: "Gene annotation file (GTF) used for the rnaseqc tool" + platform: "String used to describe the sequencing platform" + library_name: "String used to describe the library" + platform_unit: "String used to describe the platform unit" + read_group_name: "String used to describe the read group name" + sequencing_center: "String used to describe the sequencing center; default is set to 'BI'" + ref: "FASTA file used for metric collection with Picard tools" + refIndex: "FASTA index file used for metric collection with Picard tools" + refDict: "Dictionary file used for metric collection with Picard tools" + refFlat: "refFlat file used for metric collection with Picard tools" + ribosomalIntervals: "Intervals file used for RNA metric collection with Picard tools" + exonBedFile: "Bed file used for fragment size calculations in the rnaseqc tool; contains non-overlapping exons" + population_vcf: "VCF file for contamination estimation; contains common SNP sites from population-wide studies like ExAC or gnomAD" + population_vcf_index: "Population VCF index file used for contamination estimation" + } + + call tasks.VerifyPipelineInputs { + input: + bam = bam, + r1_fastq = r1_fastq, + r2_fastq = r2_fastq + } + + if (VerifyPipelineInputs.fastq_run) { + call tasks.FastqToUbam { + input: + r1_fastq = select_first([r1_fastq]), + r2_fastq = select_first([r2_fastq]), + bam_filename = output_basename, + library_name = library_name, + platform = platform, + platform_unit = platform_unit, + read_group_name = read_group_name, + sequencing_center = sequencing_center + } + } + + File bam_to_use = select_first([bam, FastqToUbam.unmapped_bam]) + + call tasks.ExtractUMIs { + input: + bam = bam_to_use, + read1Structure = read1Structure, + read2Structure = read2Structure + } + + # Convert SAM to fastq for adapter clipping + # This step also removes reads that fail platform/vendor quality checks + call 
tasks.SamToFastq { + input: + bam = ExtractUMIs.bam_umis_extracted, + output_prefix = output_basename + } + + # Adapter clipping + call tasks.Fastp { + input: + fastq1 = SamToFastq.fastq1, + fastq2 = SamToFastq.fastq2, + output_prefix = output_basename + ".adapter_clipped" + } + + # Back to SAM before alignment + call tasks.FastqToUbam as FastqToUbamAfterClipping { + input: + r1_fastq = Fastp.fastq1_clipped, + r2_fastq = Fastp.fastq2_clipped, + bam_filename = output_basename + ".adapter_clipped", + library_name = library_name, + platform = platform, + platform_unit = platform_unit, + read_group_name = read_group_name, + sequencing_center = sequencing_center + } + + call tasks.FastQC { + input: + unmapped_bam = FastqToUbamAfterClipping.unmapped_bam + } + + call tasks.STAR { + input: + bam = FastqToUbamAfterClipping.unmapped_bam, + starIndex = starIndex + } + + call tasks.CopyReadGroupsToHeader { + input: + bam_with_readgroups = STAR.aligned_bam, + bam_without_readgroups = STAR.transcriptome_bam + } + + # Mark duplicate reads in the genome-aligned bam. The output is coordinate-sorted. + call UmiMD.UMIAwareDuplicateMarking { + input: + aligned_bam = STAR.aligned_bam, + unaligned_bam = ExtractUMIs.bam_umis_extracted, + output_basename = output_basename, + remove_duplicates = false, + coordinate_sort_output = true + } + + # Remove, rather than mark, duplicates reads from the transcriptome-aligned bam + # The output is query-name-sorted. + call UmiMD.UMIAwareDuplicateMarking as UMIAwareDuplicateMarkingTranscriptome { + input: + aligned_bam = CopyReadGroupsToHeader.output_bam, + unaligned_bam = ExtractUMIs.bam_umis_extracted, + output_basename = output_basename + ".transcriptome", + remove_duplicates = true, + coordinate_sort_output = false + } + + call tasks.PostprocessTranscriptomeForRSEM { + input: + prefix = output_basename + ".transcriptome", + input_bam = UMIAwareDuplicateMarkingTranscriptome.duplicate_marked_bam + } + + call tasks.GetSampleName { + input: + bam = bam_to_use + } + + call tasks.rnaseqc2 { + input: + bam_file = UMIAwareDuplicateMarking.duplicate_marked_bam, + genes_gtf = gtf, + sample_id = GetSampleName.sample_name, + exon_bed = exonBedFile + } + + call tasks.CollectRNASeqMetrics { + input: + input_bam = UMIAwareDuplicateMarking.duplicate_marked_bam, + input_bam_index = UMIAwareDuplicateMarking.duplicate_marked_bam_index, + output_bam_prefix = GetSampleName.sample_name, + ref_dict = refDict, + ref_fasta = ref, + ref_fasta_index = refIndex, + ref_flat = refFlat, + ribosomal_intervals = ribosomalIntervals, + } + + call tasks.CollectMultipleMetrics { + input: + input_bam = UMIAwareDuplicateMarking.duplicate_marked_bam, + input_bam_index = UMIAwareDuplicateMarking.duplicate_marked_bam_index, + output_bam_prefix = GetSampleName.sample_name, + ref_dict = refDict, + ref_fasta = ref, + ref_fasta_index = refIndex + } + + call tasks.CalculateContamination { + input: + bam = UMIAwareDuplicateMarking.duplicate_marked_bam, + bam_index = UMIAwareDuplicateMarking.duplicate_marked_bam_index, + base_name = GetSampleName.sample_name, + population_vcf = population_vcf, + population_vcf_index = population_vcf_index, + ref_dict = refDict, + ref_fasta = ref, + ref_fasta_index = refIndex + } + + output { + String sample_name = GetSampleName.sample_name + File transcriptome_bam = PostprocessTranscriptomeForRSEM.output_bam + File transcriptome_duplicate_metrics = UMIAwareDuplicateMarkingTranscriptome.duplicate_metrics + File output_bam = UMIAwareDuplicateMarking.duplicate_marked_bam + File 
output_bam_index = UMIAwareDuplicateMarking.duplicate_marked_bam_index + File duplicate_metrics = UMIAwareDuplicateMarking.duplicate_metrics + File rnaseqc2_gene_tpm = rnaseqc2.gene_tpm + File rnaseqc2_gene_counts = rnaseqc2.gene_counts + File rnaseqc2_exon_counts = rnaseqc2.exon_counts + File rnaseqc2_fragment_size_histogram = rnaseqc2.fragment_size_histogram + File rnaseqc2_metrics = rnaseqc2.metrics + File picard_rna_metrics = CollectRNASeqMetrics.rna_metrics + File picard_alignment_summary_metrics = CollectMultipleMetrics.alignment_summary_metrics + File picard_insert_size_metrics = CollectMultipleMetrics.insert_size_metrics + File picard_insert_size_histogram = CollectMultipleMetrics.insert_size_histogram + File picard_base_distribution_by_cycle_metrics = CollectMultipleMetrics.base_distribution_by_cycle_metrics + File picard_base_distribution_by_cycle_pdf = CollectMultipleMetrics.base_distribution_by_cycle_pdf + File picard_quality_by_cycle_metrics = CollectMultipleMetrics.quality_by_cycle_metrics + File picard_quality_by_cycle_pdf = CollectMultipleMetrics.quality_by_cycle_pdf + File picard_quality_distribution_metrics = CollectMultipleMetrics.quality_distribution_metrics + File picard_quality_distribution_pdf = CollectMultipleMetrics.quality_distribution_pdf + Float contamination = CalculateContamination.contamination + Float contamination_error = CalculateContamination.contamination_error + File fastqc_html_report = FastQC.fastqc_html + Float fastqc_percent_reads_with_adapter = FastQC.fastqc_percent_reads_with_adapter + } + + meta { + allowNestedInputs: true + } +} + diff --git a/TCapRNAPipeline/subworkflows/RNAWithUMIsTasks.wdl b/TCapRNAPipeline/subworkflows/RNAWithUMIsTasks.wdl new file mode 100644 index 0000000..0c5ee13 --- /dev/null +++ b/TCapRNAPipeline/subworkflows/RNAWithUMIsTasks.wdl @@ -0,0 +1,1039 @@ +version 1.0 + +task VerifyPipelineInputs { + meta { + description: "Verify that the pipeline input is either a ubam or pair of fastqs with additional info" + } + + input { + File? bam + File? r1_fastq + File? 
r2_fastq + + String docker = "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + Int cpu = 1 + Int memory_mb = 2000 + Int disk_size_gb = ceil(size(bam, "GiB") + size(r1_fastq,"GiB") + size(r2_fastq, "GiB")) + 10 + } + + command <<< + set -e + python3 <>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + Boolean fastq_run = read_boolean("output.txt") + } +} + +task ExtractUMIs { + input { + File bam + String read1Structure + String read2Structure + + String docker = "us.gcr.io/broad-gotc-prod/fgbio:1.0.0-1.4.0-1638817487" + Int cpu = 4 + Int memory_mb = 5000 + Int disk_size_gb = ceil(2.2 * size(bam, "GiB")) + 20 + } + + command <<< + java -jar /usr/gitc/fgbio.jar ExtractUmisFromBam \ + --input ~{bam} \ + --read-structure ~{read1Structure} \ + --read-structure ~{read2Structure} \ + --molecular-index-tags RX \ + --output extractUMIs.out.bam + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + preemptible: 0 + } + + output { + File bam_umis_extracted = "extractUMIs.out.bam" + } +} + +# Adapter clipping +# https://github.com/OpenGene/fastp +task Fastp { + input { + File fastq1 + File fastq2 + String output_prefix + File adapter_fasta = "gs://gcp-public-data--broad-references/RNA/resources/Illumina_adapters.fasta" + + String docker = "us.gcr.io/broad-gotc-prod/fastp:1.0.0-0.20.1-1649253500" + Int memory_mb = ceil(1.5*size(fastq1, "MiB")) + 8192 # Experimentally determined formula for memory allocation + Int disk_size_gb = 5*ceil(size(fastq1, "GiB")) + 128 + File monitoring_script = "gs://broad-dsde-methods-monitoring/cromwell_monitoring_script.sh" + Int cpu=4 + } + + command { + bash ~{monitoring_script} > monitoring.log & + + fastp --in1 ~{fastq1} --in2 ~{fastq2} --out1 ~{output_prefix}_read1.fastq.gz --out2 ~{output_prefix}_read2.fastq.gz \ + --disable_quality_filtering \ + --disable_length_filtering \ + --adapter_fasta ~{adapter_fasta} \ + --thread ~{cpu} + } + + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + cpu: cpu + disks: "local-disk ~{disk_size_gb} HDD" + preemptible: 0 + maxRetries: 2 + } + + output { + File monitoring_log = "monitoring.log" + File fastq1_clipped = output_prefix + "_read1.fastq.gz" + File fastq2_clipped = output_prefix + "_read2.fastq.gz" + } + +} + +task STAR { + input { + File bam + File starIndex + + String docker = "us.gcr.io/broad-gotc-prod/samtools-star:1.0.0-1.11-2.7.10a-1642556627" + Int cpu = 8 + Int memory_mb = ceil((size(starIndex, "GiB")) + 10) * 1500 + Int disk_size_gb = ceil(2.2 * size(bam, "GiB") + size(starIndex, "GiB")) + 150 + } + + command <<< + echo $(date +"[%b %d %H:%M:%S] Extracting STAR index") + mkdir star_index + tar -xvf ~{starIndex} -C star_index --strip-components=1 + + STAR \ + --runMode alignReads \ + --runThreadN ~{cpu} \ + --genomeDir star_index \ + --outSAMtype BAM Unsorted \ + --readFilesIn ~{bam} \ + --readFilesType SAM PE \ + --readFilesCommand samtools view -h \ + --runRNGseed 12345 \ + --outSAMunmapped Within \ + --outFilterType BySJout \ + --outFilterMultimapNmax 20 \ + --outFilterScoreMinOverLread 0.33 \ + --outFilterMatchNminOverLread 0.33 \ + --outFilterMismatchNmax 999 \ + --outFilterMismatchNoverLmax 0.1 \ + --alignIntronMin 20 \ + --alignIntronMax 1000000 \ + --alignMatesGapMax 1000000 \ + --alignSJoverhangMin 8 \ + --alignSJDBoverhangMin 1 \ + --alignSoftClipAtReferenceEnds Yes \ + --chimSegmentMin 15 \ + --chimMainSegmentMultNmax 1 \ + --chimOutType 
WithinBAM SoftClip \ + --chimOutJunctionFormat 0 \ + --twopassMode Basic \ + --quantMode TranscriptomeSAM \ + --quantTranscriptomeBan IndelSoftclipSingleend \ + --alignEndsProtrude 20 ConcordantPair + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + preemptible: 0 + } + + output { + File aligned_bam = "Aligned.out.bam" + File transcriptome_bam = "Aligned.toTranscriptome.out.bam" + } +} + +task FastqToUbam { + input { + File r1_fastq + File r2_fastq + String bam_filename + String library_name + String platform + String platform_unit + String read_group_name + String sequencing_center + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:3.0.0" + Int cpu = 1 + Int memory_mb = 4000 + Int disk_size_gb = ceil(size(r1_fastq, "GiB")*2.2 + size(r2_fastq, "GiB")*2.2) + 50 + } + + String unmapped_bam_output_name = bam_filename + ".u.bam" + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar FastqToSam \ + SORT_ORDER=unsorted \ + F1=~{r1_fastq}\ + F2=~{r2_fastq} \ + SM="~{bam_filename}" \ + LB="~{library_name}" \ + PL="~{platform}" \ + PU="~{platform_unit}" \ + RG="~{read_group_name}" \ + CN="~{sequencing_center}" \ + O="~{unmapped_bam_output_name}" \ + ALLOW_EMPTY_FASTQ=True + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File unmapped_bam = unmapped_bam_output_name + } +} + +task CopyReadGroupsToHeader { + input { + File bam_with_readgroups + File bam_without_readgroups + + String docker = "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + Int cpu = 1 + Int memory_mb = 2500 + Int disk_size_gb = ceil(2.0 * size([bam_with_readgroups, bam_without_readgroups], "GiB")) + 10 + } + + String basename = basename(bam_without_readgroups) + + command <<< + samtools view -H ~{bam_without_readgroups} > header.sam + samtools view -H ~{bam_with_readgroups} | grep "@RG" >> header.sam + samtools reheader header.sam ~{bam_without_readgroups} > ~{basename} + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File output_bam = basename + } +} + +task GetSampleName { + input { + File bam + + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int cpu = 1 + Int memory_mb = 1000 + Int disk_size_gb = ceil(2.0 * size(bam, "GiB")) + 10 + } + + parameter_meta { + bam: { + localization_optional: true + } + } + + command <<< + gatk GetSampleName -I ~{bam} -O sample_name.txt + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + String sample_name = read_string("sample_name.txt") + } +} + +task rnaseqc2 { + input { + File bam_file + File genes_gtf + String sample_id + File exon_bed + + String docker = "us.gcr.io/broad-dsde-methods/ckachulis/rnaseqc@sha256:a4bec726bb51df5fb8c8f640f7dec22fa28c8f7803ef9994b8ec39619b4514ca" + Int cpu = 1 + Int memory_mb = 8000 + Int disk_size_gb = ceil(size(bam_file, 'GiB') + size(genes_gtf, 'GiB') + size(exon_bed, 'GiB')) + 50 + } + + command <<< + set -euo pipefail + # force fragmentSizes histogram output file to exist (even if empty) + touch ~{sample_id}.fragmentSizes.txt + echo $(date +"[%b %d %H:%M:%S] Running RNA-SeQC 2") + rnaseqc ~{genes_gtf} ~{bam_file} . 
-s ~{sample_id} -v --bed ~{exon_bed} + echo " * compressing outputs" + gzip *.gct + echo $(date +"[%b %d %H:%M:%S] done") + >>> + + output { + File gene_tpm = "~{sample_id}.gene_tpm.gct.gz" + File gene_counts = "~{sample_id}.gene_reads.gct.gz" + File exon_counts = "~{sample_id}.exon_reads.gct.gz" + File fragment_size_histogram = "~{sample_id}.fragmentSizes.txt" + File metrics = "~{sample_id}.metrics.tsv" + } + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + maxRetries: 2 + } +} + +task CollectRNASeqMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + File ref_flat + File ribosomal_intervals + + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int cpu = 1 + Int memory_mb = 7500 + Int disk_size_gb = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB")) + 20 + } + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar CollectRnaSeqMetrics \ + REF_FLAT=~{ref_flat} \ + RIBOSOMAL_INTERVALS= ~{ribosomal_intervals} \ + STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_prefix}.rna_metrics + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File rna_metrics = output_bam_prefix + ".rna_metrics" + } +} + +task CollectMultipleMetrics { + input { + File input_bam + File input_bam_index + String output_bam_prefix + File ref_dict + File ref_fasta + File ref_fasta_index + + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:3.0.0" + Int cpu = 1 + Int memory_mb = 7500 + Int disk_size_gb = ceil(size(input_bam, "GiB") + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB")) + 20 + } + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + #plots will not be produced if there are no reads + touch ~{output_bam_prefix}.insert_size_histogram.pdf + touch ~{output_bam_prefix}.insert_size_metrics + touch ~{output_bam_prefix}.base_distribution_by_cycle.pdf + touch ~{output_bam_prefix}.quality_by_cycle.pdf + touch ~{output_bam_prefix}.quality_distribution.pdf + + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar CollectMultipleMetrics \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_prefix} \ + REFERENCE_SEQUENCE=~{ref_fasta} + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File alignment_summary_metrics = output_bam_prefix + ".alignment_summary_metrics" + File insert_size_metrics = output_bam_prefix + ".insert_size_metrics" + File insert_size_histogram = output_bam_prefix + ".insert_size_histogram.pdf" + File base_distribution_by_cycle_metrics = output_bam_prefix + ".base_distribution_by_cycle_metrics" + File base_distribution_by_cycle_pdf = output_bam_prefix + ".base_distribution_by_cycle.pdf" + File quality_by_cycle_metrics = output_bam_prefix + ".quality_by_cycle_metrics" + File quality_by_cycle_pdf = output_bam_prefix + ".quality_by_cycle.pdf" + File quality_distribution_metrics = output_bam_prefix + ".quality_distribution_metrics" + File quality_distribution_pdf = output_bam_prefix + ".quality_distribution.pdf" + } +} + +task MergeMetrics { + input { + File 
alignment_summary_metrics + File insert_size_metrics + File picard_rna_metrics + File duplicate_metrics + File rnaseqc2_metrics + File? fingerprint_summary_metrics + String output_basename + + String docker = "python:3.8-slim" + Int cpu = 1 + Int memory_mb = 3000 + Int disk_size_gb = 10 + } + + String out_filename = output_basename + ".unified_metrics.txt" + + command <<< + + # + # Script transpose a two line TSV + # + cat <<-'EOF' > transpose.py + import csv, sys + + rows = list(csv.reader(sys.stdin, delimiter='\t')) + + for col in range(0, len(rows[0])): + key = rows[0][col].lower() + value = rows[1][col] + if value == "?": + value = "NaN" + if key in ["median_insert_size", "median_absolute_deviation", "median_read_length", "mad_read_length", "pf_hq_median_mismatches"]: + value = str(int(float(value))) + print(f"{key}\t{value}") + EOF + + # + # Script clean the keys, replacing space, dash and forward-slash with underscores, + # and removing comma, single quote and periods + # + cat <<-'EOF' > clean.py + import sys + + for line in sys.stdin: + (k,v) = line.strip().lower().split("\t") + transtable = k.maketrans({' ':'_', '-':'_', '/':'_', ',':None, '\'':None, '.' : None}) + print(f"{k.translate(transtable)}\t{v}") + EOF + + # Process each metric file, transposing and cleaning if necessary, and pre-pending a source to the metric name + + echo "Processing Alignment Summary Metrics - Only PAIR line" + cat ~{alignment_summary_metrics} | egrep "(CATEGORY|^PAIR)" | python transpose.py | grep -Eiv "(SAMPLE|LIBRARY|READ_GROUP)" | awk '{print "picard_" $0}' >> ~{out_filename} + + echo "Processing Insert Size Metrics - removing various WIDTH metrics" + cat ~{insert_size_metrics} | grep -A 1 "MEDIAN_INSERT_SIZE" | python transpose.py | grep -Eiv "(SAMPLE|LIBRARY|READ_GROUP|WIDTH)" | awk '{print "picard_" $0}' >> ~{out_filename} + + echo "Processing Picard RNA Metrics" + cat ~{picard_rna_metrics} | grep -A 1 "RIBOSOMAL_BASES" | python transpose.py | grep -Eiv "(SAMPLE|LIBRARY|READ_GROUP)" | awk '{print "picard_rna_metrics_" $0}' >> ~{out_filename} + + echo "Processing Duplicate Metrics" + cat ~{duplicate_metrics} | grep -A 1 "READ_PAIR_DUPLICATES" | python transpose.py | awk '{print "picard_" $0}' >> ~{out_filename} + + echo "Processing RNASeQC2 Metrics" + cat ~{rnaseqc2_metrics} | python clean.py | awk '{print "rnaseqc2_" $0}' >> ~{out_filename} + + if [[ -f "~{fingerprint_summary_metrics}" ]]; + then + echo "Processing Fingerprint Summary Metrics - only extracting LOD_EXPECTED_SAMPLE" + cat ~{fingerprint_summary_metrics} | grep -A 1 "LOD_EXPECTED_SAMPLE" | python transpose.py | grep -i "LOD_EXPECTED_SAMPLE" | awk '{print "fp_"$0}' >> ~{out_filename} + else + echo "No Fingerprint Summary Metrics found." + echo "fp_lod_expected_sample " >> ~{out_filename} + fi >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File unified_metrics = out_filename + } +} + +task SortSamByCoordinate { + input { + File input_bam + String output_bam_basename + + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs + # more disk space. 
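# Illustrative example (not part of the pipeline) of what the transpose.py helper defined in the
# MergeMetrics task above does to a two-line Picard metrics block; the metric names and values
# below are made up. The surrounding awk in that task then prepends a source prefix such as
# "picard_" to each key.
import csv, io

picard_block = "MEDIAN_INSERT_SIZE\tMEAN_INSERT_SIZE\tPCT_CHIMERAS\n250.5\t?\t0.01\n"
rows = list(csv.reader(io.StringIO(picard_block), delimiter="\t"))
for col in range(len(rows[0])):
    key = rows[0][col].lower()
    value = rows[1][col]
    if value == "?":
        value = "NaN"                      # undefined metrics become NaN
    if key in ["median_insert_size", "median_absolute_deviation", "median_read_length",
               "mad_read_length", "pf_hq_median_mismatches"]:
        value = str(int(float(value)))     # median-type metrics are reported as integers
    print(f"{key}\t{value}")
# Prints:
#   median_insert_size    250
#   mean_insert_size      NaN
#   pct_chimeras          0.01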
Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier + Float sort_sam_disk_multiplier = 4.0 + + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int cpu = 1 + Int memory_mb = 7500 + Int disk_size_gb = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 + } + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar SortSam \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_basename}.bam \ + SORT_ORDER="coordinate" \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true \ + MAX_RECORDS_IN_RAM=300000 + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File output_bam = "~{output_bam_basename}.bam" + File output_bam_index = "~{output_bam_basename}.bai" + File output_bam_md5 = "~{output_bam_basename}.bam.md5" + } +} + +task SortSamByQueryName { + input { + File input_bam + String output_bam_basename + + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs + # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier + Float sort_sam_disk_multiplier = 6.0 + + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int cpu = 1 + Int memory_mb = 7500 + Int disk_size_gb = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 + } + + Int java_memory_size = memory_mb - 1000 + Int max_heap = memory_mb - 500 + + command <<< + java -Xms~{java_memory_size}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar SortSam \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_basename}.bam \ + SORT_ORDER="queryname" \ + CREATE_MD5_FILE=true \ + MAX_RECORDS_IN_RAM=300000 + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File output_bam = "~{output_bam_basename}.bam" + File output_bam_md5 = "~{output_bam_basename}.bam.md5" + } +} + +task GroupByUMIs { + input { + File bam + File bam_index + String output_bam_basename + + String docker = "us.gcr.io/broad-gotc-prod/umi_tools:1.0.0-1.1.1-1690198330" + Int cpu = 2 + Int memory_mb = 64000 + Int disk_size_gb = ceil(2.2 * size([bam, bam_index], "GiB")) + 100 + + File monitoring_script = "gs://broad-dsde-methods-monitoring/cromwell_monitoring_script.sh" + } + + command <<< + bash ~{monitoring_script} > monitoring.log & + + # umi_tools has a bug which lead to using the order of elements in a set to determine tie-breakers in + # rare edge-cases. Sets in python are unordered, so this leads to non-determinism. Setting PYTHONHASHSEED + # to 0 means that hashes will be unsalted. 
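# Self-contained illustration (not pipeline code) of the non-determinism discussed in the comment
# above: iteration order over a set of strings depends on CPython's hash salting, so identical
# invocations can enumerate UMIs differently unless PYTHONHASHSEED is pinned.
import os, subprocess, sys

snippet = "print(list({'ACGT-TTA', 'ACGT-TTC', 'ACGA-TTA'}))"

def run_once(extra_env):
    env = dict(os.environ, **extra_env)
    return subprocess.run([sys.executable, "-c", snippet],
                          env=env, capture_output=True, text=True).stdout.strip()

print(run_once({}), run_once({}))  # order may differ between these two runs
print(run_once({"PYTHONHASHSEED": "0"}), run_once({"PYTHONHASHSEED": "0"}))  # repeatable in practice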
While it is not in any way gauranteed by the language that this + # will remove the non-determinism, in practice, due to implementation details of sets in cpython, this makes seemingly + # deterministic behavior much more likely + + export PYTHONHASHSEED=0 + + umi_tools group -I ~{bam} --paired --no-sort-output --output-bam --stdout ~{output_bam_basename}.bam --umi-tag-delimiter "-" \ + --extract-umi-method tag --umi-tag RX --unmapped-reads use + >>> + + output { + File grouped_bam = "~{output_bam_basename}.bam" + File monitoring_log = "monitoring.log" + } + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + maxRetries: 1 + } +} + +task MarkDuplicatesUMIAware { + input { + File bam + String output_basename + Boolean remove_duplicates + Boolean use_umi + + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int cpu = 1 + Int memory_mb = 16000 + Int disk_size_gb = ceil(3 * size(bam, "GiB")) + 60 + } + + String output_bam_basename = output_basename + ".duplicate_marked" + + command <<< + java -jar /usr/picard/picard.jar MarkDuplicates \ + INPUT=~{bam} \ + OUTPUT=~{output_bam_basename}.bam \ + METRICS_FILE=~{output_basename}.duplicate.metrics \ + REMOVE_DUPLICATES=~{remove_duplicates} \ + ~{true='READ_ONE_BARCODE_TAG=BX' false='' use_umi} \ + + >>> + + output { + File duplicate_marked_bam = "~{output_bam_basename}.bam" + File duplicate_metrics = "~{output_basename}.duplicate.metrics" + } + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } +} + +task formatPipelineOutputs { + input { + String sample_id + String transcriptome_bam + String transcriptome_duplicate_metrics + String output_bam + String output_bam_index + String duplicate_metrics + String rnaseqc2_gene_tpm + String rnaseqc2_gene_counts + String rnaseqc2_exon_counts + String rnaseqc2_fragment_size_histogram + String rnaseqc2_metrics + String picard_rna_metrics + String picard_alignment_summary_metrics + String picard_insert_size_metrics + String picard_insert_size_histogram + String picard_base_distribution_by_cycle_metrics + String picard_base_distribution_by_cycle_pdf + String picard_quality_by_cycle_metrics + String picard_quality_by_cycle_pdf + String picard_quality_distribution_metrics + String picard_quality_distribution_pdf + String? picard_fingerprint_summary_metrics + String? 
picard_fingerprint_detail_metrics + File unified_metrics + Float contamination + Float contamination_error + String fastqc_html_report + String fastqc_percent_reads_with_adapter + + Int cpu = 1 + Int memory_mb = 2000 + Int disk_size_gb = 10 + } + + String outputs_json_file_name = "outputs_to_TDR_~{sample_id}.json" + + command <<< + python3 << CODE + import json + + outputs_dict = {} + + # NOTE: we rename some field names to match the TDR schema + outputs_dict["sample_id"]="~{sample_id}" # primary key + outputs_dict["transcriptome_bam"]="~{transcriptome_bam}" + outputs_dict["transcriptome_duplicate_metrics_file"]="~{transcriptome_duplicate_metrics}" + outputs_dict["genome_bam"]="~{output_bam}" + outputs_dict["genome_bam_index"]="~{output_bam_index}" + outputs_dict["picard_duplicate_metrics_file"]="~{duplicate_metrics}" + outputs_dict["rnaseqc2_gene_tpm_file"]="~{rnaseqc2_gene_tpm}" + outputs_dict["rnaseqc2_gene_counts_file"]="~{rnaseqc2_gene_counts}" + outputs_dict["rnaseqc2_exon_counts_file"]="~{rnaseqc2_exon_counts}" + outputs_dict["rnaseqc2_fragment_size_histogram_file"]="~{rnaseqc2_fragment_size_histogram}" + outputs_dict["rnaseqc2_metrics_file"]="~{rnaseqc2_metrics}" + outputs_dict["picard_rna_metrics_file"]="~{picard_rna_metrics}" + outputs_dict["picard_alignment_summary_metrics_file"]="~{picard_alignment_summary_metrics}" + outputs_dict["picard_insert_size_metrics_file"]="~{picard_insert_size_metrics}" + outputs_dict["picard_insert_size_histogram_file"]="~{picard_insert_size_histogram}" + outputs_dict["picard_base_distribution_by_cycle_metrics_file"]="~{picard_base_distribution_by_cycle_metrics}" + outputs_dict["picard_base_distribution_by_cycle_pdf_file"]="~{picard_base_distribution_by_cycle_pdf}" + outputs_dict["picard_quality_by_cycle_metrics_file"]="~{picard_quality_by_cycle_metrics}" + outputs_dict["picard_quality_by_cycle_pdf_file"]="~{picard_quality_by_cycle_pdf}" + outputs_dict["picard_quality_distribution_metrics_file"]="~{picard_quality_distribution_metrics}" + outputs_dict["picard_quality_distribution_pdf_file"]="~{picard_quality_distribution_pdf}" + outputs_dict["fp_summary_metrics_file"]="~{picard_fingerprint_summary_metrics}" + outputs_dict["fp_detail_metrics_file"]="~{picard_fingerprint_detail_metrics}" + outputs_dict["fastqc_html_report"]="~{fastqc_html_report}" + outputs_dict["fastqc_percent_reads_with_adapter"]="~{fastqc_percent_reads_with_adapter}" + outputs_dict["contamination"]="~{contamination}" + outputs_dict["contamination_error"]="~{contamination_error}" + + # explode unified metrics file + with open("~{unified_metrics}", "r") as infile: + for row in infile: + key, value = row.rstrip("\n").split("\t") + outputs_dict[key] = value + + # write full outputs to file + with open("~{outputs_json_file_name}", 'w') as outputs_file: + for key, value in outputs_dict.items(): + if value == "None" or value == "": + outputs_dict[key] = None + outputs_file.write(json.dumps(outputs_dict)) + outputs_file.write("\n") + CODE + >>> + + runtime { + docker: "broadinstitute/horsefish:tdr_import_v1.4" + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File pipeline_outputs_json = outputs_json_file_name + } +} + +task updateOutputsInTDR { + input { + String tdr_dataset_uuid + File outputs_json + + Int cpu = 1 + Int memory_mb = 2000 + Int disk_size_gb = 10 + } + + command <<< + # input args: + # -d dataset uuid + # -t target table in dataset + # -o json of data to ingest + # -f field to populate with timestamp at ingest (can have 
multiple) + python -u /scripts/export_pipeline_outputs_to_tdr.py \ + -d "~{tdr_dataset_uuid}" \ + -t "sample" \ + -o "~{outputs_json}" \ + -f "version_timestamp" \ + -f "analysis_end_time" + >>> + + runtime { + docker: "broadinstitute/horsefish:tdr_import_v1.4" + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File ingest_logs = stdout() + } +} + +# GATK CalculateContamination, adapted for RNA-seq data. +# Specifically, we disable two read filters from the default set of read filters +# for a LocusWalker: +# 1. WellformedReadFilter: This filter removes (among others) reads with N's in the CIGAR string, which is +# common in RNA data. +# 2. MappingQualityAvailableReadFilter: This filter removes reads with MQ=255, which by SAM spec +# means mapping quality is missing. But STAR uses 255 to mean unique mapping, the equivalent of MQ60 +# for other aligners. +task CalculateContamination { + input { + File bam + File bam_index + String base_name + File ref_fasta + File ref_dict + File ref_fasta_index + File population_vcf + File population_vcf_index + # runtime + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int cpu = 1 + Int memory_mb = 8192 + Int disk_size_gb = 256 + } + + parameter_meta { + bam: { localization_optional: true } + bam_index: { localization_optional: true } + ref_fasta: { localization_optional: true } + ref_fasta_index: { localization_optional: true } + ref_dict: { localization_optional: true } + } + + command <<< + set -e + gatk --java-options "-Xmx4096m" GetPileupSummaries \ + -R ~{ref_fasta} \ + -I ~{bam} \ + -V ~{population_vcf} \ + -L ~{population_vcf} \ + -O ~{base_name}_pileups.tsv \ + --disable-read-filter WellformedReadFilter \ + --disable-read-filter MappingQualityAvailableReadFilter + + gatk --java-options "-Xmx4096m" CalculateContamination \ + -I ~{base_name}_pileups.tsv \ + -O ~{base_name}_contamination.tsv + + grep -v ^sample ~{base_name}_contamination.tsv | awk 'BEGIN{FS="\t"}{print($2)}' > contamination.txt + grep -v ^sample ~{base_name}_contamination.tsv | awk 'BEGIN{FS="\t"}{print($3)}' > contamination_error.txt + >>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File pileups = "~{base_name}_pileups.tsv" + File contamination_table = "~{base_name}_contamination.tsv" + Float contamination = read_float("contamination.txt") + Float contamination_error = read_float("contamination_error.txt") + } +} + +task SamToFastq { + input { + File bam + String output_prefix + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.11" + Int memory_mb = 4096 + Int disk_size_gb = 3*ceil(size(bam, "GiB")) + 128 + } + + command { + java -jar /usr/picard/picard.jar SamToFastq \ + I=~{bam} \ + FASTQ=~{output_prefix}_1.fastq.gz \ + SECOND_END_FASTQ=~{output_prefix}_2.fastq.gz + + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File fastq1 = output_prefix + "_1.fastq.gz" + File fastq2 = output_prefix + "_2.fastq.gz" + } +} + +task FastQC { + input { + File unmapped_bam + Float? 
mem = 4 + String docker = "us.gcr.io/tag-public/tag-tools:1.0.0" # sato: This likely needs to be made public + Int memory_mb = 4096 + Int disk_size_gb = 3*ceil(size(unmapped_bam, "GiB")) + 128 + } + + String bam_basename = basename(unmapped_bam, ".bam") + + command { + perl /usr/tag/scripts/FastQC/fastqc ~{unmapped_bam} --extract -o ./ + mv ~{bam_basename}_fastqc/fastqc_data.txt ~{bam_basename}_fastqc_data.txt + + tail -n 2 ~{bam_basename}_fastqc_data.txt | head -n 1 | cut -f 2 > ~{bam_basename}_adapter_content.txt + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + File fastqc_data = "~{bam_basename}_fastqc_data.txt" + File fastqc_html = "~{bam_basename}_fastqc.html" + Float fastqc_percent_reads_with_adapter = if read_string("~{bam_basename}_adapter_content.txt") == "warn" then -1 else read_float("~{bam_basename}_adapter_content.txt") + } +} + +task TransferReadTags { + input { + File aligned_bam + File ubam + String output_basename + String docker = "us.gcr.io/broad-gatk/gatk:4.4.0.0" + Int memory_mb = 16000 + Int disk_size_gb = ceil(2 * size(aligned_bam, "GiB")) + ceil(2 * size(ubam, "GiB")) + 128 + } + + command <<< + gatk TransferReadTags \ + -I ~{aligned_bam} \ + --unmapped-sam ~{ubam} \ + -O ~{output_basename}.bam \ + --read-tags RX + >>> + + output { + File output_bam = "~{output_basename}.bam" + } + + runtime { + docker: docker + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } +} + +task PostprocessTranscriptomeForRSEM { + input { + String prefix + File input_bam # the input must be queryname sorted + Int disk_size_gb = ceil(3*size(input_bam,"GiB")) + 128 + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" + Int memory_mb = 16000 + } + + command { + gatk PostProcessReadsForRSEM \ + -I ~{input_bam} \ + -O ~{prefix}_RSEM_post_processed.bam \ + --use-jdk-deflater + } + + output { + File output_bam = "~{prefix}_RSEM_post_processed.bam" + } + + runtime { + docker: docker + disks: "local-disk ~{disk_size_gb} HDD" + memory: "${memory_mb} MiB" + } +} + +task CreateEmptyFile { + input { + Int disk_size_gb = 128 + String docker = "us.gcr.io/broad-gatk/gatk:4.2.6.0" + Int memory_mb = 4096 + } + + command { + echo "place holder for a bam index file" > empty.txt + } + + output { + File empty_file = "empty.txt" + } + + runtime { + docker: docker + disks: "local-disk ~{disk_size_gb} HDD" + memory: "${memory_mb} MiB" + } +} \ No newline at end of file diff --git a/TCapRNAPipeline/subworkflows/UMIAwareDuplicateMarking.wdl b/TCapRNAPipeline/subworkflows/UMIAwareDuplicateMarking.wdl new file mode 100644 index 0000000..c4802da --- /dev/null +++ b/TCapRNAPipeline/subworkflows/UMIAwareDuplicateMarking.wdl @@ -0,0 +1,112 @@ +version 1.0 + +import "./RNAWithUMIsTasks.wdl" as tasks + +## Copyright Broad Institute, 2021 +## +## This WDL pipeline implements UMI Aware Duplicate Marking +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. 
Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +workflow UMIAwareDuplicateMarking { + + input { + File aligned_bam + File unaligned_bam + String output_basename + Boolean remove_duplicates + Boolean coordinate_sort_output + } + + parameter_meta { + aligned_bam: "Unsorted aligned bam (the output of STAR in multithread mode is not query-name sorted)" + unaligned_bam: "Query-name sorted unaligned bam; contains UMIs in the RX tag" + output_basename: "Basename for file outputs from this workflow" + remove_duplicates: "If true, remove (rather than mark) duplicate reads from the output" + coordinate_sort: "If true, the output bam will be coordinate sorted. Else it will be query-name sorted." + } + + call tasks.SortSamByQueryName as SortSamByQueryNameAfterAlignment { + input: + input_bam = aligned_bam, + output_bam_basename = output_basename + ".queryname_sorted" + } + + # It appears we cannot assume that the unmapped bam/fastqs will be sorted + call tasks.SortSamByQueryName as SortSamByQueryNameUnmapped { + input: + input_bam = unaligned_bam, + output_bam_basename = output_basename + ".u.queryname_sorted" + } + + call tasks.TransferReadTags { + input: + aligned_bam = SortSamByQueryNameAfterAlignment.output_bam, + ubam = SortSamByQueryNameUnmapped.output_bam, + output_basename = output_basename + ".queryname_sorted_with_RX" + } + + # First sort the aligned bam by coordinate, so we can group duplicate sets using UMIs in the next step. + call tasks.SortSamByCoordinate as SortSamByCoordinateFirstPass { + input: + input_bam = TransferReadTags.output_bam, + output_bam_basename = output_basename + ".STAR_aligned.coordinate_sorted" + } + + # Further divide each duplicate set (a set of reads with the same insert start and end coordinates) + # into subsets that share the same UMIs i.e. differenciate PCR duplicates from biological duplicates. + # (biological duplicates are independent DNA molecules that are sheared such that the inserts are indistinguishable.) + # input: a coordinate sorted bam + # output: a coordinate sorted bam with UMIs (what are the generated tags?) . + + call tasks.GroupByUMIs { + input: + bam = SortSamByCoordinateFirstPass.output_bam, + bam_index = SortSamByCoordinateFirstPass.output_bam_index, + output_bam_basename = output_basename + ".grouped_by_UMI" + } + + call tasks.SortSamByQueryName as SortSamByQueryNameBeforeDuplicateMarking { + input: + input_bam = GroupByUMIs.grouped_bam, + output_bam_basename = output_basename + ".grouped.queryname_sorted" + } + + call tasks.MarkDuplicatesUMIAware as MarkDuplicates { + input: + bam = SortSamByQueryNameBeforeDuplicateMarking.output_bam, + output_basename = output_basename, + remove_duplicates = remove_duplicates, + use_umi = true + } + + if (coordinate_sort_output){ + call tasks.SortSamByCoordinate as SortSamByCoordinateSecondPass { + input: + input_bam = MarkDuplicates.duplicate_marked_bam, + output_bam_basename = output_basename + ".duplicate_marked.coordinate_sorted" + } + } + + # We won't have the index file if the output is query-name sorted. 
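# Conceptual sketch (not the actual fgbio/umi_tools/Picard implementation used by this workflow)
# of the duplicate-marking idea described above: reads sharing insert coordinates form a duplicate
# set, and only reads that also share a UMI are treated as PCR duplicates of one another.
from collections import defaultdict

def mark_duplicates_umi_aware(reads):
    """reads: iterable of dicts with 'name', 'chrom', 'insert_start', 'insert_end' and 'umi'."""
    groups = defaultdict(list)
    for read in reads:
        # Subdivide each positional duplicate set by UMI.
        key = (read["chrom"], read["insert_start"], read["insert_end"], read["umi"])
        groups[key].append(read)
    duplicates = set()
    for members in groups.values():
        # Keep one representative per (coordinates, UMI) subset; the rest are PCR duplicates.
        duplicates.update(read["name"] for read in members[1:])
    return duplicates

reads = [
    {"name": "r1", "chrom": "chr1", "insert_start": 100, "insert_end": 350, "umi": "ACGT-TTA"},
    {"name": "r2", "chrom": "chr1", "insert_start": 100, "insert_end": 350, "umi": "ACGT-TTA"},
    {"name": "r3", "chrom": "chr1", "insert_start": 100, "insert_end": 350, "umi": "GGCA-TTA"},
]
print(mark_duplicates_umi_aware(reads))  # {'r2'}; r3 is a biological duplicate and is kept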
+ # Until we remove the transcriptome index from the TDR schema, + # output a placeholder text file as the bam index as a temporary fix + if (!coordinate_sort_output){ + call tasks.CreateEmptyFile {} + } + + output { + File duplicate_marked_bam = select_first([SortSamByCoordinateSecondPass.output_bam, MarkDuplicates.duplicate_marked_bam]) + File duplicate_marked_bam_index = select_first([SortSamByCoordinateSecondPass.output_bam_index, CreateEmptyFile.empty_file]) + File duplicate_metrics = MarkDuplicates.duplicate_metrics + } +} diff --git a/TCapRNAPipeline/subworkflows/Utilities.wdl b/TCapRNAPipeline/subworkflows/Utilities.wdl new file mode 100644 index 0000000..e6a1aee --- /dev/null +++ b/TCapRNAPipeline/subworkflows/Utilities.wdl @@ -0,0 +1,303 @@ +version 1.0 + +## Copyright Broad Institute, 2018 +## +## This WDL defines utility tasks used for processing of sequencing data. +## +## Runtime parameters are often optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# Generate sets of intervals for scatter-gathering over chromosomes +task CreateSequenceGroupingTSV { + input { + File ref_dict + Int preemptible_tries + } + # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. + # It outputs to stdout where it is parsed into a wdl Array[Array[String]] + # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] + command <<< + python3 <>> + runtime { + preemptible: preemptible_tries + docker: "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + memory: "2 GiB" + } + output { + Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") + Array[Array[String]] sequence_grouping_with_unmapped = read_tsv("sequence_grouping_with_unmapped.txt") + } +} + +# This task calls picard's IntervalListTools to scatter the input interval list into scatter_count sub interval lists +# Note that the number of sub interval lists may not be exactly equal to scatter_count. There may be slightly more or less. +# Thus we have the block of python to count the number of generated sub interval lists. +task ScatterIntervalList { + input { + File interval_list + Int scatter_count + Int break_bands_at_multiples_of + #Setting default docker value for workflows that haven't yet been azurized. 
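# The inline python of ScatterIntervalList is not shown in this diff; a minimal sketch, assuming
# the counting step works the way the comment above and the read_int(stdout()) output imply:
# count the interval lists that IntervalListTools wrote under out/ and print that count.
import glob

interval_lists = glob.glob("out/*/*.interval_list")  # same pattern as the task's glob() output
print(len(interval_lists))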
+ String docker = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" + } + + command <<< + set -e + mkdir out + java -Xms1000m -Xmx1500m -jar /usr/gitc/picard.jar \ + IntervalListTools \ + SCATTER_COUNT=~{scatter_count} \ + SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ + UNIQUE=true \ + SORT=true \ + BREAK_BANDS_AT_MULTIPLES_OF=~{break_bands_at_multiples_of} \ + INPUT=~{interval_list} \ + OUTPUT=out + + python3 <>> + output { + Array[File] out = glob("out/*/*.interval_list") + Int interval_count = read_int(stdout()) + } + runtime { + docker: docker + memory: "2000 MiB" + } +} + +# Convert BAM file to CRAM format +# Note that reading CRAMs directly with Picard is not yet supported +task ConvertToCram { + input { + File input_bam + File ref_fasta + File ref_fasta_index + String output_basename + Int preemptible_tries = 3 + + Int disk_size = ceil((2 * size(input_bam, "GiB")) + size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB")) + 20 + } + + command <<< + set -e + set -o pipefail + + samtools view -C -T ~{ref_fasta} ~{input_bam} | \ + tee ~{output_basename}.cram | \ + md5sum | awk '{print $1}' > ~{output_basename}.cram.md5 + + # Create REF_CACHE. Used when indexing a CRAM + seq_cache_populate.pl -root ./ref/cache ~{ref_fasta} + export REF_PATH=: + export REF_CACHE=./ref/cache/%2s/%2s/%s + + samtools index ~{output_basename}.cram + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + preemptible: preemptible_tries + memory: "3 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_cram = "~{output_basename}.cram" + File output_cram_index = "~{output_basename}.cram.crai" + File output_cram_md5 = "~{output_basename}.cram.md5" + } +} + +# Convert CRAM file to BAM format +task ConvertToBam { + input { + File input_cram + File ref_fasta + File ref_fasta_index + String output_basename + } + + command <<< + set -e + set -o pipefail + + samtools view -b -o ~{output_basename}.bam -T ~{ref_fasta} ~{input_cram} + + samtools index ~{output_basename}.bam + >>> + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + preemptible: 3 + memory: "3 GiB" + cpu: "1" + disks: "local-disk 200 HDD" + } + output { + File output_bam = "~{output_basename}.bam" + File output_bam_index = "~{output_basename}.bam.bai" + } +} + +# Calculates sum of a list of floats +task SumFloats { + input { + Array[Float] sizes + Int preemptible_tries + } + + command <<< + python3 -c 'print(~{sep="+" sizes})' + >>> + output { + Float total_size = read_float(stdout()) + } + runtime { + docker: "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + preemptible: preemptible_tries + } +} + +# Print given message to stderr and return an error +task ErrorWithMessage { + input { + String message + } + command <<< + >&2 echo "Error: ~{message}" + exit 1 + >>> + + runtime { + docker: "ubuntu:20.04" + } +} + +# This task is unused for now, going to keep it in here though if we need it in the future +task GetValidationInputs { + input { + String results_path + String truth_path + Array[String]? input_files + String? 
input_file + + String docker = "us.gcr.io/broad-dsp-gcr-public/base/python:3.9-debian" + Int cpu = 1 + Int memory_mb = 2000 + Int disk_size_gb = 20 + } + + meta { + description: "Given either a file or list of files, output both the truth and results path" + } + + command <<< + set -e + + touch truth_file.txt + touch truth_files.txt + touch results_file.txt + touch results_files.txt + + python3 <>> + + runtime { + docker: docker + cpu: cpu + memory: "~{memory_mb} MiB" + disks: "local-disk ~{disk_size_gb} HDD" + } + + output { + String truth_file = read_string("truth_file.txt") + String results_file = read_string("results_file.txt") + Array[String] truth_files = read_lines("truth_files.txt") + Array[String] results_files = read_lines("results_files.txt") + } + +} \ No newline at end of file
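# The inline python of GetValidationInputs is likewise not shown in this diff. A speculative
# sketch of what its description and outputs suggest: for a single file or a list of files under
# results_path, derive the matching paths under truth_path and write them to the *_file.txt /
# *_files.txt files the task reads back. The mapping rule, helper name, and example paths below
# are assumptions, not the pipeline's actual code.
def write_validation_inputs(results_path, truth_path, input_files=None, input_file=None):
    def to_truth(path):
        # Assumption: truth files mirror the results layout under truth_path.
        return path.replace(results_path.rstrip("/"), truth_path.rstrip("/"), 1)

    if input_file:
        with open("results_file.txt", "w") as f:
            f.write(input_file)
        with open("truth_file.txt", "w") as f:
            f.write(to_truth(input_file))
    if input_files:
        with open("results_files.txt", "w") as f:
            f.write("\n".join(input_files))
        with open("truth_files.txt", "w") as f:
            f.write("\n".join(to_truth(p) for p in input_files))

write_validation_inputs(
    results_path="gs://bucket/pipeline/results/run1",
    truth_path="gs://bucket/pipeline/truth",
    input_file="gs://bucket/pipeline/results/run1/sample.unified_metrics.txt",
)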