From 95d8959815cb3e37a148358473e40d0c19a811e0 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 22 Apr 2024 17:46:52 -0400 Subject: [PATCH] Adding PacBio Pipeline --- HiFi-human-WGS-WDL/.dockstore.yml | 69 + HiFi-human-WGS-WDL/LICENSE | 34 + HiFi-human-WGS-WDL/README.md | 279 ++++ HiFi-human-WGS-WDL/backends/aws/.gitignore | 1 + HiFi-human-WGS-WDL/backends/aws/README.md | 123 ++ .../backends/aws/agc-project.template.yaml | 167 +++ .../backends/aws/inputs.aws.json | 76 ++ HiFi-human-WGS-WDL/backends/azure/README.md | 29 + .../backends/azure/inputs.azure.json | 73 ++ HiFi-human-WGS-WDL/backends/gcp/README.md | 34 + .../backends/gcp/inputs.gcp.json | 74 ++ HiFi-human-WGS-WDL/backends/hpc/README.md | 48 + .../backends/hpc/inputs.hpc.json | 73 ++ HiFi-human-WGS-WDL/backends/hpc/miniwdl.cfg | 50 + .../images/logo_wdl_workflows.svg | 83 ++ HiFi-human-WGS-WDL/images/main.graphviz.svg | 136 ++ HiFi-human-WGS-WDL/wdl-ci.config.json | 1148 +++++++++++++++++ .../cohort_analysis/cohort_analysis.wdl | 112 ++ .../workflows/cohort_analysis/inputs.json | 50 + .../workflows/humanwgs_structs.wdl | 51 + .../workflows/input_template.json | 54 + HiFi-human-WGS-WDL/workflows/main.wdl | 172 +++ .../workflows/sample_analysis/inputs.json | 41 + .../sample_analysis/sample_analysis.wdl | 737 +++++++++++ .../workflows/tertiary_analysis/inputs.json | 69 + .../tertiary_analysis/tertiary_analysis.wdl | 471 +++++++ 26 files changed, 4254 insertions(+) create mode 100644 HiFi-human-WGS-WDL/.dockstore.yml create mode 100644 HiFi-human-WGS-WDL/LICENSE create mode 100644 HiFi-human-WGS-WDL/README.md create mode 100644 HiFi-human-WGS-WDL/backends/aws/.gitignore create mode 100644 HiFi-human-WGS-WDL/backends/aws/README.md create mode 100644 HiFi-human-WGS-WDL/backends/aws/agc-project.template.yaml create mode 100644 HiFi-human-WGS-WDL/backends/aws/inputs.aws.json create mode 100644 HiFi-human-WGS-WDL/backends/azure/README.md create mode 100644 HiFi-human-WGS-WDL/backends/azure/inputs.azure.json create mode 100644 HiFi-human-WGS-WDL/backends/gcp/README.md create mode 100644 HiFi-human-WGS-WDL/backends/gcp/inputs.gcp.json create mode 100644 HiFi-human-WGS-WDL/backends/hpc/README.md create mode 100644 HiFi-human-WGS-WDL/backends/hpc/inputs.hpc.json create mode 100644 HiFi-human-WGS-WDL/backends/hpc/miniwdl.cfg create mode 100644 HiFi-human-WGS-WDL/images/logo_wdl_workflows.svg create mode 100644 HiFi-human-WGS-WDL/images/main.graphviz.svg create mode 100644 HiFi-human-WGS-WDL/wdl-ci.config.json create mode 100644 HiFi-human-WGS-WDL/workflows/cohort_analysis/cohort_analysis.wdl create mode 100644 HiFi-human-WGS-WDL/workflows/cohort_analysis/inputs.json create mode 100644 HiFi-human-WGS-WDL/workflows/humanwgs_structs.wdl create mode 100644 HiFi-human-WGS-WDL/workflows/input_template.json create mode 100644 HiFi-human-WGS-WDL/workflows/main.wdl create mode 100644 HiFi-human-WGS-WDL/workflows/sample_analysis/inputs.json create mode 100644 HiFi-human-WGS-WDL/workflows/sample_analysis/sample_analysis.wdl create mode 100644 HiFi-human-WGS-WDL/workflows/tertiary_analysis/inputs.json create mode 100644 HiFi-human-WGS-WDL/workflows/tertiary_analysis/tertiary_analysis.wdl diff --git a/HiFi-human-WGS-WDL/.dockstore.yml b/HiFi-human-WGS-WDL/.dockstore.yml new file mode 100644 index 0000000..88fe91b --- /dev/null +++ b/HiFi-human-WGS-WDL/.dockstore.yml @@ -0,0 +1,69 @@ +# The first line refers to the version 1.2 of the .dockstore.yml schema +version: 1.2 + +# An array of workflows. Each element corresponds to a workflow on Dockstore. 
+workflows: + + # The optional workflow name for a workflow, which may only consist of alphanumerics + # and internal underscores and hyphens, but no spaces or other characters. Names may not exceed 256 characters. + # If using a .dockstore.yml with multiple workflows, this field is required + # to uniquely identify workflows in the repository. + # + # It should be noted that having the name come first is an arbitrary decision. + # You could use subclass instead, for instance. Provided arrays are not broken + # up, the order of fields within a .dockstore.yml is not important. + - name: HiFi-human-WGS-WDL + + # The descriptor language used for the workflow. CWL, WDL, NFL (Nextflow), or GALAXY. + # This cannot be changed once the workflow is registered. + subclass: WDL + + # Workflow-wide setting that will affect ALL branches/tags; only set this as needed in a main branch. + # Set to true to publish an unpublished workflow, or false to unpublish a published workflow. + # Omitting the publish setting leaves the publish-state unchanged (recommended for all non-primary branches). + # publish: + + # The absolute path to the primary descriptor file in the Git repository. + # - For CWL, the primary descriptor is a .cwl file. + # - For WDL, the primary descriptor is a .wdl file. + # - For Galaxy, the primary descriptor is a .ga file. + # - Nextflow differs from these as the primary descriptor is a nextflow.config file. + primaryDescriptorPath: /workflows/main.wdl + + # An optional array of absolute paths to test parameter files in the Git repository. + # For example... + # testParameterFiles: + # - /null-model/null-model.json + # - /null-model/null-model-binary.json + # testParameterFiles: + + # An optional path to a workflow-specific readme in the Git repository. If not provided, Dockstore will show + # the readme.md present at the root of the Git repository if it is present. + # If you have multiple workflows in a single Git repository, it is recommended to give each one a readme. + readMePath: /README.md + + # An optional array of authorship information. + # Note that if orcid is present, then all other fields will be ignored, as information will be taken from orcid. + # If orcid is not present, make sure to at a minimum include the name field for each author. + authors: + - orcid: 0000-0001-5921-2022 # Juniper Lake + - orcid: 0000-0001-7628-5645 # Gregory Concepcion + - orcid: 0000-0003-1183-0432 # Aaron Wenger + - orcid: 0000-0002-7422-1194 # William Rowell + - orcid: 0000-0002-5507-0896 # Heather Ward + - orcid: 0009-0001-0205-4614 # Karen Fang + + # A boolean that will change the default version to be displayed on Dockstore. Default: False. + # A value of true will automatically display the latest tag updated as default. + # A value of false will retain the default version that has been specified via the Dockstore UI. + latestTagAsDefault: False + + # The optional filters section allows specifying sets of Git branches and tags to include for the workflow. + # If no filters are given, all branches and tags are included. + # Branches and tags are arrays of pattern-strings. + # Pattern-strings use Unix-style Glob syntax by default (Ex: `develop`, `myworkflow/**`) + # https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/file/FileSystem.html#getPathMatcher(java.lang.String) + # or RegEx when the string is surrounded by / (Ex: `/develop/`, `/myworkflow\/.*/`).
+ filters: + branches: [ /.*dockstore/ ] + tags: [ /v.*/ ] diff --git a/HiFi-human-WGS-WDL/LICENSE b/HiFi-human-WGS-WDL/LICENSE new file mode 100644 index 0000000..aaea0c1 --- /dev/null +++ b/HiFi-human-WGS-WDL/LICENSE @@ -0,0 +1,34 @@ +Copyright (c) 2023, Pacific Biosciences of California, Inc. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted (subject to the limitations in the +disclaimer below) provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Pacific Biosciences nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE +GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC +BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. diff --git a/HiFi-human-WGS-WDL/README.md b/HiFi-human-WGS-WDL/README.md new file mode 100644 index 0000000..bf55eaa --- /dev/null +++ b/HiFi-human-WGS-WDL/README.md @@ -0,0 +1,279 @@ +

+# PacBio WGS Variant Pipeline
+ +Workflow for analyzing human PacBio whole genome sequencing (WGS) data using [Workflow Description Language (WDL)](https://openwdl.org/). + +- Docker images used by this workflow are defined in [the wdl-dockerfiles repo](https://github.com/PacificBiosciences/wdl-dockerfiles). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). +- Common tasks that may be reused within or between workflows are defined in [the wdl-common repo](https://github.com/PacificBiosciences/wdl-common). + +# Workflow + +**Workflow entrypoint**: [workflows/main.wdl](workflows/main.wdl) + +PacBio WGS Variant Pipeline performs read alignment, variant calling, and phasing. Joint-calling of small variants and structural variants for cohorts, as well as optional variant filtering and annotation, is also available for HiFi human WGS. The workflow can run using Azure, AWS, GCP, and HPC backends. + +![PacBio WGS Variant Pipeline diagram](https://github.com/PacificBiosciences/HiFi-human-WGS-WDL/raw/main/images/main.graphviz.svg "PacBio WGS Variant Pipeline diagram") + +## Setup + +We recommend cloning the repo rather than downloading the release package. Some tasks and workflows are pulled in from other repositories. After cloning, ensure the submodules have been initialized by running `git submodule update --init --recursive`. + +## Resource requirements + +The workflow requires at minimum 64 cores and 256 GB of RAM. Ensure that the backend environment you're using has enough quota to run the workflow. + +## Reference datasets and associated workflow files + +Reference datasets are hosted publicly for use in the pipeline. For data locations, see the [backend-specific documentation](backends/) and the template input files for each backend, which have the paths to the publicly hosted reference files filled out. + +# Running the workflow + +1. [Select a backend environment](#selecting-a-backend) +2. [Configure a workflow execution engine in the chosen environment](#configuring-a-workflow-engine-and-container-runtime) +3. [Fill out the inputs JSON file for your cohort](#filling-out-the-inputs-json) +4. [Run the workflow](#running-the-workflow-1) + +## Selecting a backend + +The workflow can be run on Azure, AWS, GCP, or HPC. Your choice of backend will largely be determined by the location of your data. + +For backend-specific configuration, see the relevant documentation: + +- [Azure](backends/azure) +- [AWS](backends/aws) +- [GCP](backends/gcp) +- [HPC](backends/hpc) + +## Configuring a workflow engine and container runtime + +An execution engine is required to run workflows. Two popular engines for running WDL-based workflows are [`miniwdl`](https://miniwdl.readthedocs.io/en/latest/getting_started.html) and [`Cromwell`](https://cromwell.readthedocs.io/en/stable/tutorials/FiveMinuteIntro/). + +Because workflow dependencies are containerized, a container runtime is required. This workflow has been tested with [Docker](https://docs.docker.com/get-docker/) and [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/) container runtimes. + +See the [backend-specific documentation](backends) for details on setting up an engine.
+ +| Engine | Azure | AWS | GCP | HPC | +| :- | :- | :- | :- | :- | +| [**miniwdl**](https://github.com/chanzuckerberg/miniwdl#scaling-up) | _Unsupported_ | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | _Unsupported_ | (SLURM only) Supported via the [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) plugin | +| [**Cromwell**](https://cromwell.readthedocs.io/en/stable/backends/Backends/) | Supported via [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | Supported via Google's [Pipelines API](https://cromwell.readthedocs.io/en/stable/backends/Google/) | Supported - [Configuration varies depending on HPC infrastructure](https://cromwell.readthedocs.io/en/stable/tutorials/HPCIntro/) | + +## Filling out the inputs JSON + +The input to a workflow run is defined in JSON format. Template input files with reference dataset information filled out are available for each backend: + +- [Azure](backends/azure/inputs.azure.json) +- [AWS](backends/aws/inputs.aws.json) +- [GCP](backends/gcp/inputs.gcp.json) +- [HPC](backends/hpc/inputs.hpc.json) + +Using the appropriate inputs template file, fill in the cohort and sample information (see [Workflow Inputs](#workflow-inputs) for more information on the input structure). + +If using an HPC backend, you will need to download the reference bundle and replace the `<local_path_prefix>` in the input template file with the local path to the reference datasets on your HPC. + +## Running the workflow + +Run the workflow using the engine and backend that you have configured ([miniwdl](#run-directly-using-miniwdl), [Cromwell](#run-directly-using-cromwell)). + +Note that the calls to `miniwdl` and `Cromwell` assume you are accessing the engine directly on the machine on which it has been deployed. Depending on the backend you have configured, you may be able to submit workflows using different methods (e.g. using trigger files in Azure, or using the Amazon Genomics CLI in AWS). + +### Run directly using miniwdl + +`miniwdl run workflows/main.wdl -i <input_file_path.json>` + +### Run directly using Cromwell + +`java -jar <cromwell_jar_path> run workflows/main.wdl -i <input_file_path.json>` + +If Cromwell is running in server mode, the workflow can be submitted using cURL. Fill in the values of CROMWELL_URL and INPUTS_JSON below, then from the root of the repository, run: + +```bash +# The base URL (and port, if applicable) of your Cromwell server +CROMWELL_URL= +# The path to your inputs JSON file +INPUTS_JSON= + +(cd workflows && zip -r dependencies.zip humanwgs_structs.wdl cohort_analysis/ sample_analysis/ tertiary_analysis/ wdl-common/) +curl -X "POST" \ + "${CROMWELL_URL}/api/workflows/v1" \ + -H "accept: application/json" \ + -H "Content-Type: multipart/form-data" \ + -F "workflowSource=@workflows/main.wdl" \ + -F "workflowInputs=@${INPUTS_JSON};type=application/json" \ + -F "workflowDependencies=@workflows/dependencies.zip;type=application/zip" +``` + +To specify [workflow options](https://cromwell.readthedocs.io/en/latest/wf_options/Overview/), add the following to the request (assuming your options file is a file called `options.json` located in the `pwd`): `-F "workflowOptions=@options.json;type=application/json"`. + +# Workflow inputs + +This section describes the inputs required for a run of the workflow. Typically, only the `humanwgs.cohort` and potentially [run/backend-specific sections](#other-inputs) will be filled out by the user for each run of the workflow.
Input templates with reference file locations filled out are provided [for each backend](backends). + +## [Cohort](workflows/humanwgs_structs.wdl) + +A cohort can include one or more samples. Samples need not be related, but if you plan to run tertiary analysis, it is best to think of a cohort as a family of related samples. We have tested cohorts of up to 5 samples with 30x coverage. Larger cohorts may encounter memory issues during joint calling. + +| Type | Name | Description | Notes | +| :- | :- | :- | :- | +| String | cohort_id | A unique name for the cohort; used to name outputs | | +| Array[[Sample](#sample)] | samples | The set of samples for the cohort. At least one sample must be defined. | | +| Array[String] | phenotypes | [Human Phenotype Ontology (HPO) phenotypes](https://hpo.jax.org/app/) associated with the cohort. If no particular phenotypes are desired, the root HPO term, `"HP:0000001"`, can be used. | | + +### [Sample](workflows/humanwgs_structs.wdl) + +Sample information for each sample in the workflow run. + +| Type | Name | Description | Notes | +| :- | :- | :- | :- | +| String | sample_id | A unique name for the sample; used to name outputs | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | movie_bams | The set of unaligned movie BAMs associated with this sample | | +| String? | sex | Sample sex | ["MALE", "FEMALE", `null`]. If the sex field is missing or `null`, sex will be set to unknown. Used to set the expected sex chromosome karyotype for TRGT and HiFiCNV. | +| Boolean | affected | Is this sample affected by the phenotype? | \[`true`, `false`\] | +| String? | father_id | Paternal `sample_id` | | +| String? | mother_id | Maternal `sample_id` | | + +## [ReferenceData](workflows/humanwgs_structs.wdl) + +Files associated with the reference genome. + +These files are hosted publicly in each of the cloud backends; see `backends/${backend}/inputs.${backend}.json`. + +| Type | Name | Description | Notes | +| :- | :- | :- | :- | +| String | name | Reference name; used to name outputs (e.g., "GRCh38") | Note: The workflow currently only supports GRCh38 and provides GCA_000001405.15_GRCh38_no_alt_analysis_set. | +| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | fasta | Reference genome and index | | +| File | tandem_repeat_bed | Tandem repeat locations used by [pbsv](https://github.com/PacificBiosciences/pbsv) to normalize SV representation | | +| File | trgt_tandem_repeat_bed | Tandem repeat sites to be genotyped by [TRGT](https://github.com/PacificBiosciences/trgt) | | +| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | hificnv_exclude_bed | Compressed BED and index of regions to exclude from calling by [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV). We recommend [cnv.excluded_regions.common_50.hg38.bed.gz](https://github.com/PacificBiosciences/HiFiCNV/blob/main/docs/aux_data.md). | | +| File | hificnv_expected_bed_male | BED of expected copy number for male karyotype for HiFiCNV | | +| File | hificnv_expected_bed_female | BED of expected copy number for female karyotype for HiFiCNV | | +| File? | gnomad_af | [gnomAD](https://gnomad.broadinstitute.org/) v3.1 allele frequencies in [`slivar gnotate`](https://github.com/brentp/slivar/wiki/gnotate) format | required if `run_tertiary_analysis` is set to `true` | +| File?
| hprc_af | Allele frequencies in ~100 [Human Pangenome Reference Consortium (HPRC)](https://humanpangenome.org/) samples in `slivar gnotate` format | required if `run_tertiary_analysis` is set to `true` | +| File? | gff | [Ensembl](https://useast.ensembl.org/index.html) GFF3 reference annotation | required if `run_tertiary_analysis` is set to `true` | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?] | population_vcfs | An array of structural variant population VCFs | required if `run_tertiary_analysis` is set to `true` | + +## [SlivarData](workflows/humanwgs_structs.wdl) + +Files associated with `slivar` annotation. These are required if `run_tertiary_analysis` is set to `true`. + +These files are hosted publicly in each of the cloud backends; see `backends/${backend}/inputs.${backend}.json`. + +| Type | Name | Description | Notes | +| :- | :- | :- | :- | +| File | slivar_js | Additional JavaScript functions for slivar | | +| File | hpo_terms | [HPO](https://hpo.jax.org/app/) annotation lookups | | +| File | hpo_dag | HPO annotation lookups | | +| File | hpo_annotations | HPO annotation lookups | | +| File | ensembl_to_hgnc | Ensembl to HGNC gene mapping | | +| File | lof_lookup | Loss-of-function scores per gene | | +| File | clinvar_lookup | ClinVar annotations per gene | | + +## Other inputs + +| Type | Name | Description | Notes | +| :- | :- | :- | :- | +| String? | deepvariant_version | Version of DeepVariant to use \["1.5.0"\] | | +| [DeepVariantModel](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | deepvariant_model | Optional alternate DeepVariant model file to use | | +| Int? | pbsv_call_mem_gb | Optionally set RAM (GB) for pbsv_call during cohort analysis | | +| Int? | glnexus_mem_gb | Optionally set RAM (GB) for GLnexus during cohort analysis | | +| Boolean? | run_tertiary_analysis | Run the optional tertiary analysis steps \[`false`\] | \[`true`, `false`\] | +| String | backend | Backend where the workflow will be executed | \["Azure", "AWS", "GCP", "HPC"\] | +| String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. |
  • [Determining available zones in AWS](backends/aws/README.md#determining-available-zones)
  • [Determining available zones in GCP](backends/gcp/README.md#determining-available-zones)
| +| String? | aws_spot_queue_arn | Queue ARN for the spot batch queue; required if backend is set to 'AWS' and `preemptible` is set to `true` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | +| String? | aws_on_demand_queue_arn | Queue ARN for the on-demand batch queue; required if backend is set to 'AWS' and `preemptible` is set to `false` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | +| String? | container_registry | Container registry where workflow images are hosted. If left blank, [PacBio's public Quay.io registry](https://quay.io/organization/pacbio) will be used. | | +| Boolean | preemptible | If set to `true`, run tasks preemptibly where possible. On-demand VMs will be used only for tasks that run for >24 hours if the backend is set to GCP. If set to `false`, on-demand VMs will be used for every task. Ignored if backend is set to HPC. | \[`true`, `false`\] | + +# Workflow outputs + +## Sample analysis + +These files will be output for each sample defined in the cohort. + +| Type | Name | Description | Notes | +| :- | :- | :- | :- | +| Array[Array[File]] | bam_stats | TSV of length and quality for each read (per input BAM) | | +| Array[Array[File]] | read_length_summary | Read length distribution (per input BAM) | | +| Array[Array[File]] | read_quality_summary | Read quality distribution (per input BAM) | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | small_variant_gvcfs | Small variant (SNPs and INDELs < 50bp) gVCFs called by [DeepVariant](https://github.com/google/deepvariant) (with index) | | +| Array[File] | small_variant_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for small variants | | +| Array[File] | small_variant_roh_out | Output of [`bcftools roh`](https://samtools.github.io/bcftools/howtos/roh-calling.html) using `--AF-dflt 0.4` | | +| Array[File] | small_variant_roh_bed | Regions of homozygosity determined by [`bcftools roh`](https://samtools.github.io/bcftools/howtos/roh-calling.html) using `--AF-dflt 0.4` | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | sample_phased_small_variant_vcfs | Small variants called by DeepVariant and phased by [HiPhase](https://github.com/PacificBiosciences/HiPhase) (with index) | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | sample_phased_sv_vcfs | Structural variants called by [pbsv](https://github.com/PacificBiosciences/pbsv) and phased by HiPhase (with index) | | +| Array[File] | sample_hiphase_stats | Phase block summary statistics written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#chromosome-summary-file---summary-file) | | +| Array[File] | sample_hiphase_blocks | Phase block list written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#phase-block-file---blocks-file) | | +| Array[File] | sample_hiphase_haplotags | Per-read haplotag information, written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#haplotag-file---haplotag-file) | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | merged_haplotagged_bam | Aligned (by [pbmm2](https://github.com/PacificBiosciences/pbmm2)), haplotagged (by
[HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#haplotagged-bam-files)) reads (with index) | | +| Array[File] | haplotagged_bam_mosdepth_summary | [mosdepth](https://github.com/brentp/mosdepth) summary of median depths per chromosome | | +| Array[File] | haplotagged_bam_mosdepth_region_bed | [mosdepth](https://github.com/brentp/mosdepth) BED of median coverage depth per 500 bp window | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | trgt_repeat_vcf | Tandem repeat genotypes from [TRGT](https://github.com/PacificBiosciences/trgt/blob/main/docs/vcf_files.md) (with index) | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | trgt_spanning_reads | Fragments of HiFi reads spanning loci genotyped by TRGT (with index) | | +| Array[File] | trgt_dropouts | Regions with insufficient coverage to genotype by TRGT | | +| Array[Array[File]] | cpg_pileup_beds | 5mCpG site methylation probability pileups generated by [pb-CpG-tools](https://github.com/PacificBiosciences/pb-CpG-tools#output-files) | | +| Array[Array[File]] | cpg_pileup_bigwigs | 5mCpG site methylation probability pileups generated by pb-CpG-tools | | +| Array[File] | paraphase_output | Output generated by [Paraphase](https://github.com/PacificBiosciences/paraphase) | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | paraphase_realigned_bam | Realigned BAM for selected medically relevant genes in segmental duplications (with index), generated by Paraphase | | +| Array[Array[File]] | paraphase_vcfs | Phased variant calls for selected medically relevant genes in segmental duplications, generated by Paraphase | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | hificnv_vcfs | VCF output containing copy number variant calls for the sample from [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) | | +| Array[File] | hificnv_copynum_bedgraphs | Copy number values calculated for each region | | +| Array[File] | hificnv_depth_bws | Bigwig file containing the depth measurements from HiFiCNV | | +| Array[File] | hificnv_maf_bws | Bigwig file containing the minor allele frequency measurements from DeepVariant, generated by HiFiCNV | | + +## Cohort analysis + +These files will be output if the cohort includes more than one sample. + +| Type | Name | Description | Notes | +| :- | :- | :- | :- | +| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | cohort_small_variant_vcf | Small variants called by [DeepVariant](https://github.com/google/deepvariant), joint-called by [GLnexus](https://github.com/dnanexus-rnd/GLnexus), and phased by [HiPhase](https://github.com/PacificBiosciences/HiPhase) (with index) | | +| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | cohort_sv_vcf | Structural variants joint-called by [pbsv](https://github.com/PacificBiosciences/pbsv) and phased by HiPhase (with index) | | +| File? | cohort_hiphase_stats | Phase block summary statistics written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#chromosome-summary-file---summary-file) | | +| File?
| cohort_hiphase_blocks | Phase block list written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#phase-block-file---blocks-file) | | + +## Tertiary analysis + +These files will be output for each run of the workflow if `run_tertiary_analysis` is set to `true`. The files that are annotated depend on whether the cohort contains one sample or more than one sample: +- If the cohort contains one sample, the files being annotated in this step are the sample small variant VCF and SV VCF. +- If the cohort contains more than one sample, the files being annotated in this step are the phased, joint-called small variant VCF and the cohort SV VCF. + +| Type | Name | Description | Notes | +| :- | :- | :- | :- | +| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | filtered_small_variant_vcf | Small variant calls that are filtered based on population frequency and annotated with cohort information, population frequency, gene, functional impact, etc., using [slivar](https://github.com/brentp/slivar) and [`bcftools csq`](https://samtools.github.io/bcftools/howtos/csq-calling.html) | | +| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | compound_het_small_variant_vcf | Compound heterozygotes annotated with cohort information, population frequency, gene, functional impact, etc., using slivar and `bcftools csq` | | +| File? | filtered_small_variant_tsv | Filtered VCFs are reformatted as a human-readable TSV by [`slivar tsv`](https://github.com/brentp/slivar/wiki/tsv:-creating-a-spreadsheet-from-a-filtered-VCF) | | +| File? | compound_het_small_variant_tsv | Filtered VCFs are reformatted as a human-readable TSV by `slivar tsv` | | +| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | filtered_svpack_vcf | Structural variant calls that are filtered based on population frequency and annotated with cohort information, population frequency, gene, functional impact, etc., using [svpack](https://github.com/PacificBiosciences/svpack) | | +| File? | filtered_svpack_tsv | Filtered VCFs are reformatted as a human-readable TSV by `slivar tsv` | | + +# Tool versions and Docker images + +Docker image definitions used by this workflow can be found in [the wdl-dockerfiles repository](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). Docker images used in the workflow are pegged to specific versions by referring to their digests rather than tags. + +The Docker image used by a particular step of the workflow can be identified by looking at the `docker` key in the `runtime` block for the given task. Images can be referenced in the following table by looking for the name after the final `/` character and before the `@sha256:...`. For example, the image referred to here is "align_hifiasm": +> ~{runtime_attributes.container_registry}/**align_hifiasm**@sha256:3968cb<...>b01f80fe + +| Image | Major tool versions | Links | +| :- | :- | :- | +| bcftools |
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/bcftools) | +| deepvariant | User-defined; default is version [1.5.0](https://github.com/google/deepvariant/releases/tag/v1.5.0) | [DeepVariant GitHub](https://github.com/google/deepvariant) | +| glnexus |
  • [glnexus v1.4.3](https://github.com/dnanexus-rnd/GLnexus/releases/tag/v1.4.3)
| [GLnexus GitHub](https://github.com/dnanexus-rnd/GLnexus) | +| hificnv |
  • [HiFiCNV v0.1.7](https://github.com/PacificBiosciences/HiFiCNV/releases/tag/v0.1.7)
  • [bcftools 1.16](https://github.com/samtools/bcftools/releases/tag/1.16)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/0b0fbe939648087e9fdea4497ae08dc76538ebf0/docker/hificnv) | +| hiphase |
  • [HiPhase 1.0.0](https://github.com/PacificBiosciences/HiPhase/releases/tag/v1.0.0)
  • [samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)
  • [bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/d26db6204409dfeff56e169cdba0cc14bc272f15/docker/hiphase) | +| htslib |
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/htslib) | +| mosdepth |
  • [mosdepth 0.2.9](https://github.com/brentp/mosdepth/releases/tag/v0.2.9)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/mosdepth) | +| paraphase |
  • [minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
  • [paraphase 2.2.3](https://github.com/PacificBiosciences/paraphase/releases/tag/v2.2.3)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/paraphase) | +| pb-cpg-tools |
  • [pb-CpG-tools v2.3.2](https://github.com/PacificBiosciences/pb-CpG-tools/releases/tag/v2.3.2)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/7481837d3b0f539adf4f64209a65cf28eebf3dba/docker/pb-cpg-tools) | +| pbmm2 |
  • [pbmm2 1.10.0](https://github.com/PacificBiosciences/pbmm2/releases/tag/v1.10.0)
  • [datamash 1.1.0](https://ftp.gnu.org/gnu/datamash/)
  • [pysam 0.16.0.1](https://github.com/pysam-developers/pysam/releases/tag/v0.16.0.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/pbmm2) | +| pbsv |
  • [pbsv 2.9.0](https://github.com/PacificBiosciences/pbsv/releases/tag/v2.9.0)
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/f9e33a757e6d8cb15696ac930a2efd0fd7a885d8/docker/pbsv) | +| pyyaml |
  • [pyyaml 5.3.1](https://github.com/yaml/pyyaml/releases/tag/5.3.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/f72e862bca2f209b9909e6043ef0197975762f27/docker/pyyaml) | +| samtools |
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/samtools) | +| slivar |
  • [slivar 0.2.2](https://github.com/brentp/slivar/releases/tag/v0.2.2)
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
  • [vcfpy 0.13.3](https://github.com/bihealth/vcfpy/releases/tag/v0.13.3)
  • [pysam 0.19.1](https://github.com/pysam-developers/pysam/releases/tag/v0.19.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/slivar) | +| svpack |
  • [svpack 36180ae6](https://github.com/PacificBiosciences/svpack/tree/a82598ebc4013bf32e70295b83b380ada6302c4a)
  • [htslib 1.18](https://github.com/samtools/htslib/releases/tag/1.18)
  • [pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/8edbc516abc0ff43ac279b48018003923721b054/docker/svpack) |
| trgt |
  • [trgt 0.5.0](https://github.com/PacificBiosciences/trgt/releases/tag/v0.5.0)
  • [samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)
  • [bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)
  • [pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/d2a45e0213ac3fa631a51a48757c442d3ed550b6/docker/trgt) | + +--- + +## DISCLAIMER + +TO THE GREATEST EXTENT PERMITTED BY APPLICABLE LAW, THIS WEBSITE AND ITS CONTENT, INCLUDING ALL SOFTWARE, SOFTWARE CODE, SITE-RELATED SERVICES, AND DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. ALL WARRANTIES ARE REJECTED AND DISCLAIMED. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THE FOREGOING. PACBIO IS NOT OBLIGATED TO PROVIDE ANY SUPPORT FOR ANY OF THE FOREGOING, AND ANY SUPPORT PACBIO DOES PROVIDE IS SIMILARLY PROVIDED WITHOUT REPRESENTATION OR WARRANTY OF ANY KIND. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A REPRESENTATION OR WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACBIO. diff --git a/HiFi-human-WGS-WDL/backends/aws/.gitignore b/HiFi-human-WGS-WDL/backends/aws/.gitignore new file mode 100644 index 0000000..10ef663 --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/aws/.gitignore @@ -0,0 +1 @@ +agc-project.yaml diff --git a/HiFi-human-WGS-WDL/backends/aws/README.md b/HiFi-human-WGS-WDL/backends/aws/README.md new file mode 100644 index 0000000..9dc1110 --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/aws/README.md @@ -0,0 +1,123 @@ +# Configuring the Amazon Genomics CLI + +The Amazon Genomics CLI (`agc`) allows users to orchestrate workflow execution using AWS Batch. See the [Workbench documentation](https://docs.dnastack.com/docs/cromwell-on-aws-amazon-genomics-cli) for information on installing and using the `agc` to configure and run workflows. The following section provides additional information on deploying a project using the `agc`. + +## Deploying a context with `agc` + +Once you have installed and authenticated with the `agc`, you can deploy a context using an agc project YAML file. This file must be named `agc-project.yaml`. + +An [example agc-project.yaml file](agc-project.template.yaml) that has the workflow, reference data source, and both on-demand and spot contexts configured using Cromwell as the engine is provided here. This will create an agc project named `humanwgsAgc`, with either (or both) a `spotContext` or an `onDemandContext`. The `spotContext` will allow you to run workflows using [AWS spot instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-instances.html), which can result in substantial cost savings relative to using [on-demand instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-on-demand-instances.html). + +Note that deploying a context **will incur costs** even if you are not actively running workflows; ensure that [contexts that are not in use are destroyed](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_context_destroy/) to avoid incurring ongoing costs. + +To deploy the agc project using the template file, first copy the template file to a file named `agc-project.yaml` (`cp agc-project.template.yaml agc-project.yaml`). + +In the `data` section of the `agc-project.yaml` file, add any additional s3 buckets that the workflow will require access to, for example the bucket containing sample input data.
Make sure that you do not remove the section granting access to the s3://dnastack-resources bucket; this is where [reference datasets are hosted](#reference-data-hosted-in-aws). + +``` +data: + - location: s3://dnastack-resources + readOnly: true + - location: s3://<your_bucket_name> + readOnly: true +``` + +Then from the directory containing the `agc-project.yaml` file, run: + +```bash +agc context deploy --context ${context} +``` + +Where `${context}` is either `spotContext` or `onDemandContext`. + +If you want both spot and on-demand contexts, all contexts can be deployed at once by running: + +``` +agc context deploy --all +``` + +Note that the `miniwdl` engine run via AWS is currently not supported for this workflow. + +# Checking and requesting quota in AWS + +See [resource requirements](../../README.md#resource-requirements) for information on the minimum requirements for running the workflow. Typically in a new AWS environment, additional vCPU quota will be required. + +## Checking current quota + +1. Navigate to [the AWS console](https://console.aws.amazon.com/). +2. In the top right corner, select the region where your `agc` deployment is located. +3. Navigate to EC2. +4. In the menu on the left, select 'Limits'. +5. Filter the limits by searching for "Standard". The current limit field indicates the number of vCPUs that you currently have access to. +- Spot instance limit: `All Standard (A, C, D, H, I, M, R, T, Z) Spot Instance Requests` +- On-demand instance limit: `Running On-Demand All Standard (A, C, D, H, I, M, R, T, Z) instances` + +If the number of vCPUs in the context you plan to run the workflow in is less than the limits specified in [the resource requirements](../../README.md#resource-requirements) section, you will need to request additional quota before you can run the workflow. + +## Requesting additional quota + +6. Continuing from the steps outlined in [checking the current quota](#checking-current-quota), select the service you want to request an increase for. +7. In the top right corner, select 'Request limit increase'. +8. Fill out the appropriate fields in the request form, ensuring that the region you select is the region where you have deployed your `agc` and where your data is located. 256 vCPUs are recommended for running trio data. + +Requests for small quota increases are typically fulfilled within 1-2 hours. + +# Configuring and running the workflow + +## Filling out workflow inputs + +Fill out any information missing in [the inputs file](inputs.aws.json). Ensure that all data files used by the workflow are at locations that have been configured in the agc-project.yaml file; see the [granting access to other data files](#granting-access-to-other-data-files) section for more information. + +See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. + +Note that you only need to fill out the queueArn corresponding to the context you are submitting the workflow to (spot or on-demand). + +### Determining available zones + +To determine available zones in AWS, look for the `ZoneName` attribute output by the following command: + +```bash +aws ec2 describe-availability-zones --region <region> +``` + +For example, the zones in region us-east-2 are `"us-east-2a us-east-2b us-east-2c"`. + +### Determining the AWS batch queue ARN + +**Note that if you are using a `miniwdl` engine, you can skip these steps; workflows run via miniwdl will run exclusively in the job queue to which they are submitted.** + +1.
Visit [the AWS console](https://console.aws.amazon.com/). +2. Navigate to the Batch service. +3. In the left-hand sidebar, select "Compute environments". Note the name of the compute environment with the provisioning model SPOT (if you have deployed a context using spot instances) and the name of the compute environment with provisioning model "EC2" (if you have deployed a context that does not use spot instances). +4. In the left-hand sidebar, select "Job queues". +5. Clicking into an individual queue will show information about the compute environment ("Compute environment order"). Identify the job queue with the Compute environment name that matches the name you identified for the SPOT compute environment; copy the Amazon Resource Name (ARN) for this job queue. This is the value that should be used for the `aws_spot_queue_arn`. Repeat this process to find the ARN for the `aws_on_demand_queue_arn`. + +- If `preemptible = true`, only the `aws_spot_queue_arn` is required. +- If `preemptible = false`, only the `aws_on_demand_queue_arn` is required. + +## Running the workflow + +### Running via `agc` + +From the directory where your `agc-project.yaml` is located, run: + +`agc workflow run humanwgs --context <context> --inputsFile <input_file_path.json>` + +The running workflow can be monitored via [`agc workflow` commands](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_workflow/), or via the AWS console. + +# Reference data hosted in AWS + +AWS reference data is hosted in the `us-west-2` region in the bucket `s3://dnastack-resources`. + +To use AWS reference data, add the following lines to the data section of your [`agc-project.yaml`](https://aws.github.io/amazon-genomics-cli/docs/concepts/projects/): + +```yaml +data: + - location: s3://dnastack-resources + readOnly: true +``` +The [AWS input file template](inputs.aws.json) has paths to the reference files in s3 prefilled. The template [agc-project.template.yaml file](agc-project.template.yaml) has this section filled out already. + +### Granting access to other data files + +S3 buckets outside of the reference files can be accessed by adding additional data blocks to the agc-project.yaml file. See the [agc documentation](https://aws.github.io/amazon-genomics-cli/docs/concepts/data/) for more details on adding additional data sources. All inputs referenced in the inputs.json file will need to be at locations that have been configured in the agc-project.yaml.
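+For example, a `data` section granting read access to both the reference bucket and an additional bucket holding your sample data might look like the following sketch; the second bucket name is a hypothetical placeholder:
+
+```yaml
+data:
+  # Reference datasets hosted by DNAstack; do not remove
+  - location: s3://dnastack-resources
+    readOnly: true
+  # Hypothetical bucket containing your sample movie BAMs
+  - location: s3://my-sample-data-bucket
+    readOnly: true
+```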
diff --git a/HiFi-human-WGS-WDL/backends/aws/agc-project.template.yaml b/HiFi-human-WGS-WDL/backends/aws/agc-project.template.yaml new file mode 100644 index 0000000..46ebf5f --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/aws/agc-project.template.yaml @@ -0,0 +1,167 @@ +name: humanwgsAgc +schemaVersion: 1 +data: + - location: s3://dnastack-resources + readOnly: true +workflows: + humanwgs: + type: + language: wdl + version: 1.0 + sourceURL: ../../workflows +contexts: + onDemandContext: + instanceTypes: [ + "c5.large", + "c5.xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.metal", + "c5a.large", + "c5a.xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5n.large", + "c5n.xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.18xlarge", + "m5.large", + "m5.xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5a.large", + "m5a.xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5n.large", + "m5n.xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "r5.large", + "r5.xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5a.large", + "r5a.xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5n.large", + "r5n.xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + ] + engines: + - type: wdl + engine: cromwell + spotContext: + requestSpotInstances: true + instanceTypes: [ + "c5.large", + "c5.xlarge", + "c5.2xlarge", + "c5.4xlarge", + "c5.9xlarge", + "c5.12xlarge", + "c5.18xlarge", + "c5.24xlarge", + "c5.metal", + "c5a.large", + "c5a.xlarge", + "c5a.2xlarge", + "c5a.4xlarge", + "c5a.8xlarge", + "c5a.12xlarge", + "c5a.16xlarge", + "c5a.24xlarge", + "c5n.large", + "c5n.xlarge", + "c5n.2xlarge", + "c5n.4xlarge", + "c5n.9xlarge", + "c5n.18xlarge", + "m5.large", + "m5.xlarge", + "m5.2xlarge", + "m5.4xlarge", + "m5.8xlarge", + "m5.12xlarge", + "m5.16xlarge", + "m5.24xlarge", + "m5a.large", + "m5a.xlarge", + "m5a.2xlarge", + "m5a.4xlarge", + "m5a.8xlarge", + "m5a.12xlarge", + "m5a.16xlarge", + "m5a.24xlarge", + "m5n.large", + "m5n.xlarge", + "m5n.2xlarge", + "m5n.4xlarge", + "m5n.8xlarge", + "m5n.12xlarge", + "m5n.16xlarge", + "m5n.24xlarge", + "r5.large", + "r5.xlarge", + "r5.2xlarge", + "r5.4xlarge", + "r5.8xlarge", + "r5.12xlarge", + "r5.16xlarge", + "r5.24xlarge", + "r5a.large", + "r5a.xlarge", + "r5a.2xlarge", + "r5a.4xlarge", + "r5a.8xlarge", + "r5a.12xlarge", + "r5a.16xlarge", + "r5a.24xlarge", + "r5n.large", + "r5n.xlarge", + "r5n.2xlarge", + "r5n.4xlarge", + "r5n.8xlarge", + "r5n.12xlarge", + "r5n.16xlarge", + "r5n.24xlarge", + ] + engines: + - type: wdl + engine: cromwell diff --git a/HiFi-human-WGS-WDL/backends/aws/inputs.aws.json b/HiFi-human-WGS-WDL/backends/aws/inputs.aws.json new file mode 100644 index 0000000..87b4f16 --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/aws/inputs.aws.json @@ -0,0 +1,76 @@ +{ + "humanwgs.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": [ + "File" + ], + "sex": "String?", + "affected": "Boolean", + "father_id": "String?", + "mother_id": "String?" 
+ } + ], + "phenotypes": [ + "String" + ] + }, + "humanwgs.reference": { + "name": "GRCh38", + "fasta": { + "data": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "data_index": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + }, + "pbsv_splits": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json", + "tandem_repeat_bed": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed", + "trgt_tandem_repeat_bed": "s3://dnastack-resources/dataset/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", + "hificnv_exclude_bed": { + "data": "s3://dnastack-resources/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz", + "data_index": "s3://dnastack-resources/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi" + }, + "hificnv_expected_bed_male": "s3://dnastack-resources/dataset/GRCh38/hificnv/expected_cn.hg38.XY.bed", + "hificnv_expected_bed_female": "s3://dnastack-resources/dataset/GRCh38/hificnv/expected_cn.hg38.XX.bed", + "gnomad_af": "s3://dnastack-resources/dataset/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip", + "hprc_af": "s3://dnastack-resources/dataset/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip", + "gff": "s3://dnastack-resources/dataset/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", + "population_vcfs": [ + { + "data": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz", + "data_index": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi" + }, + { + "data": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz", + "data_index": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi" + }, + { + "data": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz", + "data_index": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi" + }, + { + "data": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz", + "data_index": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi" + } + ] + }, + "humanwgs.slivar_data": { + "slivar_js": "s3://dnastack-resources/dataset/slivar/slivar-functions.v0.2.8.js", + "hpo_terms": "s3://dnastack-resources/dataset/hpo/hpoTerms.txt", + "hpo_dag": "s3://dnastack-resources/dataset/hpo/hpoDag.txt", + "hpo_annotations": "s3://dnastack-resources/dataset/hpo/ensembl.hpoPhenotype.tsv", + "ensembl_to_hgnc": "s3://dnastack-resources/dataset/genes/ensembl.hgncSymbol.tsv", + "lof_lookup": "s3://dnastack-resources/dataset/slivar/lof_lookup.v2.1.1.txt", + "clinvar_lookup": "s3://dnastack-resources/dataset/slivar/clinvar_gene_desc.20221214T183140.txt" + }, + "humanwgs.deepvariant_version": "String (optional)", + "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? 
(optional)", + "humanwgs.pbsv_call_mem_gb": "Int (optional, default = if N<=3: 64 else 96)", + "humanwgs.glnexus_mem_gb": "Int (optional, default = 30)", + "humanwgs.run_tertiary_analysis": "Boolean (optional, default = false)", + "humanwgs.backend": "AWS", + "humanwgs.zones": "us-east-2a us-east-2b us-east-2c", + "humanwgs.aws_spot_queue_arn": "", + "humanwgs.aws_on_demand_queue_arn": "", + "humanwgs.preemptible": "Boolean" +} diff --git a/HiFi-human-WGS-WDL/backends/azure/README.md b/HiFi-human-WGS-WDL/backends/azure/README.md new file mode 100644 index 0000000..357a43f --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/azure/README.md @@ -0,0 +1,29 @@ +# Configuring Cromwell on Azure + +Workflows can be run in Azure by setting up [Cromwell on Azure (CoA)](https://github.com/microsoft/CromwellOnAzure). Documentation on deploying and configuring an instance of CoA can be found [here](https://github.com/microsoft/CromwellOnAzure/wiki/Deploy-your-instance-of-Cromwell-on-Azure). + +## Requirements + +- [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) version 3.2+; version 4.0+ is recommended + +# Configuring and running the workflow + +## Filling out workflow inputs + +Fill out any information missing in [the inputs file](inputs.azure.json). + +See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. + +## Running the workflow + +### Running via Cromwell on Azure + +Instructions for running a workflow from Cromwell on Azure are described in [the Cromwell on Azure documentation](https://github.com/microsoft/CromwellOnAzure/wiki/Running-Workflows). + +# Reference data hosted in Azure + +To use Azure reference data, add the following line to your `containers-to-mount` file in your Cromwell on Azure installation ([more info here](https://github.com/microsoft/CromwellOnAzure/blob/develop/docs/troubleshooting-guide.md#use-input-data-files-from-an-existing-azure-storage-account-that-my-lab-or-team-is-currently-using)): + +`https://datasetpbrarediseases.blob.core.windows.net/dataset?si=public&spr=https&sv=2021-06-08&sr=c&sig=o6OkcqWWlGcGOOr8I8gCA%2BJwlpA%2FYsRz0DMB8CCtCJk%3D` + +The [Azure input file template](inputs.azure.json) has paths to the reference files in this blob storage prefilled. diff --git a/HiFi-human-WGS-WDL/backends/azure/inputs.azure.json b/HiFi-human-WGS-WDL/backends/azure/inputs.azure.json new file mode 100644 index 0000000..8d603d5 --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/azure/inputs.azure.json @@ -0,0 +1,73 @@ +{ + "humanwgs.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": [ + "File" + ], + "sex": "String?", + "affected": "Boolean", + "father_id": "String?", + "mother_id": "String?" 
+ } + ], + "phenotypes": [ + "String" + ] + }, + "humanwgs.reference": { + "name": "GRCh38", + "fasta": { + "data": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "data_index": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + }, + "pbsv_splits": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json", + "tandem_repeat_bed": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed", + "trgt_tandem_repeat_bed": "/datasetpbrarediseases/dataset/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", + "hificnv_exclude_bed": { + "data": "/datasetpbrarediseases/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz", + "data_index": "/datasetpbrarediseases/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi" + }, + "hificnv_expected_bed_male": "/datasetpbrarediseases/dataset/GRCh38/hificnv/expected_cn.hg38.XY.bed", + "hificnv_expected_bed_female": "/datasetpbrarediseases/dataset/GRCh38/hificnv/expected_cn.hg38.XX.bed", + "gnomad_af": "/datasetpbrarediseases/dataset/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip", + "hprc_af": "/datasetpbrarediseases/dataset/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip", + "gff": "/datasetpbrarediseases/dataset/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", + "population_vcfs": [ + { + "data": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz", + "data_index": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi" + }, + { + "data": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz", + "data_index": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi" + }, + { + "data": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz", + "data_index": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi" + }, + { + "data": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz", + "data_index": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi" + } + ] + }, + "humanwgs.slivar_data": { + "slivar_js": "/datasetpbrarediseases/dataset/slivar/slivar-functions.v0.2.8.js", + "hpo_terms": "/datasetpbrarediseases/dataset/hpo/hpoTerms.txt", + "hpo_dag": "/datasetpbrarediseases/dataset/hpo/hpoDag.txt", + "hpo_annotations": "/datasetpbrarediseases/dataset/hpo/ensembl.hpoPhenotype.tsv", + "ensembl_to_hgnc": "/datasetpbrarediseases/dataset/genes/ensembl.hgncSymbol.tsv", + "lof_lookup": "/datasetpbrarediseases/dataset/slivar/lof_lookup.v2.1.1.txt", + "clinvar_lookup": "/datasetpbrarediseases/dataset/slivar/clinvar_gene_desc.20221214T183140.txt" + }, + "humanwgs.deepvariant_version": "String (optional)", + "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? 
(optional)", + "humanwgs.pbsv_call_mem_gb": "Int (optional, default = if N<=3: 64 else 96)", + "humanwgs.glnexus_mem_gb": "Int (optional, default = 30)", + "humanwgs.run_tertiary_analysis": "Boolean (optional, default = false)", + "humanwgs.backend": "Azure", + "humanwgs.preemptible": "Boolean" +} diff --git a/HiFi-human-WGS-WDL/backends/gcp/README.md b/HiFi-human-WGS-WDL/backends/gcp/README.md new file mode 100644 index 0000000..e1ea7c4 --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/gcp/README.md @@ -0,0 +1,34 @@ +# Configuring Cromwell on GCP + +[Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/tutorials/PipelinesApi101/) on getting started with Google's genomics Pipelines API can be used to set up the resources needed to run the workflow. + +# Configuring and running the workflow + +## Filling out workflow inputs + +Fill out any information missing in [the inputs file](inputs.gcp.json). + +See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. + +### Determining available zones + +To determine available zones in GCP, run the following; available zones within a region can be found in the first column of the output: + +```bash +gcloud compute zones list | grep +``` + +For example, the zones in region us-central1 are `"us-central1-a us-central1-b us-central1c us-central1f"`. + +## Running the workflow + +### Running via Google's genomics Pipelines API + +[Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/tutorials/PipelinesApi101/) on getting started with Google's genomics Pipelines API can be used as an example for how to run the workflow. + + +# Reference data hosted in GCP + +GCP reference data is hosted in the `us-west1` region in the bucket `gs://pacbio-wdl`. This bucket is requester-pays, meaning that users will need to [provide a billing project in their Cromwell configuration](https://cromwell.readthedocs.io/en/stable/filesystems/GoogleCloudStorage/) in order to use files located in this bucket. + +To avoid egress charges, Cromwell should be set up to spin up compute resources in the same region in which the data is located. If possible, add cohort data to the same region as the reference dataset, or consider mirroring this dataset in the region where your data is located. See [Google's information about data storage and egress charges for more information](https://cloud.google.com/storage/pricing). diff --git a/HiFi-human-WGS-WDL/backends/gcp/inputs.gcp.json b/HiFi-human-WGS-WDL/backends/gcp/inputs.gcp.json new file mode 100644 index 0000000..483998e --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/gcp/inputs.gcp.json @@ -0,0 +1,74 @@ +{ + "humanwgs.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": [ + "File" + ], + "sex": "String?", + "affected": "Boolean", + "father_id": "String?", + "mother_id": "String?" 
+ } + ], + "phenotypes": [ + "String" + ] + }, + "humanwgs.reference": { + "name": "GRCh38", + "fasta": { + "data": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "data_index": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + }, + "pbsv_splits": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json", + "tandem_repeat_bed": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed", + "trgt_tandem_repeat_bed": "gs://pacbio-wdl/dataset/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", + "hificnv_exclude_bed": { + "data": "gs://pacbio-wdl/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz", + "data_index": "gs://pacbio-wdl/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi" + }, + "hificnv_expected_bed_male": "gs://pacbio-wdl/dataset/GRCh38/hificnv/expected_cn.hg38.XY.bed", + "hificnv_expected_bed_female": "gs://pacbio-wdl/dataset/GRCh38/hificnv/expected_cn.hg38.XX.bed", + "gnomad_af": "gs://pacbio-wdl/dataset/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip", + "hprc_af": "gs://pacbio-wdl/dataset/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip", + "gff": "gs://pacbio-wdl/dataset/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", + "population_vcfs": [ + { + "data": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz", + "data_index": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi" + }, + { + "data": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz", + "data_index": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi" + }, + { + "data": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz", + "data_index": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi" + }, + { + "data": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz", + "data_index": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi" + } + ] + }, + "humanwgs.slivar_data": { + "slivar_js": "gs://pacbio-wdl/dataset/slivar/slivar-functions.v0.2.8.js", + "hpo_terms": "gs://pacbio-wdl/dataset/hpo/hpoTerms.txt", + "hpo_dag": "gs://pacbio-wdl/dataset/hpo/hpoDag.txt", + "hpo_annotations": "gs://pacbio-wdl/dataset/hpo/ensembl.hpoPhenotype.tsv", + "ensembl_to_hgnc": "gs://pacbio-wdl/dataset/genes/ensembl.hgncSymbol.tsv", + "lof_lookup": "gs://pacbio-wdl/dataset/slivar/lof_lookup.v2.1.1.txt", + "clinvar_lookup": "gs://pacbio-wdl/dataset/slivar/clinvar_gene_desc.20221214T183140.txt" + }, + "humanwgs.deepvariant_version": "String (optional)", + "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)", + "humanwgs.pbsv_call_mem_gb": "Int (optional, default = if N<=3: 64 else 96)", + "humanwgs.glnexus_mem_gb": "Int (optional, default = 30)", + "humanwgs.run_tertiary_analysis": "Boolean (optional, default = false)", + "humanwgs.backend": "GCP", + "humanwgs.zones": "String", + "humanwgs.preemptible": "Boolean" +} diff --git a/HiFi-human-WGS-WDL/backends/hpc/README.md b/HiFi-human-WGS-WDL/backends/hpc/README.md new file mode 100644 index 0000000..48f018a --- /dev/null +++ b/HiFi-human-WGS-WDL/backends/hpc/README.md @@ -0,0 +1,48 @@ +Either `miniwdl` or `Cromwell` can be used to run workflows on the HPC. 
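+
+For example, one possible miniwdl setup is sketched below (assuming Python 3 with pip is available on a login node; the virtual-environment path is arbitrary, and version requirements and configuration details follow in the next sections):
+
+```bash
+# create an isolated environment and install miniwdl plus the SLURM plugin
+python3 -m venv ~/miniwdl-env
+source ~/miniwdl-env/bin/activate
+pip install 'miniwdl>=1.9.0' miniwdl-slurm
+
+# validate the workflow syntax and imports before submitting jobs to the cluster
+miniwdl check workflows/main.wdl
+```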
+
+# Installing and configuring `miniwdl`
+
+## Requirements
+
+- [`miniwdl`](https://github.com/chanzuckerberg/miniwdl) >= 1.9.0
+- [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm)
+
+## Configuring
+
+An [example miniwdl.cfg file](miniwdl.cfg) is provided here. This should be placed at `~/.config/miniwdl.cfg` and edited to match your SLURM configuration. This allows running workflows using a basic SLURM setup.
+
+# Installing and configuring `Cromwell`
+
+Cromwell supports a number of different HPC backends; see [Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/backends/HPC/) for more information on configuring each of the backends.
+
+# Configuring and running the workflow
+
+## Filling out workflow inputs
+
+Fill out any information missing in [the inputs file](inputs.hpc.json). Once you have downloaded the reference data bundle, ensure that you have replaced the `<local_path_prefix>` in the input template file with the local path to the reference datasets on your HPC.
+
+See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file.
+
+## Running the workflow
+
+### Running via miniwdl
+
+`miniwdl run workflows/main.wdl -i <inputs_json_file>`
+
+### Running via Cromwell
+
+`cromwell run workflows/main.wdl -i <inputs_json_file>`
+
+# Reference data bundle
+
+![https://doi.org/10.5281/zenodo.8415406](https://zenodo.org/badge/DOI/10.5281/zenodo.8415406.svg)
+
+Reference data is hosted on Zenodo at [10.5281/zenodo.8415406](https://zenodo.org/record/8415406). Download the reference data bundle and extract it to a location on your HPC, then update the input template file with the path to the reference data.
+
+```bash
+# download the reference data bundle
+wget https://zenodo.org/record/8415406/files/wdl-humanwgs.v1.0.2.resources.tgz
+
+# extract the reference data bundle and rename as dataset
+tar -xzf wdl-humanwgs.v1.0.2.resources.tgz && mv static_resources dataset
+```
diff --git a/HiFi-human-WGS-WDL/backends/hpc/inputs.hpc.json b/HiFi-human-WGS-WDL/backends/hpc/inputs.hpc.json
new file mode 100644
index 0000000..e5a6621
--- /dev/null
+++ b/HiFi-human-WGS-WDL/backends/hpc/inputs.hpc.json
@@ -0,0 +1,73 @@
+{
+	"humanwgs.cohort": {
+		"cohort_id": "String",
+		"samples": [
+			{
+				"sample_id": "String",
+				"movie_bams": [
+					"File"
+				],
+				"sex": "String?",
+				"affected": "Boolean",
+				"father_id": "String?",
+				"mother_id": "String?"
+			}
+		],
+		"phenotypes": [
+			"String"
+		]
+	},
+	"humanwgs.reference": {
+		"name": "GRCh38",
+		"fasta": {
+			"data": "<local_path_prefix>/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta",
+			"data_index": "<local_path_prefix>/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai"
+		},
+		"pbsv_splits": "<local_path_prefix>/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json",
+		"tandem_repeat_bed": "<local_path_prefix>/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed",
+		"trgt_tandem_repeat_bed": "<local_path_prefix>/dataset/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed",
+		"hificnv_exclude_bed": {
+			"data": "<local_path_prefix>/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz",
+			"data_index": "<local_path_prefix>/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi"
+		},
+		"hificnv_expected_bed_male": "<local_path_prefix>/dataset/GRCh38/hificnv/expected_cn.hg38.XY.bed",
+		"hificnv_expected_bed_female": "<local_path_prefix>/dataset/GRCh38/hificnv/expected_cn.hg38.XX.bed",
+		"gnomad_af": "<local_path_prefix>/dataset/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip",
+		"hprc_af": "<local_path_prefix>/dataset/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip",
+		"gff": "<local_path_prefix>/dataset/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz",
+		"population_vcfs": [
+			{
+				"data": "<local_path_prefix>/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz",
+				"data_index": "<local_path_prefix>/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi"
+			},
+			{
+				"data": "<local_path_prefix>/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz",
+				"data_index": "<local_path_prefix>/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi"
+			},
+			{
+				"data": "<local_path_prefix>/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz",
+				"data_index": "<local_path_prefix>/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi"
+			},
+			{
+				"data": "<local_path_prefix>/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz",
+				"data_index": "<local_path_prefix>/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi"
+			}
+		]
+	},
+	"humanwgs.slivar_data": {
+		"slivar_js": "<local_path_prefix>/dataset/slivar/slivar-functions.v0.2.8.js",
+		"hpo_terms": "<local_path_prefix>/dataset/hpo/hpoTerms.txt",
+		"hpo_dag": "<local_path_prefix>/dataset/hpo/hpoDag.txt",
+		"hpo_annotations": "<local_path_prefix>/dataset/hpo/ensembl.hpoPhenotype.tsv",
+		"ensembl_to_hgnc": "<local_path_prefix>/dataset/genes/ensembl.hgncSymbol.tsv",
+		"lof_lookup": "<local_path_prefix>/dataset/slivar/lof_lookup.v2.1.1.txt",
+		"clinvar_lookup": "<local_path_prefix>/dataset/slivar/clinvar_gene_desc.20221214T183140.txt"
+	},
+	"humanwgs.deepvariant_version": "String (optional)",
+	"humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)",
+	"humanwgs.pbsv_call_mem_gb": "Int (optional, default = if N<=3: 64 else 96)",
+	"humanwgs.glnexus_mem_gb": "Int (optional, default = 30)",
+	"humanwgs.run_tertiary_analysis": "Boolean (optional, default = false)",
+	"humanwgs.backend": "HPC",
+	"humanwgs.preemptible": true
+}
diff --git a/HiFi-human-WGS-WDL/backends/hpc/miniwdl.cfg b/HiFi-human-WGS-WDL/backends/hpc/miniwdl.cfg
new file mode 100644
index 0000000..3bdd33d
--- /dev/null
+++ b/HiFi-human-WGS-WDL/backends/hpc/miniwdl.cfg
@@ -0,0 +1,50 @@
+[scheduler]
+container_backend = slurm_singularity
+# task_concurrency defaults to the number of processors on the system.
+# since we submit the jobs to SLURM this is not necessary.
+# higher numbers mean miniwdl has to monitor more processes simultaneously,
+# which might impact performance.
+task_concurrency=200
+
+# This setting allows running tasks to continue, even if one other task fails.
+# Useful in combination with call caching. Prevents wasting resources by
+# cancelling half-finished jobs that would probably succeed.
+fail_fast = false
+
+[call_cache]
+# The following settings create a call cache under the current directory.
+# This prevents wasting unnecessary resources on the cluster by rerunning
+# jobs that have already succeeded.
+put = true
+get = true
+dir = "$PWD/miniwdl_call_cache"
+
+[task_runtime]
+# Setting a 'maxRetries' default allows jobs that fail due to intermittent
+# errors on the cluster to be retried.
+## Requires miniwdl >= 1.9.0
+command_shell = /bin/bash
+defaults = {
+        "maxRetries": 2,
+        "docker": "ubuntu:20.04"
+    }
+
+[singularity]
+# This plugin wraps the singularity backend. Make sure the settings are
+# appropriate for your cluster.
+exe = ["/usr/bin/singularity"]
+
+# the miniwdl default options contain options to run as a fake root, which
+# is not available on most clusters.
+run_options = [
+        "--containall"
+    ]
+
+# Location of the singularity images (optional). The miniwdl-slurm plugin
+# will set it to a directory inside $PWD. This location must be reachable
+# from the submit nodes.
+image_cache = "$PWD/miniwdl_singularity_cache"
+
+[slurm]
+# extra arguments passed to the srun command (optional).
+extra_args="--partition compute --comment 'run with miniwdl'"
diff --git a/HiFi-human-WGS-WDL/images/logo_wdl_workflows.svg b/HiFi-human-WGS-WDL/images/logo_wdl_workflows.svg
new file mode 100644
index 0000000..4f065f0
--- /dev/null
+++ b/HiFi-human-WGS-WDL/images/logo_wdl_workflows.svg
@@ -0,0 +1,83 @@
+[SVG markup stripped in extraction: "WDL workflows" logo]
\ No newline at end of file
diff --git a/HiFi-human-WGS-WDL/images/main.graphviz.svg b/HiFi-human-WGS-WDL/images/main.graphviz.svg
new file mode 100644
index 0000000..7df5907
--- /dev/null
+++ b/HiFi-human-WGS-WDL/images/main.graphviz.svg
@@ -0,0 +1,136 @@
+[SVG markup stripped in extraction. Surviving labels describe the main.wdl call graph:
+ backend_configuration -> default_runtime_attributes -> {sample_analysis, cohort_analysis, tertiary_analysis};
+ sample_analysis runs in scatter(cohort.samples); cohort_analysis runs in if(length(cohort.samples) > 1);
+ sample_analysis -> cohort_analysis; {sample_analysis, cohort_analysis} -> {slivar_small_variant_input_vcf, slivar_sv_input_vcf} -> tertiary_analysis, inside if(run_tertiary_analysis).]
diff --git a/HiFi-human-WGS-WDL/wdl-ci.config.json b/HiFi-human-WGS-WDL/wdl-ci.config.json
new file mode 100644
index 0000000..414f024
--- /dev/null
+++ b/HiFi-human-WGS-WDL/wdl-ci.config.json
@@ -0,0 +1,1148 @@
+{
+	"workflows": {
+		"workflows/humanwgs_structs.wdl": {
+			"key": "workflows/humanwgs_structs.wdl",
+			"name": "",
+			"description": "",
+			"tasks": {}
+		},
+		"workflows/main.wdl": {
+			"key": "workflows/main.wdl",
+			"name": "",
+			"description": "",
+			"tasks":
{} + }, + "workflows/cohort_analysis/cohort_analysis.wdl": { + "key": "workflows/cohort_analysis/cohort_analysis.wdl", + "name": "", + "description": "", + "tasks": {} + }, + "workflows/sample_analysis/sample_analysis.wdl": { + "key": "workflows/sample_analysis/sample_analysis.wdl", + "name": "", + "description": "", + "tasks": { + "pbmm2_align": { + "key": "pbmm2_align", + "digest": "3r4icze5zkps7m6xoruzvnfzk2fp4gqd", + "tests": [ + { + "inputs": { + "sample_id": "HG005", + "bam": "${input_file_path}/small_HG005/m64017_200723_190224.hifi_reads.bam", + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "reference_name": "GRCh38", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "aligned_bam": { + "value": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam", + "test_tasks": [ + "compare_file_basename", + "samtools_quickcheck" + ] + }, + "bam_stats": { + "value": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.read_length_and_quality.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + }, + "read_length_summary": { + "value": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.read_length_summary.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns", + "check_numeric" + ] + }, + "read_quality_summary": { + "value": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.read_quality_summary.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns", + "check_numeric" + ] + } + } + } + ] + }, + "bcftools": { + "key": "bcftools", + "digest": "cbfxlhk575vhxbh6spw7ceyhn2ljf7vu", + "tests": [ + { + "inputs": { + "vcf": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz", + "stats_params": "--apply-filters PASS --samples HG005", + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "roh_out": { + "value": "${resources_file_path}/HG005.GRCh38.deepvariant.bcftools_roh.out", + "test_tasks": [ + "compare_file_basename" + ] + }, + "roh_bed": { + "value": "${resources_file_path}/HG005.GRCh38.deepvariant.roh.bed", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "count_bed_columns", + "check_tab_delimited" + ] + }, + "stats": { + "value": "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.stats.txt", + "test_tasks": [ + "compare_file_basename", + "check_empty_lines" + ] + } + } + } + ] + }, + "merge_bams": { + "key": "merge_bams", + "digest": "ihskwtepnuvcllzbe3k2m73kuju37l34", + "tests": [ + { + "inputs": { + "bams": [ + "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.haplotagged.bam", + "${resources_file_path}/HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.haplotagged.bam" + ], + "output_bam_name": "${sample_id}.${reference_name}.haplotagged.bam", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "merged_bam": { + "value": "${resources_file_path}/HG005.GRCh38.haplotagged.bam", + "test_tasks": [ + "compare_file_basename", + "samtools_quickcheck" + ] + } + } + } + ] + }, + "trgt": { + "key": "trgt", + "digest": "ylzep5nroxhzjff43gkc6fs25ydor7dd", + "tests": [ + { + "inputs": { + "sample_id": 
"${sample_id}", + "sex": "MALE", + "bam": "${resources_file_path}/HG005.GRCh38.haplotagged.bam", + "bam_index": "${resources_file_path}/HG005.GRCh38.haplotagged.bam.bai", + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "tandem_repeat_bed": "${datasets_file_path}/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "spanning_reads": { + "value": "${resources_file_path}/HG005.GRCh38.haplotagged.trgt.spanning.sorted.bam", + "test_tasks": [ + "compare_file_basename", + "samtools_quickcheck", + "check_coordinate_sorted_alignment" + ] + }, + "repeat_vcf": { + "value": "${resources_file_path}/HG005.GRCh38.haplotagged.trgt.sorted.vcf.gz", + "test_tasks": [ + "compare_file_basename", + "vcftools_validator", + "check_sorted_vcf_bcf" + ] + } + } + } + ] + }, + "coverage_dropouts": { + "key": "coverage_dropouts", + "digest": "3el45hg36hlyx5cswr3dkvqfg644cvbn", + "tests": [ + { + "inputs": { + "bam": "${resources_file_path}/HG005.GRCh38.haplotagged.bam", + "bam_index": "${resources_file_path}/HG005.GRCh38.haplotagged.bam.bai", + "tandem_repeat_bed": "${datasets_file_path}/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", + "output_prefix": "${sample_id}.${reference_name}", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "trgt_dropouts": { + "value": "${resources_file_path}/HG005.GRCh38.trgt.dropouts.txt", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + } + } + } + ] + }, + "cpg_pileup": { + "key": "cpg_pileup", + "digest": "yw7vg33vgv3pomq6ozzy3dq65wklrc62", + "tests": [ + { + "inputs": { + "bam": "${resources_file_path}/HG005.GRCh38.haplotagged.bam", + "bam_index": "${resources_file_path}/HG005.GRCh38.haplotagged.bam.bai", + "output_prefix": "${sample_id}.${reference_name}", + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "pileup_beds": { + "value": [ + "${resources_file_path}/HG005.GRCh38.combined.bed", + "${resources_file_path}/HG005.GRCh38.hap1.bed", + "${resources_file_path}/HG005.GRCh38.hap2.bed" + ], + "test_tasks": [ + "compare_file_basename", + "check_tab_delimited", + "count_bed_columns" + ] + }, + "pileup_bigwigs": { + "value": [ + "${resources_file_path}/HG005.GRCh38.combined.bw", + "${resources_file_path}/HG005.GRCh38.hap1.bw", + "${resources_file_path}/HG005.GRCh38.hap2.bw" + ], + "test_tasks": [ + "compare_file_basename", + "bigwig_validator" + ] + } + } + } + ] + }, + "paraphase": { + "key": "paraphase", + "digest": "gzktyxvdrw73el5khnudlpu23x34lbxv", + "tests": [ + { + "inputs": { + "sample_id": "${sample_id}", + "bam": "/coac74908838b5dd7/inputs/small_dataset/paraphase/HG005.GRCh38.paraphase.test.bam", + "bam_index": "/coac74908838b5dd7/inputs/small_dataset/paraphase/HG005.GRCh38.paraphase.test.bam.bai", + "out_directory": "${sample_id}.paraphase", + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "output_json": { + 
"value": "${resources_file_path}/paraphase/${sample_id}.json", + "test_tasks": [ + "compare_file_basename", + "check_json" + ] + }, + "realigned_bam": { + "value": "${resources_file_path}/paraphase/${sample_id}_realigned_tagged.bam", + "test_tasks": [ + "compare_file_basename", + "samtools_quickcheck" + ] + } + } + } + ] + }, + "hificnv": { + "key": "hificnv", + "digest": "v5u3yha66r3tfyhzvhaye47h6u6q3glv", + "tests": [ + { + "inputs": { + "sample_id": "${sample_id}", + "sex": "MALE", + "bam": "${resources_file_path}/HG005.GRCh38.haplotagged.bam", + "bam_index": "${resources_file_path}/HG005.GRCh38.haplotagged.bam.bai", + "phased_vcf": "${resources_file_path}/HG005.GRCh38.deepvariant.phased.vcf.gz", + "phased_vcf_index": "${resources_file_path}/HG005.GRCh38.deepvariant.phased.vcf.gz", + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "exclude_bed": "${datasets_file_path}/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz", + "exclude_bed_index": "${datasets_file_path}/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi", + "expected_bed_male": "${datasets_file_path}/GRCh38/hificnv/expected_cn.hg38.XY.bed", + "expected_bed_female": "${datasets_file_path}/GRCh38/hificnv/expected_cn.hg38.XX.bed", + "output_prefix": "hificnv", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "cnv_vcf": { + "value": "${resources_file_path}/hificnv/hificnv.HG005.vcf.gz", + "test_tasks": [ + "compare_file_basename", + "vcftools_validator" + ] + }, + "copynum_bedgraph": { + "value": "${resources_file_path}/hificnv/hificnv.HG005.copynum.bedgraph", + "test_tasks": [ + "compare_file_basename", + "calculate_md5sum", + "check_tab_delimited", + "count_columns", + "check_chr_lines" + ] + }, + "depth_bw": { + "value": "${resources_file_path}/hificnv/hificnv.HG005.depth.bw", + "test_tasks": [ + "compare_file_basename", + "bigwig_validator" + ] + }, + "maf_bw": { + "value": "${resources_file_path}/hificnv/hificnv.HG005.maf.bw", + "test_tasks": [ + "compare_file_basename", + "bigwig_validator" + ] + } + } + } + ] + } + } + }, + "workflows/wdl-common/wdl/structs.wdl": { + "key": "workflows/wdl-common/wdl/structs.wdl", + "name": "", + "description": "", + "tasks": {} + }, + "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl", + "name": "", + "description": "", + "tasks": { + "pbsv_discover": { + "key": "pbsv_discover", + "digest": "lbv7nwockw3wcbkfvapzoc2wv7fcodnw", + "tests": [ + { + "inputs": { + "aligned_bam": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam", + "aligned_bam_index": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam.bai", + "reference_tandem_repeat_bed": "${datasets_file_path}/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "svsig": { + "value": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.svsig.gz", + "test_tasks": [ + "compare_file_basename", + "check_gzip", + "check_empty_lines" + ] + } + } + } + ] + } + } + }, + "workflows/wdl-common/wdl/tasks/glnexus.wdl": { + "key": "workflows/wdl-common/wdl/tasks/glnexus.wdl", + "name": "", + "description": "", + "tasks": { + "glnexus": { + "key": "glnexus", + "digest": "4jz5jgrdccgii5ldycmjryzajkrdscji", + "tests": [ + { + "inputs": { 
+ "cohort_id": "hg005-small-cohort", + "gvcfs": [ + "${resources_file_path}/HG005.GRCh38.deepvariant.g.vcf.gz", + "${resources_file_path}/HG006.GRCh38.deepvariant.g.vcf.gz", + "${resources_file_path}/HG007.GRCh38.deepvariant.g.vcf.gz" + ], + "gvcf_indices": [ + "${resources_file_path}/HG005.GRCh38.deepvariant.g.vcf.gz.tbi", + "${resources_file_path}/HG006.GRCh38.deepvariant.g.vcf.gz.tbi", + "${resources_file_path}/HG007.GRCh38.deepvariant.g.vcf.gz.tbi" + ], + "reference_name": "GRCh38", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "vcf": { + "value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.vcf.gz", + "test_tasks": [ + "compare_file_basename", + "vcftools_validator", + "check_gzip" + ] + } + } + } + ] + } + } + }, + "workflows/wdl-common/wdl/tasks/pbsv_call.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pbsv_call.wdl", + "name": "", + "description": "", + "tasks": { + "pbsv_call": { + "key": "pbsv_call", + "digest": "o5xv2etbm2j4s32d5xs626xj6sp2ykmj", + "tests": [ + { + "inputs": { + "sample_id": "HG005", + "svsigs": [ + "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.svsig.gz" + ], + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "reference_name": "GRCh38", + "regions": [ + "chr6" + ], + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "pbsv_vcf": { + "value": "${resources_file_path}/HG005.GRCh38.pbsv.vcf.gz", + "test_tasks": [ + "compare_file_basename", + "vcftools_validator", + "check_gzip" + ] + } + } + } + ] + } + } + }, + "workflows/wdl-common/wdl/tasks/concat_vcf.wdl": { + "key": "workflows/wdl-common/wdl/tasks/concat_vcf.wdl", + "name": "", + "description": "", + "tasks": { + "concat_vcf": { + "key": "concat_vcf", + "digest": "xkyvutmrg3gz6zgabdmwcjvcbwrbwwp7", + "tests": [ + { + "inputs": { + "vcfs": [ + "${resources_file_path}/HG005.GRCh38.chr5.pbsv.vcf.gz", + "${resources_file_path}/HG005.GRCh38.chr6.pbsv.vcf.gz" + ], + "vcf_indices": [ + "${resources_file_path}/HG005.GRCh38.chr5.pbsv.vcf.gz.tbi", + "${resources_file_path}/HG005.GRCh38.chr6.pbsv.vcf.gz.tbi" + ], + "output_vcf_name": "HG005.GRCh38.chr5chr6.pbsv.vcf.gz", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "concatenated_vcf": { + "value": "${resources_file_path}/HG005.GRCh38.chr5chr6.pbsv.vcf.gz", + "test_tasks": [ + "compare_file_basename", + "vcftools_validator", + "check_gzip" + ] + } + } + } + ] + } + } + }, + "workflows/wdl-common/wdl/tasks/samtools_fasta.wdl": { + "key": "workflows/wdl-common/wdl/tasks/samtools_fasta.wdl", + "name": "", + "description": "", + "tasks": { + "samtools_fasta": { + "key": "samtools_fasta", + "digest": "x336uu76d5c6nzls2vgntvoqrnhex5q4", + "tests": [ + { + "inputs": { + "bam": "${input_file_path}/small_HG005/m64017_200723_190224.hifi_reads.bam", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "reads_fasta": { + "value": "${resources_file_path}/m64017_200723_190224.hifi_reads.fasta", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_empty_lines", + "fasta_validator" + ] + } + } + } + ] + } + } + }, + "workflows/wdl-common/wdl/tasks/mosdepth.wdl": { + "key": "workflows/wdl-common/wdl/tasks/mosdepth.wdl", + "name": "", + "description": "", + "tasks": { + "mosdepth": { + "key": "mosdepth", + "digest": 
"4uqrkpwl5zu5f5s53ef2ic6trlefamvi", + "tests": [ + { + "inputs": { + "aligned_bam": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam", + "aligned_bam_index": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam.bai", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "region_bed": { + "value": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.regions.bed.gz", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_bed_columns" + ] + }, + "summary": { + "value": "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.mosdepth.summary.txt", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + } + } + } + ] + } + } + }, + "workflows/wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl": { + "key": "workflows/wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl", + "name": "", + "description": "", + "tasks": {} + }, + "workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl": { + "key": "workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl", + "name": "", + "description": "", + "tasks": { + "deepvariant_make_examples": { + "key": "deepvariant_make_examples", + "digest": "35kzpf37semcoxs7frzvhjrc4zvwoyan", + "tests": [ + { + "inputs": { + "sample_id": "${sample_id}", + "aligned_bams": [ + "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.bam" + ], + "aligned_bam_indices": [ + "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.bam.bai" + ], + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "task_start_index": 0, + "tasks_per_shard": 8, + "total_deepvariant_tasks": 64, + "deepvariant_version": "1.5.0", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "example_tfrecord_tar": { + "value": "${resources_file_path}/deepvariant/${sample_id}.0.example_tfrecords.tar.gz", + "test_tasks": [ + "compare_file_basename" + ] + }, + "nonvariant_site_tfrecord_tar": { + "value": "${resources_file_path}/deepvariant/${sample_id}.0.nonvariant_site_tfrecords.tar.gz", + "test_tasks": [ + "compare_file_basename" + ] + } + } + } + ] + }, + "deepvariant_call_variants": { + "key": 
"deepvariant_call_variants", + "digest": "a6ksi3haiz5pye7p64c67zeeauit7gqf", + "tests": [ + { + "inputs": { + "sample_id": "${sample_id}", + "reference_name": "GRCh38", + "example_tfrecord_tars": [ + "${resources_file_path}/deepvariant/${sample_id}.0.example_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.8.example_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.16.example_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.24.example_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.32.example_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.40.example_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.48.example_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.56.example_tfrecords.tar.gz" + ], + "total_deepvariant_tasks": 64, + "deepvariant_version": "1.5.0", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "tfrecord": { + "value": "${resources_file_path}/deepvariant/${sample_id}.GRCh38.call_variants_output.tfrecord.gz", + "test_tasks": [ + "compare_file_basename", + "check_gzip" + ] + } + } + } + ] + }, + "deepvariant_postprocess_variants": { + "key": "deepvariant_postprocess_variants", + "digest": "afmkoy2hy4lcyolms7n6lgzuzudr7uxx", + "tests": [ + { + "inputs": { + "sample_id": "${sample_id}", + "tfrecord": "${resources_file_path}/deepvariant/${sample_id}.GRCh38.call_variants_output.tfrecord.gz", + "nonvariant_site_tfrecord_tars": [ + "${resources_file_path}/deepvariant/${sample_id}.0.nonvariant_site_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.8.nonvariant_site_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.16.nonvariant_site_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.24.nonvariant_site_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.32.nonvariant_site_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.40.nonvariant_site_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.48.nonvariant_site_tfrecords.tar.gz", + "${resources_file_path}/deepvariant/${sample_id}.56.nonvariant_site_tfrecords.tar.gz" + ], + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "reference_name": "GRCh38", + "total_deepvariant_tasks": 64, + "deepvariant_version": "1.5.0", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "vcf": { + "value": "${resources_file_path}/${sample_id}.GRCh38.deepvariant.vcf.gz", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_gzip", + "vcftools_validator" + ] + }, + "gvcf": { + "value": "${resources_file_path}/${sample_id}.GRCh38.deepvariant.g.vcf.gz", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_gzip", + "vcftools_validator" + ] + } + } + } + ] + } + } + }, + "workflows/tertiary_analysis/tertiary_analysis.wdl": { + "key": "workflows/tertiary_analysis/tertiary_analysis.wdl", + "name": "", + "description": "", + "tasks": { + "write_yaml_ped_phrank": { + "key": "write_yaml_ped_phrank", + "digest": "e4yxyjj6vw35pxz434pgfalxpa4xh72n", + "tests": [ + { + "inputs": { + "cohort_id": "hg005-small-cohort", + "cohort_json": "${resources_file_path}/cohort.json", + "hpo_terms": "${datasets_file_path}/hpo/hpoTerms.txt", + "hpo_dag": "${datasets_file_path}/hpo/hpoDag.txt", + 
"hpo_annotations": "${datasets_file_path}/hpo/ensembl.hpoPhenotype.tsv", + "ensembl_to_hgnc": "${datasets_file_path}/hpo/ensembl.hgncSymbol.tsv", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "cohort_yaml": { + "value": "${resources_file_path}/hg005-small-cohort.yml", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_yaml" + ] + }, + "pedigree": { + "value": "${resources_file_path}/hg005-small-cohort.ped", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + }, + "phrank_lookup": { + "value": "${resources_file_path}/hg005-small-cohort_phrank.tsv", + "test_tasks": [ + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + } + } + } + ] + }, + "slivar_small_variant": { + "key": "slivar_small_variant", + "digest": "rrak4b2uphyuonanbjtyjnub2vu5mkkl", + "tests": [ + { + "inputs": { + "vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz", + "vcf_index": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz.tbi", + "pedigree": "${resources_file_path}/hg005-small-cohort.ped", + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "slivar_js": "${datasets_file_path}/slivar/slivar-functions.v0.2.8.js", + "gnomad_af": "${datasets_file_path}/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip", + "hprc_af": "${datasets_file_path}/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip", + "gff": "${datasets_file_path}/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", + "lof_lookup": "${datasets_file_path}/slivar/lof_lookup.v2.1.1.txt", + "clinvar_lookup": "${datasets_file_path}/slivar/clinvar_gene_desc.20221214T183140.txt", + "phrank_lookup": "${resources_file_path}/hg005-small-cohort_phrank.tsv", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "filtered_vcf": { + "value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.vcf.gz", + "test_tasks": [ + "compare_file_basename", + "vcftools_validator", + "check_gzip" + ] + }, + "compound_het_vcf": { + "value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.compound_hets.vcf.gz", + "test_tasks": [ + "compare_file_basename", + "vcftools_validator", + "check_gzip" + ] + }, + "filtered_tsv": { + "value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + }, + "compound_het_tsv": { + "value": "${resources_file_path}/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.norm.slivar.compound_hets.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + } + } + } + ] + }, + "svpack_filter_annotated": { + "key": "svpack_filter_annotated", + "digest": "iyov6j7rcjp3llujj37q3clgpcbfduzh", + "tests": [ + { + "inputs": { + "sv_vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.pbsv.vcf.gz", + "population_vcfs": [ + "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz", + "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz", + "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz", + 
"/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz" + ], + "population_vcf_indices": [ + "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi", + "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi", + "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi", + "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi" + ], + "gff": "${datasets_file_path}/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "svpack_vcf": { + "value": "${resources_file_path}/hg005-small-cohort.GRCh38.pbsv.svpack.vcf.gz", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "vcftools_validator" + ] + } + } + } + ] + }, + "slivar_svpack_tsv": { + "key": "slivar_svpack_tsv", + "digest": "nizbjqia5xoqanjo67nmcwwa3m5y7eko", + "tests": [ + { + "inputs": { + "filtered_vcf": "${resources_file_path}/hg005-small-cohort.GRCh38.pbsv.svpack.vcf.gz", + "pedigree": "${resources_file_path}/hg005-small-cohort.ped", + "lof_lookup": "${datasets_file_path}/slivar/lof_lookup.v2.1.1.txt", + "clinvar_lookup": "${datasets_file_path}/slivar/clinvar_gene_desc.20221214T183140.txt", + "phrank_lookup": "${resources_file_path}/hg005-small-cohort_phrank.tsv", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "svpack_tsv": { + "value": "${resources_file_path}/hg005-small-cohort.GRCh38.pbsv.svpack.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + } + } + } + ] + } + } + }, + "workflows/wdl-common/wdl/workflows/hiphase/hiphase.wdl": { + "key": "workflows/wdl-common/wdl/workflows/hiphase/hiphase.wdl", + "name": "", + "description": "", + "tasks": { + "run_hiphase": { + "key": "run_hiphase", + "digest": "6k2rtel3k6747xhcfnjufqfzgcnb7g5v", + "tests": [ + { + "inputs": { + "id": "HG005", + "refname": "GRCh38", + "sample_ids": [ + "HG005" + ], + "vcfs": [ + "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz", + "${resources_file_path}/HG005.GRCh38.pbsv.vcf.gz" + ], + "vcf_indices": [ + "${resources_file_path}/HG005.GRCh38.deepvariant.vcf.gz.tbi", + "${resources_file_path}/HG005.GRCh38.pbsv.vcf.gz.tbi" + ], + "phased_vcf_names": [ + "HG005.GRCh38.deepvariant.phased.vcf.gz", + "HG005.GRCh38.pbsv.phased.vcf.gz" + ], + "phased_vcf_index_names": [ + "HG005.GRCh38.deepvariant.phased.vcf.gz.tbi", + "HG005.GRCh38.pbsv.phased.vcf.gz.tbi" + ], + "bams": [ + "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.bam" + ], + "bam_indices": [ + "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.bam.bai", + 
"${resources_file_path}/HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.bam.bai" + ], + "haplotagged_bam_names": [ + "HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.haplotagged.bam", + "HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.haplotagged.bam", + "HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.haplotagged.bam", + "HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.haplotagged.bam", + "HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.haplotagged.bam", + "HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.haplotagged.bam", + "HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.haplotagged.bam" + ], + "haplotagged_bam_index_names": [ + "HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.haplotagged.bam.bai", + "HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.haplotagged.bam.bai", + "HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.haplotagged.bam.bai", + "HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.haplotagged.bam.bai", + "HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.haplotagged.bam.bai", + "HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.haplotagged.bam.bai", + "HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.haplotagged.bam.bai" + ], + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "phased_vcfs": { + "value": [ + "${resources_file_path}/hiphase/HG005.GRCh38.deepvariant.phased.vcf.gz", + "${resources_file_path}/hiphase/HG005.GRCh38.pbsv.phased.vcf.gz" + ], + "test_tasks": [ + "compare_file_basename", + "vcftools_validator", + "check_gzip" + ] + }, + "haplotagged_bams": { + "value": [ + "${resources_file_path}/hiphase/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.haplotagged.bam", + "${resources_file_path}/hiphase/HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.haplotagged.bam", + "${resources_file_path}/hiphase/HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.haplotagged.bam", + "${resources_file_path}/hiphase/HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.haplotagged.bam", + "${resources_file_path}/hiphase/HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.haplotagged.bam", + "${resources_file_path}/hiphase/HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.haplotagged.bam", + "${resources_file_path}/hiphase/HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.haplotagged.bam" + ], + "test_tasks": [ + "compare_file_basename", + "samtools_quickcheck" + ] + }, + "hiphase_stats": { + "value": "${resources_file_path}/hiphase/HG005.GRCh38.hiphase.stats.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + }, + "hiphase_blocks": { + "value": "${resources_file_path}/hiphase/HG005.GRCh38.hiphase.blocks.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + }, + "hiphase_haplotags": { + "value": "${resources_file_path}/hiphase/HG005.GRCh38.hiphase.haplotags.tsv", + "test_tasks": [ + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + } + } + }, + { + "inputs": { + "id": 
"hg005-small-cohort", + "refname": "GRCh38", + "sample_ids": [ + "HG005", + "HG006", + "HG007" + ], + "vcfs": [ + "${resources_file_path}/hiphase/hg005-small-cohort.GRCh38.deepvariant.glnexus.vcf.gz", + "${resources_file_path}/hiphase/hg005-small-cohort.GRCh38.pbsv.vcf.gz" + ], + "vcf_indices": [ + "${resources_file_path}/hiphase/hg005-small-cohort.GRCh38.deepvariant.glnexus.vcf.gz.tbi", + "${resources_file_path}/hiphase/hg005-small-cohort.GRCh38.pbsv.vcf.gz.tbi" + ], + "phased_vcf_names": [ + "hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz", + "hg005-small-cohort.GRCh38.pbsv.phased.vcf.gz" + ], + "phased_vcf_index_names": [ + "hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz.tbi", + "hg005-small-cohort.GRCh38.pbsv.phased.vcf.gz.tbi" + ], + "bams": [ + "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG006.m64017_191209_211903.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG006.m64017_191211_182504.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG006.m64017_191213_003759.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG006.m64017_191214_070352.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG006.m64017_200107_170917.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG006.m64109_200210_210230.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG007.m64017_191216_194629.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG007.m64017_191218_164535.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG007.m64017_191219_225837.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG007.m64017_191221_052416.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG007.m64017_200108_232219.hifi_reads.GRCh38.aligned.bam", + "${resources_file_path}/HG007.m64017_200112_090459.hifi_reads.GRCh38.aligned.bam" + ], + "bam_indices": [ + "${resources_file_path}/HG005.m64017_200723_190224.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64017_200730_190124.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64017_200801_011415.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64017_200802_073944.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200304_195708.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200309_192110.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG005.m64109_200311_013444.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG006.m64017_191209_211903.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG006.m64017_191211_182504.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG006.m64017_191213_003759.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG006.m64017_191214_070352.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG006.m64017_200107_170917.hifi_reads.GRCh38.aligned.bam.bai", + 
"${resources_file_path}/HG006.m64109_200210_210230.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG007.m64017_191216_194629.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG007.m64017_191218_164535.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG007.m64017_191219_225837.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG007.m64017_191221_052416.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG007.m64017_200108_232219.hifi_reads.GRCh38.aligned.bam.bai", + "${resources_file_path}/HG007.m64017_200112_090459.hifi_reads.GRCh38.aligned.bam.bai" + ], + "haplotagged_bam_names": [], + "haplotagged_bam_index_names": [], + "reference": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", + "reference_index": "${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", + "runtime_attributes": "${default_runtime_attributes}" + }, + "output_tests": { + "phased_vcfs": { + "value": [ + "${resources_file_path}/hiphase/hg005-small-cohort.GRCh38.deepvariant.glnexus.phased.vcf.gz", + "${resources_file_path}/hiphase/hg005-small-cohort.GRCh38.pbsv.phased.vcf.gz" + ], + "test_tasks": [ + "compare_file_basename", + "vcftools_validator", + "check_gzip" + ] + }, + "hiphase_stats": { + "value": "${resources_file_path}/hiphase/hg005-small-cohort.GRCh38.hiphase.stats.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + }, + "hiphase_blocks": { + "value": "${resources_file_path}/hiphase/hg005-small-cohort.GRCh38.hiphase.blocks.tsv", + "test_tasks": [ + "calculate_md5sum", + "compare_file_basename", + "check_tab_delimited", + "count_columns" + ] + } + } + } + ] + } + } + } + }, + "engines": { + "c3213beb-97bc-4adc-9bd8-cdb3ac83b398": { + "key": "c3213beb-97bc-4adc-9bd8-cdb3ac83b398", + "enabled": true, + "name": "PacBio CoA" + } + }, + "test_params": { + "global_params": { + "sample_id": "HG005", + "reference_name": "GRCh38", + "default_runtime_attributes": { + "preemptible_tries": 3, + "max_retries": 3, + "zones": "", + "queue_arn": "", + "container_registry": "quay.io/pacbio" + }, + "on_demand_runtime_attributes": { + "preemptible_tries": 0, + "max_retries": 0, + "zones": "", + "queue_arn": "", + "container_registry": "quay.io/pacbio" + } + }, + "engine_params": { + "c3213beb-97bc-4adc-9bd8-cdb3ac83b398": { + "input_file_path": "/coac74908838b5dd7/inputs/small_dataset/chr6.p23", + "resources_file_path": "/coac74908838b5dd7/inputs/wdl-ci/humanwgs", + "datasets_file_path": "/datasetpbrarediseases/dataset" + } + } + } +} \ No newline at end of file diff --git a/HiFi-human-WGS-WDL/workflows/cohort_analysis/cohort_analysis.wdl b/HiFi-human-WGS-WDL/workflows/cohort_analysis/cohort_analysis.wdl new file mode 100644 index 0000000..8a9970b --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/cohort_analysis/cohort_analysis.wdl @@ -0,0 +1,112 @@ +version 1.0 + +# Run joint genotyping for a cohort. This workflow will be run if there is more than one sample in the cohort. + +import "../humanwgs_structs.wdl" +import "../wdl-common/wdl/tasks/pbsv_call.wdl" as PbsvCall +import "../wdl-common/wdl/tasks/concat_vcf.wdl" as ConcatVcf +import "../wdl-common/wdl/tasks/glnexus.wdl" as Glnexus +import "../wdl-common/wdl/workflows/hiphase/hiphase.wdl" as HiPhase + +workflow cohort_analysis { + input { + String cohort_id + Array[String] sample_ids + Array[IndexData] aligned_bams + Array[File] svsigs + Array[IndexData] gvcfs + + ReferenceData reference + + Int? 
pbsv_call_mem_gb + Int? glnexus_mem_gb + + RuntimeAttributes default_runtime_attributes + } + + Int sample_count = length(sample_ids) + Array[Array[String]] pbsv_splits = read_json(reference.pbsv_splits) + + scatter (gvcf_object in gvcfs) { + File gvcf = gvcf_object.data + File gvcf_index = gvcf_object.data_index + } + + scatter (shard_index in range(length(pbsv_splits))) { + Array[String] region_set = pbsv_splits[shard_index] + + call PbsvCall.pbsv_call { + input: + sample_id = cohort_id + ".joint", + svsigs = svsigs, + sample_count = sample_count, + reference = reference.fasta.data, + reference_index = reference.fasta.data_index, + reference_name = reference.name, + shard_index = shard_index, + regions = region_set, + mem_gb = pbsv_call_mem_gb, + runtime_attributes = default_runtime_attributes + } + } + + call ConcatVcf.concat_vcf { + input: + vcfs = pbsv_call.pbsv_vcf, + vcf_indices = pbsv_call.pbsv_vcf_index, + output_vcf_name = "~{cohort_id}.joint.~{reference.name}.pbsv.vcf.gz", + runtime_attributes = default_runtime_attributes + } + + IndexData zipped_pbsv_vcf = { + "data": concat_vcf.concatenated_vcf, + "data_index": concat_vcf.concatenated_vcf_index + } + + call Glnexus.glnexus { + input: + cohort_id = cohort_id + ".joint", + gvcfs = gvcf, + gvcf_indices = gvcf_index, + reference_name = reference.name, + mem_gb = glnexus_mem_gb, + runtime_attributes = default_runtime_attributes + } + + IndexData glnexus_vcf = { + "data": glnexus.vcf, + "data_index": glnexus.vcf_index + } + + call HiPhase.hiphase { + # VCF order: small variants, SVs + input: + id = cohort_id + ".joint", + refname = reference.name, + sample_ids = sample_ids, + vcfs = [glnexus_vcf, zipped_pbsv_vcf], + bams = aligned_bams, + haplotag = false, + reference_fasta = reference.fasta, + default_runtime_attributes = default_runtime_attributes + } + + output { + IndexData phased_joint_small_variant_vcf = hiphase.phased_vcfs[0] + IndexData phased_joint_sv_vcf = hiphase.phased_vcfs[1] + File hiphase_stats = hiphase.hiphase_stats + File hiphase_blocks = hiphase.hiphase_blocks + } + + parameter_meta { + cohort_id: {help: "Cohort ID; used for naming files"} + sample_ids: {help: "Sample IDs for all samples in the cohort"} + aligned_bams: {help: "BAM and index aligned to the reference genome for each movie associated with all samples in the cohort"} + svsigs: {help: "pbsv svsig files for each sample and movie BAM in the cohort"} + gvcfs: {help: "gVCF for each sample in the cohort"} + reference: {help: "Reference genome data"} + pbsv_call_mem_gb: {help: "Optional amount of RAM in GB for pbsv_call; default 64 for cohorts N<=3, 96 for cohorts N>3"} + glnexus_mem_gb: {help: "Optional amount of RAM in GB for glnexus; default 30"} + default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} + } +} diff --git a/HiFi-human-WGS-WDL/workflows/cohort_analysis/inputs.json b/HiFi-human-WGS-WDL/workflows/cohort_analysis/inputs.json new file mode 100644 index 0000000..fab5ca6 --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/cohort_analysis/inputs.json @@ -0,0 +1,50 @@ +{ + "cohort_analysis.cohort_id": "String", + "cohort_analysis.sample_ids": [ + "String" + ], + "cohort_analysis.aligned_bams": [ + { + "data": "File", + "data_index": "File" + } + ], + "cohort_analysis.svsigs": [ + "File" + ], + "cohort_analysis.gvcfs": [ + { + "data": "File", + "data_index": "File" + } + ], + "cohort_analysis.reference": { + "name": "String", + "fasta": { + "data": "File", + "data_index": "File" + }, + 
"tandem_repeat_bed": "File", + "trgt_tandem_repeat_bed": "File", + "hificnv_exclude_bed": { + "data": "File", + "data_index": "File" + }, + "pbsv_splits": "File", + "hificnv_expected_bed_male": "File", + "hificnv_expected_bed_female": "File", + "gnomad_af": "File? (optional)", + "hprc_af": "File? (optional)", + "gff": "File? (optional)", + "population_vcfs": "Array[WomCompositeType {\n data -> File\ndata_index -> File \n}]? (optional)" + }, + "cohort_analysis.glnexus_mem_gb": "Int? (optional)", + "cohort_analysis.pbsv_call_mem_gb": "Int? (optional)", + "cohort_analysis.default_runtime_attributes": { + "preemptible_tries": "Int", + "max_retries": "Int", + "zones": "String", + "queue_arn": "String", + "container_registry": "String" + } +} diff --git a/HiFi-human-WGS-WDL/workflows/humanwgs_structs.wdl b/HiFi-human-WGS-WDL/workflows/humanwgs_structs.wdl new file mode 100644 index 0000000..1932ab0 --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/humanwgs_structs.wdl @@ -0,0 +1,51 @@ +version 1.0 + +import "wdl-common/wdl/structs.wdl" + +struct ReferenceData { + String name + IndexData fasta + + File pbsv_splits + + File tandem_repeat_bed + File trgt_tandem_repeat_bed + + IndexData hificnv_exclude_bed + File hificnv_expected_bed_male + File hificnv_expected_bed_female + + File? gnomad_af + File? hprc_af + File? gff + + Array[IndexData]? population_vcfs +} + +struct Sample { + String sample_id + Array[File] movie_bams + + String? sex + Boolean affected + + String? father_id + String? mother_id +} + +struct Cohort { + String cohort_id + Array[Sample] samples + + Array[String] phenotypes +} + +struct SlivarData { + File slivar_js + File hpo_terms + File hpo_dag + File hpo_annotations + File ensembl_to_hgnc + File lof_lookup + File clinvar_lookup +} diff --git a/HiFi-human-WGS-WDL/workflows/input_template.json b/HiFi-human-WGS-WDL/workflows/input_template.json new file mode 100644 index 0000000..39632a2 --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/input_template.json @@ -0,0 +1,54 @@ +{ + "humanwgs.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": "Array[File]", + "sex": "String?", + "affected": "Boolean", + "father_id": "String? (optional)", + "mother_id": "String? (optional)" + } + ], + "phenotypes": "Array[String]" + }, + "humanwgs.reference": { + "name": "String", + "fasta": { + "data": "File", + "data_index": "File" + }, + "tandem_repeat_bed": "File", + "trgt_tandem_repeat_bed": "File", + "hificnv_exclude_bed": { + "data": "File", + "data_index": "File" + }, + "hificnv_expected_bed_male": "File", + "hificnv_expected_bed_female": "File", + "gnomad_af": "File? (optional)", + "hprc_af": "File? (optional)", + "gff": "File? (optional)", + "population_vcfs": "Array[WomCompositeType {\n data -> File\ndata_index -> File \n}]? (optional)" + }, + "humanwgs.slivar_data": { + "slivar_js": "File", + "hpo_terms": "File", + "hpo_dag": "File", + "hpo_annotations": "File", + "ensembl_to_hgnc": "File", + "lof_lookup": "File", + "clinvar_lookup": "File" + }, + "humanwgs.deepvariant_version": "String? (optional)", + "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)", + "humanwgs.pbsv_call_mem_gb": "Int? (optional)", + "humanwgs.glnexus_mem_gb": "Int? (optional)", + "humanwgs.run_tertiary_analysis": "Boolean? (optional, default = false)", + "humanwgs.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", + "humanwgs.zones": "String? 
(optional); required if backend is set to 'GCP' or 'AWS'", + "humanwgs.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", + "humanwgs.aws_on_demand_queue_arn": "String? (optional); required if backend is set to 'AWS'", + "humanwgs.preemptible": "Boolean" +} diff --git a/HiFi-human-WGS-WDL/workflows/main.wdl b/HiFi-human-WGS-WDL/workflows/main.wdl new file mode 100644 index 0000000..0ee719a --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/main.wdl @@ -0,0 +1,172 @@ +version 1.0 + +import "humanwgs_structs.wdl" +import "wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl" as BackendConfiguration +import "sample_analysis/sample_analysis.wdl" as SampleAnalysis +import "cohort_analysis/cohort_analysis.wdl" as CohortAnalysis +import "tertiary_analysis/tertiary_analysis.wdl" as TertiaryAnalysis + +workflow humanwgs { + input { + Cohort cohort + + ReferenceData reference + SlivarData? slivar_data + + String deepvariant_version = "1.5.0" + DeepVariantModel? deepvariant_model + + Int? pbsv_call_mem_gb + Int? glnexus_mem_gb + + Boolean run_tertiary_analysis = false + + # Backend configuration + String backend + String? zones + String? aws_spot_queue_arn + String? aws_on_demand_queue_arn + String? container_registry + + Boolean preemptible + } + + call BackendConfiguration.backend_configuration { + input: + backend = backend, + zones = zones, + aws_spot_queue_arn = aws_spot_queue_arn, + aws_on_demand_queue_arn = aws_on_demand_queue_arn, + container_registry = container_registry + } + + RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes + + scatter (sample in cohort.samples) { + call SampleAnalysis.sample_analysis { + input: + sample = sample, + reference = reference, + deepvariant_version = deepvariant_version, + deepvariant_model = deepvariant_model, + default_runtime_attributes = default_runtime_attributes + } + } + + if (length(cohort.samples) > 1) { + + scatter (sample in cohort.samples) { + String sample_id = sample.sample_id + } + + call CohortAnalysis.cohort_analysis { + input: + cohort_id = cohort.cohort_id, + sample_ids = sample_id, + aligned_bams = flatten(sample_analysis.aligned_bams), + svsigs = flatten(sample_analysis.svsigs), + gvcfs = sample_analysis.small_variant_gvcf, + reference = reference, + pbsv_call_mem_gb = pbsv_call_mem_gb, + glnexus_mem_gb = glnexus_mem_gb, + default_runtime_attributes = default_runtime_attributes + } + } + + if (run_tertiary_analysis && defined(slivar_data) && defined(reference.gnomad_af) && defined(reference.hprc_af) && defined(reference.gff) && defined(reference.population_vcfs)) { + IndexData slivar_small_variant_input_vcf = select_first([ + cohort_analysis.phased_joint_small_variant_vcf, + sample_analysis.phased_small_variant_vcf[0] + ]) + IndexData slivar_sv_input_vcf = select_first([ + cohort_analysis.phased_joint_sv_vcf, + sample_analysis.phased_sv_vcf[0] + ]) + + call TertiaryAnalysis.tertiary_analysis { + input: + cohort = cohort, + small_variant_vcf = slivar_small_variant_input_vcf, + sv_vcf = slivar_sv_input_vcf, + reference = reference, + slivar_data = select_first([slivar_data]), + default_runtime_attributes = default_runtime_attributes + } + } + + output { + # sample_analysis output + + # per movie stats, alignments + Array[Array[File]] bam_stats = sample_analysis.bam_stats + Array[Array[File]] read_length_summary = sample_analysis.read_length_summary + Array[Array[File]] 
read_quality_summary = sample_analysis.read_quality_summary + + # per sample small variant calls + Array[IndexData] small_variant_gvcfs = sample_analysis.small_variant_gvcf + Array[File] small_variant_vcf_stats = sample_analysis.small_variant_vcf_stats + Array[File] small_variant_roh_out = sample_analysis.small_variant_roh_out + Array[File] small_variant_roh_bed = sample_analysis.small_variant_roh_bed + + # per sample final phased variant calls and haplotagged alignments + Array[IndexData] sample_phased_small_variant_vcfs = sample_analysis.phased_small_variant_vcf + Array[IndexData] sample_phased_sv_vcfs = sample_analysis.phased_sv_vcf + Array[File] sample_hiphase_stats = sample_analysis.hiphase_stats + Array[File] sample_hiphase_blocks = sample_analysis.hiphase_blocks + Array[File] sample_hiphase_haplotags = sample_analysis.hiphase_haplotags + Array[IndexData] merged_haplotagged_bam = sample_analysis.merged_haplotagged_bam + Array[File] haplotagged_bam_mosdepth_summary = sample_analysis.haplotagged_bam_mosdepth_summary + Array[File] haplotagged_bam_mosdepth_region_bed = sample_analysis.haplotagged_bam_mosdepth_region_bed + + # per sample trgt outputs + Array[IndexData] trgt_spanning_reads = sample_analysis.trgt_spanning_reads + Array[IndexData] trgt_repeat_vcf = sample_analysis.trgt_repeat_vcf + Array[File] trgt_dropouts = sample_analysis.trgt_dropouts + + # per sample cpg outputs + Array[Array[File]] cpg_pileup_beds = sample_analysis.cpg_pileup_beds + Array[Array[File]] cpg_pileup_bigwigs = sample_analysis.cpg_pileup_bigwigs + + # per sample paraphase outputs + Array[File] paraphase_output_jsons = sample_analysis.paraphase_output_json + Array[IndexData] paraphase_realigned_bams = sample_analysis.paraphase_realigned_bam + Array[Array[File]] paraphase_vcfs = sample_analysis.paraphase_vcfs + + # per sample hificnv outputs + Array[IndexData] hificnv_vcfs = sample_analysis.hificnv_vcf + Array[File] hificnv_copynum_bedgraphs = sample_analysis.hificnv_copynum_bedgraph + Array[File] hificnv_depth_bws = sample_analysis.hificnv_depth_bw + Array[File] hificnv_maf_bws = sample_analysis.hificnv_maf_bw + + # cohort_analysis output + IndexData? cohort_sv_vcf = cohort_analysis.phased_joint_sv_vcf + IndexData? cohort_small_variant_vcf = cohort_analysis.phased_joint_small_variant_vcf + File? cohort_hiphase_stats = cohort_analysis.hiphase_stats + File? cohort_hiphase_blocks = cohort_analysis.hiphase_blocks + + # tertiary_analysis output + IndexData? filtered_small_variant_vcf = tertiary_analysis.filtered_small_variant_vcf + IndexData? compound_het_small_variant_vcf = tertiary_analysis.compound_het_small_variant_vcf + File? filtered_small_variant_tsv = tertiary_analysis.filtered_small_variant_tsv + File? compound_het_small_variant_tsv = tertiary_analysis.compound_het_small_variant_tsv + IndexData? filtered_svpack_vcf = tertiary_analysis.filtered_svpack_vcf + File? 
filtered_svpack_tsv = tertiary_analysis.filtered_svpack_tsv + } + + parameter_meta { + cohort: {help: "Sample information for the cohort"} + reference: {help: "Reference genome data"} + slivar_data: {help: "Data files used for annotation with slivar (required if `run_tertiary_analysis` is set to `true`)"} + deepvariant_version: {help: "Version of deepvariant to use"} + deepvariant_model: {help: "Optional deepvariant model file to use"} + pbsv_call_mem_gb: {help: "Optional amount of RAM in GB for pbsv_call; default 64 for cohorts N<=3, 96 for cohorts N>3"} + glnexus_mem_gb: {help: "Optional amount of RAM in GB for glnexus; default 30"} + run_tertiary_analysis: {help: "Run the optional tertiary analysis steps"} + backend: {help: "Backend where the workflow will be executed ['GCP', 'Azure', 'AWS', 'HPC']"} + zones: {help: "Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'"} + aws_spot_queue_arn: {help: "Queue ARN for the spot batch queue; required if backend is set to 'AWS'"} + aws_on_demand_queue_arn: {help: "Queue ARN for the on demand batch queue; required if backend is set to 'AWS'"} + container_registry: {help: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used."} + preemptible: {help: "Where possible, run tasks preemptibly"} + } +} diff --git a/HiFi-human-WGS-WDL/workflows/sample_analysis/inputs.json b/HiFi-human-WGS-WDL/workflows/sample_analysis/inputs.json new file mode 100644 index 0000000..6a7ff8c --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/sample_analysis/inputs.json @@ -0,0 +1,41 @@ +{ + "sample_analysis.sample": { + "sample_id": "String", + "movie_bams": [ + "File" + ], + "sex": "String? (optional)", + "affected": "Boolean", + "father_id": "String? (optional)", + "mother_id": "String? (optional)" + }, + "sample_analysis.reference": { + "name": "String", + "fasta": { + "data": "File", + "data_index": "File" + }, + "pbsv_splits": "File", + "tandem_repeat_bed": "File", + "trgt_tandem_repeat_bed": "File", + "hificnv_exclude_bed": { + "data": "File", + "data_index": "File" + }, + "hificnv_expected_bed_male": "File", + "hificnv_expected_bed_female": "File", + "gnomad_af": "File? (optional)", + "hprc_af": "File? (optional)", + "gff": "File? (optional)", + "population_vcfs": "Array[WomCompositeType {\n data -> File\ndata_index -> File \n}]? (optional)" + }, + "sample_analysis.deepvariant_version": "String", + "sample_analysis.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)", + "sample_analysis.default_runtime_attributes": { + "preemptible_tries": "Int", + "max_retries": "Int", + "zones": "String", + "queue_arn": "String", + "container_registry": "String" + } +} diff --git a/HiFi-human-WGS-WDL/workflows/sample_analysis/sample_analysis.wdl b/HiFi-human-WGS-WDL/workflows/sample_analysis/sample_analysis.wdl new file mode 100644 index 0000000..1f4f1a8 --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/sample_analysis/sample_analysis.wdl @@ -0,0 +1,737 @@ +version 1.0 + +# Run for each sample in the cohort. Aligns reads from each movie to the reference genome, then calls and phases small and structural variants. 
+ +import "../humanwgs_structs.wdl" +import "../wdl-common/wdl/tasks/pbsv_discover.wdl" as PbsvDiscover +import "../wdl-common/wdl/workflows/deepvariant/deepvariant.wdl" as DeepVariant +import "../wdl-common/wdl/tasks/mosdepth.wdl" as Mosdepth +import "../wdl-common/wdl/tasks/pbsv_call.wdl" as PbsvCall +import "../wdl-common/wdl/tasks/concat_vcf.wdl" as ConcatVcf +import "../wdl-common/wdl/workflows/hiphase/hiphase.wdl" as HiPhase + +workflow sample_analysis { + input { + Sample sample + + ReferenceData reference + + String deepvariant_version + DeepVariantModel? deepvariant_model + + RuntimeAttributes default_runtime_attributes + } + + Array[Array[String]] pbsv_splits = read_json(reference.pbsv_splits) + + scatter (movie_bam in sample.movie_bams) { + call pbmm2_align { + input: + sample_id = sample.sample_id, + bam = movie_bam, + reference = reference.fasta.data, + reference_index = reference.fasta.data_index, + reference_name = reference.name, + runtime_attributes = default_runtime_attributes + } + + call PbsvDiscover.pbsv_discover { + input: + aligned_bam = pbmm2_align.aligned_bam, + aligned_bam_index = pbmm2_align.aligned_bam_index, + reference_tandem_repeat_bed = reference.tandem_repeat_bed, + runtime_attributes = default_runtime_attributes + } + + IndexData aligned_bam = { + "data": pbmm2_align.aligned_bam, + "data_index": pbmm2_align.aligned_bam_index + } + } + + call DeepVariant.deepvariant { + input: + sample_id = sample.sample_id, + aligned_bams = aligned_bam, + reference_fasta = reference.fasta, + reference_name = reference.name, + deepvariant_version = deepvariant_version, + deepvariant_model = deepvariant_model, + default_runtime_attributes = default_runtime_attributes + } + + call bcftools { + input: + vcf = deepvariant.vcf.data, + stats_params = "--apply-filters PASS --samples ~{sample.sample_id}", + reference = reference.fasta.data, + runtime_attributes = default_runtime_attributes + } + + scatter (shard_index in range(length(pbsv_splits))) { + Array[String] region_set = pbsv_splits[shard_index] + + call PbsvCall.pbsv_call { + input: + sample_id = sample.sample_id, + svsigs = pbsv_discover.svsig, + reference = reference.fasta.data, + reference_index = reference.fasta.data_index, + reference_name = reference.name, + shard_index = shard_index, + regions = region_set, + runtime_attributes = default_runtime_attributes + } + } + + call ConcatVcf.concat_vcf { + input: + vcfs = pbsv_call.pbsv_vcf, + vcf_indices = pbsv_call.pbsv_vcf_index, + output_vcf_name = "~{sample.sample_id}.~{reference.name}.pbsv.vcf.gz", + runtime_attributes = default_runtime_attributes + } + + IndexData zipped_pbsv_vcf = { + "data": concat_vcf.concatenated_vcf, + "data_index": concat_vcf.concatenated_vcf_index + } + + call HiPhase.hiphase { + # vcfs order: small variants, SVs + input: + id = sample.sample_id, + refname = reference.name, + sample_ids = [sample.sample_id], + vcfs = [deepvariant.vcf, zipped_pbsv_vcf], + bams = aligned_bam, + haplotag = true, + reference_fasta = reference.fasta, + default_runtime_attributes = default_runtime_attributes + } + + # merge haplotagged bams if there are multiple + if (length(hiphase.haplotagged_bams) > 1) { + scatter (bam_object in hiphase.haplotagged_bams) { + File bam_to_merge = bam_object.data + } + call merge_bams { + input: + bams = bam_to_merge, + output_bam_name = "~{sample.sample_id}.~{reference.name}.haplotagged.bam", + runtime_attributes = default_runtime_attributes + } + } + + # select the merged bam if it exists, otherwise select the first (only) 
haplotagged bam + File haplotagged_bam = select_first([merge_bams.merged_bam, hiphase.haplotagged_bams[0].data]) + File haplotagged_bam_index = select_first([merge_bams.merged_bam_index, hiphase.haplotagged_bams[0].data_index]) + + call Mosdepth.mosdepth { + input: + aligned_bam = haplotagged_bam, + aligned_bam_index = haplotagged_bam_index, + runtime_attributes = default_runtime_attributes + } + + call trgt { + input: + sample_id = sample.sample_id, + sex = sample.sex, + bam = haplotagged_bam, + bam_index = haplotagged_bam_index, + reference = reference.fasta.data, + reference_index = reference.fasta.data_index, + tandem_repeat_bed = reference.trgt_tandem_repeat_bed, + runtime_attributes = default_runtime_attributes + } + + call coverage_dropouts { + input: + bam = haplotagged_bam, + bam_index = haplotagged_bam_index, + tandem_repeat_bed = reference.trgt_tandem_repeat_bed, + output_prefix = "~{sample.sample_id}.~{reference.name}", + runtime_attributes = default_runtime_attributes + } + + call cpg_pileup { + input: + bam = haplotagged_bam, + bam_index = haplotagged_bam_index, + output_prefix = "~{sample.sample_id}.~{reference.name}", + reference = reference.fasta.data, + reference_index = reference.fasta.data_index, + runtime_attributes = default_runtime_attributes + } + + call paraphase { + input: + sample_id = sample.sample_id, + bam = haplotagged_bam, + bam_index = haplotagged_bam_index, + reference = reference.fasta.data, + reference_index = reference.fasta.data_index, + out_directory = "~{sample.sample_id}.paraphase", + runtime_attributes = default_runtime_attributes + } + + call hificnv { + input: + sample_id = sample.sample_id, + sex = sample.sex, + bam = haplotagged_bam, + bam_index = haplotagged_bam_index, + phased_vcf = hiphase.phased_vcfs[0].data, + phased_vcf_index = hiphase.phased_vcfs[0].data_index, + reference = reference.fasta.data, + reference_index = reference.fasta.data_index, + exclude_bed = reference.hificnv_exclude_bed.data, + exclude_bed_index = reference.hificnv_exclude_bed.data_index, + expected_bed_male = reference.hificnv_expected_bed_male, + expected_bed_female = reference.hificnv_expected_bed_female, + output_prefix = "hificnv", + runtime_attributes = default_runtime_attributes + } + + output { + # per movie stats, alignments, and svsigs + Array[File] bam_stats = pbmm2_align.bam_stats + Array[File] read_length_summary = pbmm2_align.read_length_summary + Array[File] read_quality_summary = pbmm2_align.read_quality_summary + Array[IndexData] aligned_bams = aligned_bam + Array[File] svsigs = pbsv_discover.svsig + + # per sample small variant calls + IndexData small_variant_gvcf = deepvariant.gvcf + File small_variant_vcf_stats = bcftools.stats + File small_variant_roh_out = bcftools.roh_out + File small_variant_roh_bed = bcftools.roh_bed + + # per sample final phased variant calls and haplotagged alignments + # phased_vcfs order: small variants, SVs + IndexData phased_small_variant_vcf = hiphase.phased_vcfs[0] + IndexData phased_sv_vcf = hiphase.phased_vcfs[1] + File hiphase_stats = hiphase.hiphase_stats + File hiphase_blocks = hiphase.hiphase_blocks + File hiphase_haplotags = select_first([hiphase.hiphase_haplotags]) + IndexData merged_haplotagged_bam = {"data": haplotagged_bam, "data_index": haplotagged_bam_index} + File haplotagged_bam_mosdepth_summary = mosdepth.summary + File haplotagged_bam_mosdepth_region_bed = mosdepth.region_bed + + # per sample trgt outputs + IndexData trgt_spanning_reads = {"data": trgt.spanning_reads, "data_index": 
trgt.spanning_reads_index}
+    IndexData trgt_repeat_vcf = {"data": trgt.repeat_vcf, "data_index": trgt.repeat_vcf_index}
+    File trgt_dropouts = coverage_dropouts.trgt_dropouts
+
+    # per sample cpg outputs
+    Array[File] cpg_pileup_beds = cpg_pileup.pileup_beds
+    Array[File] cpg_pileup_bigwigs = cpg_pileup.pileup_bigwigs
+
+    # per sample paraphase outputs
+    File paraphase_output_json = paraphase.output_json
+    IndexData paraphase_realigned_bam = {"data": paraphase.realigned_bam, "data_index": paraphase.realigned_bam_index}
+    Array[File] paraphase_vcfs = paraphase.paraphase_vcfs
+
+    # per sample hificnv outputs
+    IndexData hificnv_vcf = {"data": hificnv.cnv_vcf, "data_index": hificnv.cnv_vcf_index}
+    File hificnv_copynum_bedgraph = hificnv.copynum_bedgraph
+    File hificnv_depth_bw = hificnv.depth_bw
+    File hificnv_maf_bw = hificnv.maf_bw
+  }
+
+  parameter_meta {
+    sample: {help: "Sample information and associated data files"}
+    reference: {help: "Reference genome data"}
+    deepvariant_version: {help: "Version of deepvariant to use"}
+    deepvariant_model: {help: "Optional deepvariant model file to use"}
+    default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"}
+  }
+}
+
+task pbmm2_align {
+  input {
+    String sample_id
+    File bam
+
+    File reference
+    File reference_index
+    String reference_name
+
+    RuntimeAttributes runtime_attributes
+  }
+
+  String movie = basename(bam, ".bam")
+
+  Int threads = 24
+  Int mem_gb = ceil(threads * 4)
+  Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 4 + 20)
+
+  command <<<
+    set -euo pipefail
+
+    pbmm2 --version
+
+    pbmm2 align \
+      --num-threads ~{threads} \
+      --sort-memory 4G \
+      --preset HIFI \
+      --sample ~{sample_id} \
+      --log-level INFO \
+      --sort \
+      --unmapped \
+      ~{reference} \
+      ~{bam} \
+      ~{sample_id}.~{movie}.~{reference_name}.aligned.bam
+
+    # movie stats: per-read length and quality
+    extract_read_length_and_qual.py \
+      ~{bam} \
+    > ~{sample_id}.~{movie}.read_length_and_quality.tsv
+
+    # bin read lengths in 1 kb bins (capped at 39 kb); report read count and total yield per bin
+    awk '{ b=int($2/1000); b=(b>39?39:b); print 1000*b "\t" $2; }' \
+      ~{sample_id}.~{movie}.read_length_and_quality.tsv \
+      | sort -k1,1g \
+      | datamash -g 1 count 1 sum 2 \
+      | awk 'BEGIN { for(i=0;i<=39;i++) { print 1000*i"\t0\t0"; } } { print; }' \
+      | sort -k1,1g \
+      | datamash -g 1 sum 2 sum 3 \
+    > ~{sample_id}.~{movie}.read_length_summary.tsv
+
+    # bin read qualities (capped at Q50); report read count and total yield per bin
+    awk '{ print ($3>50?50:$3) "\t" $2; }' \
+      ~{sample_id}.~{movie}.read_length_and_quality.tsv \
+      | sort -k1,1g \
+      | datamash -g 1 count 1 sum 2 \
+      | awk 'BEGIN { for(i=0;i<=60;i++) { print i"\t0\t0"; } } { print; }' \
+      | sort -k1,1g \
+      | datamash -g 1 sum 2 sum 3 \
+    > ~{sample_id}.~{movie}.read_quality_summary.tsv
+  >>>
+
+  output {
+    File aligned_bam = "~{sample_id}.~{movie}.~{reference_name}.aligned.bam"
+    File aligned_bam_index = "~{sample_id}.~{movie}.~{reference_name}.aligned.bam.bai"
+    File bam_stats = "~{sample_id}.~{movie}.read_length_and_quality.tsv"
+    File read_length_summary = "~{sample_id}.~{movie}.read_length_summary.tsv"
+    File read_quality_summary = "~{sample_id}.~{movie}.read_quality_summary.tsv"
+  }
+
+  runtime {
+    docker: "~{runtime_attributes.container_registry}/pbmm2@sha256:1013aa0fd5fb42c607d78bfe3ec3d19e7781ad3aa337bf84d144c61ed7d51fa1"
+    cpu: threads
+    memory: mem_gb + " GB"
+    disk: disk_size + " GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: runtime_attributes.preemptible_tries
+    maxRetries: runtime_attributes.max_retries
+    awsBatchRetryAttempts: runtime_attributes.max_retries
+    queueArn: runtime_attributes.queue_arn
+    zones: runtime_attributes.zones
+  }
+}
+
+task bcftools {
+  input {
+    File vcf
+
+    String? stats_params
+    File? reference
+
+    RuntimeAttributes runtime_attributes
+  }
+
+  String vcf_basename = basename(vcf, ".vcf.gz")
+
+  Int threads = 2
+  Int reference_size = if (defined(reference)) then ceil(size(reference, "GB")) else 0
+  Int disk_size = ceil((size(vcf, "GB") + reference_size) * 2 + 20)
+
+  command <<<
+    set -euo pipefail
+
+    bcftools --version
+
+    bcftools stats \
+      --threads ~{threads - 1} \
+      ~{stats_params} \
+      ~{"--fasta-ref " + reference} \
+      ~{vcf} \
+    > ~{vcf_basename}.vcf.stats.txt
+
+    bcftools roh \
+      --threads ~{threads - 1} \
+      --AF-dflt 0.4 \
+      ~{vcf} \
+    > ~{vcf_basename}.bcftools_roh.out
+
+    # extract the ROH regions ("RG" rows) into a BED of chr, start, end, qual
+    echo -e "#chr\\tstart\\tend\\tqual" > ~{vcf_basename}.roh.bed
+    awk -v OFS='\t' '$1=="RG" { print $3, $4, $5, $8 }' \
+      ~{vcf_basename}.bcftools_roh.out \
+    >> ~{vcf_basename}.roh.bed
+  >>>
+
+  output {
+    File stats = "~{vcf_basename}.vcf.stats.txt"
+    File roh_out = "~{vcf_basename}.bcftools_roh.out"
+    File roh_bed = "~{vcf_basename}.roh.bed"
+  }
+
+  runtime {
+    docker: "~{runtime_attributes.container_registry}/bcftools@sha256:46720a7ab5feba5be06d5269454a6282deec13060e296f0bc441749f6f26fdec"
+    cpu: threads
+    memory: "4 GB"
+    disk: disk_size + " GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: runtime_attributes.preemptible_tries
+    maxRetries: runtime_attributes.max_retries
+    awsBatchRetryAttempts: runtime_attributes.max_retries
+    queueArn: runtime_attributes.queue_arn
+    zones: runtime_attributes.zones
+  }
+}
+
+task merge_bams {
+  input {
+    Array[File] bams
+
+    String output_bam_name
+
+    RuntimeAttributes runtime_attributes
+  }
+
+  Int threads = 8
+  Int disk_size = ceil(size(bams, "GB") * 2 + 20)
+
+  command <<<
+    set -euo pipefail
+
+    samtools --version
+
+    samtools merge \
+      -@ ~{threads - 1} \
+      -o ~{output_bam_name} \
+      ~{sep=' ' bams}
+
+    samtools index ~{output_bam_name}
+  >>>
+
+  output {
+    File merged_bam = "~{output_bam_name}"
+    File merged_bam_index = "~{output_bam_name}.bai"
+  }
+
+  runtime {
+    docker: "~{runtime_attributes.container_registry}/samtools@sha256:cbe496e16773d4ad6f2eec4bd1b76ff142795d160f9dd418318f7162dcdaa685"
+    cpu: threads
+    memory: "4 GB"
+    disk: disk_size + " GB"
+    disks: "local-disk " + disk_size + " LOCAL"
+    preemptible: runtime_attributes.preemptible_tries
+    maxRetries: runtime_attributes.max_retries
+    awsBatchRetryAttempts: runtime_attributes.max_retries
+    queueArn: runtime_attributes.queue_arn
+    zones: runtime_attributes.zones
+  }
+}
+
+task trgt {
+  input {
+    String sample_id
+    String? sex
+
+    File bam
+    File bam_index
+
+    File reference
+    File reference_index
+    File tandem_repeat_bed
+
+    RuntimeAttributes runtime_attributes
+  }
+
+  Boolean sex_defined = defined(sex)
+  String karyotype = if select_first([sex, "FEMALE"]) == "MALE" then "XY" else "XX"
+  String bam_basename = basename(bam, ".bam")
+  Int threads = 4
+  Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 2 + 20)
+
+  command <<<
+    set -euo pipefail
+
+    echo ~{if sex_defined then "" else "Sex is not defined for ~{sample_id}. 
Defaulting to karyotype XX for TRGT."} + + trgt --version + + trgt \ + --threads ~{threads} \ + --karyotype ~{karyotype} \ + --genome ~{reference} \ + --repeats ~{tandem_repeat_bed} \ + --reads ~{bam} \ + --output-prefix ~{bam_basename}.trgt + + bcftools --version + + bcftools sort \ + --output-type z \ + --output ~{bam_basename}.trgt.sorted.vcf.gz \ + ~{bam_basename}.trgt.vcf.gz + + bcftools index \ + --threads ~{threads - 1} \ + --tbi \ + ~{bam_basename}.trgt.sorted.vcf.gz + + samtools --version + + samtools sort \ + -@ ~{threads - 1} \ + -o ~{bam_basename}.trgt.spanning.sorted.bam \ + ~{bam_basename}.trgt.spanning.bam + + samtools index \ + -@ ~{threads - 1} \ + ~{bam_basename}.trgt.spanning.sorted.bam + >>> + + output { + File spanning_reads = "~{bam_basename}.trgt.spanning.sorted.bam" + File spanning_reads_index = "~{bam_basename}.trgt.spanning.sorted.bam.bai" + File repeat_vcf = "~{bam_basename}.trgt.sorted.vcf.gz" + File repeat_vcf_index = "~{bam_basename}.trgt.sorted.vcf.gz.tbi" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/trgt@sha256:8c9f236eb3422e79d7843ffd59e1cbd9b76774525f20d88cd68ca64eb63054eb" + cpu: threads + memory: "4 GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task coverage_dropouts { + input { + File bam + File bam_index + + File tandem_repeat_bed + + String output_prefix + + RuntimeAttributes runtime_attributes + } + + Int threads = 2 + Int disk_size = ceil((size(bam, "GB")) * 2 + 20) + + command <<< + set -euo pipefail + + # Get coverage dropouts + check_trgt_coverage.py \ + ~{tandem_repeat_bed} \ + ~{bam} \ + > ~{output_prefix}.trgt.dropouts.txt + >>> + + output { + File trgt_dropouts = "~{output_prefix}.trgt.dropouts.txt" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/trgt@sha256:8c9f236eb3422e79d7843ffd59e1cbd9b76774525f20d88cd68ca64eb63054eb" + cpu: threads + memory: "4 GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task cpg_pileup { + input { + File bam + File bam_index + + String output_prefix + + File reference + File reference_index + + RuntimeAttributes runtime_attributes + } + + Int threads = 12 + # Uses ~4 GB memory / thread + Int mem_gb = threads * 4 + Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 2 + 20) + + command <<< + set -euo pipefail + + aligned_bam_to_cpg_scores --version + + aligned_bam_to_cpg_scores \ + --threads ~{threads} \ + --bam ~{bam} \ + --ref ~{reference} \ + --output-prefix ~{output_prefix} \ + --min-mapq 1 \ + --min-coverage 10 \ + --model "$PILEUP_MODEL_DIR"/pileup_calling_model.v1.tflite + >>> + + output { + Array[File] pileup_beds = glob("~{output_prefix}.*.bed") + Array[File] pileup_bigwigs = glob("~{output_prefix}.*.bw") + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pb-cpg-tools@sha256:b95ff1c53bb16e53b8c24f0feaf625a4663973d80862518578437f44385f509b" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + 
maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task paraphase { + input { + File bam + File bam_index + + File reference + File reference_index + + String sample_id + String out_directory + + RuntimeAttributes runtime_attributes + } + + Int threads = 4 + Int mem_gb = 4 + Int disk_size = ceil(size(bam, "GB") + 20) + + command <<< + set -euo pipefail + + paraphase --version + + paraphase \ + --threads ~{threads} \ + --bam ~{bam} \ + --reference ~{reference} \ + --out ~{out_directory} + >>> + + output { + File output_json = "~{out_directory}/~{sample_id}.json" + File realigned_bam = "~{out_directory}/~{sample_id}_realigned_tagged.bam" + File realigned_bam_index = "~{out_directory}/~{sample_id}_realigned_tagged.bam.bai" + Array[File] paraphase_vcfs = glob("~{out_directory}/~{sample_id}_vcfs/*.vcf") + } + + runtime { + docker: "~{runtime_attributes.container_registry}/paraphase@sha256:186dec5f6dabedf8c90fe381cd8f934d31fe74310175efee9ca4f603deac954d" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task hificnv { + input { + String sample_id + String? sex + + File bam + File bam_index + + File phased_vcf + File phased_vcf_index + + File reference + File reference_index + + File exclude_bed + File exclude_bed_index + + File expected_bed_male + File expected_bed_female + + String output_prefix + + RuntimeAttributes runtime_attributes + } + + Boolean sex_defined = defined(sex) + File expected_bed = if select_first([sex, "FEMALE"]) == "MALE" then expected_bed_male else expected_bed_female + + Int threads = 8 + # Uses ~2 GB memory / thread + Int mem_gb = threads * 2 + # <1 GB for output + Int disk_size = ceil((size(bam, "GB") + size(reference, "GB"))+ 20) + + command <<< + set -euo pipefail + + echo ~{if sex_defined then "" else "Sex is not defined for ~{sample_id}. 
Defaulting to karyotype XX for HiFiCNV."} + + hificnv --version + + hificnv \ + --threads ~{threads} \ + --bam ~{bam} \ + --ref ~{reference} \ + --maf ~{phased_vcf} \ + --exclude ~{exclude_bed} \ + --expected-cn ~{expected_bed} \ + --output-prefix ~{output_prefix} + + bcftools index --tbi ~{output_prefix}.~{sample_id}.vcf.gz + >>> + + output { + File cnv_vcf = "~{output_prefix}.~{sample_id}.vcf.gz" + File cnv_vcf_index = "~{output_prefix}.~{sample_id}.vcf.gz.tbi" + File copynum_bedgraph = "~{output_prefix}.~{sample_id}.copynum.bedgraph" + File depth_bw = "~{output_prefix}.~{sample_id}.depth.bw" + File maf_bw = "~{output_prefix}.~{sample_id}.maf.bw" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/hificnv@sha256:19fdde99ad2454598ff7d82f27209e96184d9a6bb92dc0485cc7dbe87739b3c2" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} diff --git a/HiFi-human-WGS-WDL/workflows/tertiary_analysis/inputs.json b/HiFi-human-WGS-WDL/workflows/tertiary_analysis/inputs.json new file mode 100644 index 0000000..a806db4 --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/tertiary_analysis/inputs.json @@ -0,0 +1,69 @@ +{ + "tertiary_analysis.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": [ + "File" + ], + "sex": "String?", + "affected": "Boolean", + "father_id": "String?", + "mother_id": "String?" + } + ], + "phenotypes": [ + "String" + ] + }, + "tertiary_analysis.small_variant_vcf": { + "data": "File", + "data_index": "File" + }, + "tertiary_analysis.sv_vcf": { + "data": "File", + "data_index": "File" + }, + "tertiary_analysis.reference": { + "name": "String", + "fasta": { + "data": "File", + "data_index": "File" + }, + "pbsv_splits": "File", + "tandem_repeat_bed": "File", + "trgt_tandem_repeat_bed": "File", + "gnomad_af": "File", + "hprc_af": "File", + "gff": "File", + "population_vcfs": [ + { + "data": "File", + "data_index": "File" + } + ], + "hificnv_exclude_bed": { + "data": "File", + "data_index": "File" + }, + "hificnv_expected_bed_male": "File", + "hificnv_expected_bed_female": "File" + }, + "tertiary_analysis.slivar_data": { + "slivar_js": "File", + "hpo_terms": "File", + "hpo_dag": "File", + "hpo_annotations": "File", + "ensembl_to_hgnc": "File", + "lof_lookup": "File", + "clinvar_lookup": "File" + }, + "tertiary_analysis.default_runtime_attributes": { + "preemptible_tries": "Int", + "max_retries": "Int", + "zones": "String", + "queue_arn": "String", + "container_registry": "String" + } +} diff --git a/HiFi-human-WGS-WDL/workflows/tertiary_analysis/tertiary_analysis.wdl b/HiFi-human-WGS-WDL/workflows/tertiary_analysis/tertiary_analysis.wdl new file mode 100644 index 0000000..69776c3 --- /dev/null +++ b/HiFi-human-WGS-WDL/workflows/tertiary_analysis/tertiary_analysis.wdl @@ -0,0 +1,471 @@ +version 1.0 + +# Annotate small and structural variant VCFs using slivar. Outputs annotated VCFs and TSVs. +# This workflow is run on a phased single-sample VCF if there is only a single individual in the cohort, otherwise it is run on the joint-called phased VCF. 
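+#
+# The cohort is serialized with write_json() and parsed by write_yaml_ped_phrank
+# into a PED file and a Phrank score lookup. For orientation, a serialized trio
+# cohort would look roughly like this (sample IDs and the HPO term are
+# hypothetical placeholders):
+#
+#   {
+#     "cohort_id": "trio1",
+#     "samples": [
+#       {"sample_id": "proband", "movie_bams": ["..."], "affected": true,
+#        "father_id": "father", "mother_id": "mother"},
+#       {"sample_id": "father", "movie_bams": ["..."], "affected": false},
+#       {"sample_id": "mother", "movie_bams": ["..."], "affected": false}
+#     ],
+#     "phenotypes": ["HP:0001250"]
+#   }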
+ +import "../humanwgs_structs.wdl" + +workflow tertiary_analysis { + input { + Cohort cohort + IndexData small_variant_vcf + IndexData sv_vcf + + ReferenceData reference + + SlivarData slivar_data + + RuntimeAttributes default_runtime_attributes + } + + call write_yaml_ped_phrank { + input: + cohort_id = cohort.cohort_id, + cohort_json = write_json(cohort), + hpo_terms = slivar_data.hpo_terms, + hpo_dag = slivar_data.hpo_dag, + hpo_annotations = slivar_data.hpo_annotations, + ensembl_to_hgnc = slivar_data.ensembl_to_hgnc, + runtime_attributes = default_runtime_attributes + } + + call slivar_small_variant { + input: + vcf = small_variant_vcf.data, + vcf_index = small_variant_vcf.data_index, + pedigree = write_yaml_ped_phrank.pedigree, + reference = reference.fasta.data, + reference_index = reference.fasta.data_index, + slivar_js = slivar_data.slivar_js, + gnomad_af = select_first([reference.gnomad_af]), + hprc_af = select_first([reference.hprc_af]), + gff = select_first([reference.gff]), + lof_lookup = slivar_data.lof_lookup, + clinvar_lookup = slivar_data.clinvar_lookup, + phrank_lookup = write_yaml_ped_phrank.phrank_lookup, + runtime_attributes = default_runtime_attributes + } + + scatter (vcf_object in select_first([reference.population_vcfs])) { + File population_vcf = vcf_object.data + File population_vcf_index = vcf_object.data_index + } + + call svpack_filter_annotated { + input: + sv_vcf = sv_vcf.data, + population_vcfs = population_vcf, + population_vcf_indices = population_vcf_index, + gff = select_first([reference.gff]), + runtime_attributes = default_runtime_attributes + } + + call slivar_svpack_tsv { + input: + filtered_vcf = svpack_filter_annotated.svpack_vcf, + pedigree = write_yaml_ped_phrank.pedigree, + lof_lookup = slivar_data.lof_lookup, + clinvar_lookup = slivar_data.clinvar_lookup, + phrank_lookup = write_yaml_ped_phrank.phrank_lookup, + runtime_attributes = default_runtime_attributes + } + + output { + IndexData filtered_small_variant_vcf = {"data": slivar_small_variant.filtered_vcf, "data_index": slivar_small_variant.filtered_vcf_index} + IndexData compound_het_small_variant_vcf = {"data": slivar_small_variant.compound_het_vcf, "data_index": slivar_small_variant.compound_het_vcf_index} + File filtered_small_variant_tsv = slivar_small_variant.filtered_tsv + File compound_het_small_variant_tsv = slivar_small_variant.compound_het_tsv + IndexData filtered_svpack_vcf = {"data": svpack_filter_annotated.svpack_vcf, "data_index": svpack_filter_annotated.svpack_vcf_index} + File filtered_svpack_tsv = slivar_svpack_tsv.svpack_tsv + } + + parameter_meta { + cohort: {help: "Sample information for the cohort"} + small_variant_vcf: {help: "Small variant VCF to annotate using slivar"} + sv_vcf: {help: "Structural variant VCF to annotate using slivar"} + reference: {help: "Reference genome data"} + slivar_data: {help: "Data files used for annotation with slivar"} + default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} + } +} + +task write_yaml_ped_phrank { + input { + String cohort_id + File cohort_json + + File hpo_terms + File hpo_dag + File hpo_annotations + File ensembl_to_hgnc + + RuntimeAttributes runtime_attributes + } + + Int disk_size = ceil((size(hpo_terms, "GB") + size(hpo_dag, "GB") + size(hpo_annotations, "GB") + size(ensembl_to_hgnc, "GB")) * 2 + 20) + + command <<< + set -euo pipefail + + parse_cohort.py \ + --cohort_json ~{cohort_json} \ + --write_cohort_yaml ~{cohort_id}.yml + + yaml2ped.py \ + 
~{cohort_id}.yml \ + ~{cohort_id} \ + ~{cohort_id}.ped + + calculate_phrank.py \ + ~{hpo_terms} \ + ~{hpo_dag} \ + ~{hpo_annotations} \ + ~{ensembl_to_hgnc} \ + ~{cohort_id}.yml \ + ~{cohort_id} \ + ~{cohort_id}_phrank.tsv + >>> + + output { + File cohort_yaml = "~{cohort_id}.yml" + File pedigree = "~{cohort_id}.ped" + File phrank_lookup = "~{cohort_id}_phrank.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pyyaml@sha256:af6f0689a7412b1edf76bd4bf6434e7fa6a86192eebf19573e8618880d9c1dbb" + cpu: 2 + memory: "4 GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task slivar_small_variant { + input { + File vcf + File vcf_index + + File pedigree + + File reference + File reference_index + + File slivar_js + File gnomad_af + File hprc_af + File gff + + File lof_lookup + File clinvar_lookup + File phrank_lookup + + RuntimeAttributes runtime_attributes + } + + Float max_gnomad_af = 0.03 + Float max_hprc_af = 0.03 + Int max_gnomad_nhomalt = 4 + Int max_hprc_nhomalt = 4 + Int max_gnomad_ac = 4 + Int max_hprc_ac = 4 + Int min_gq = 5 + + Array[String] info_expr = [ + 'variant.FILTER=="PASS"', + 'INFO.gnomad_af <= ~{max_gnomad_af}', + 'INFO.hprc_af <= ~{max_hprc_af}', + 'INFO.gnomad_nhomalt <= ~{max_gnomad_nhomalt}', + 'INFO.hprc_nhomalt <= ~{max_hprc_nhomalt}' + ] + Array[String] family_recessive_expr = [ + 'recessive:fam.every(segregating_recessive)' + ] + Array[String] family_x_recessive_expr = [ + 'x_recessive:(variant.CHROM == "chrX")', + 'fam.every(segregating_recessive_x)' + ] + Array[String] family_dominant_expr = [ + 'dominant:fam.every(segregating_dominant)', + 'INFO.gnomad_ac <= ~{max_gnomad_ac}', + 'INFO.hprc_ac <= ~{max_hprc_ac}' + ] + Array[String] sample_expr = [ + 'comphet_side:sample.het', + 'sample.GQ > ~{min_gq}' + ] + Array[String] skip_list = [ + 'non_coding_transcript', + 'intron', + 'non_coding', + 'upstream_gene', + 'downstream_gene', + 'non_coding_transcript_exon', + 'NMD_transcript', + '5_prime_UTR', + '3_prime_UTR' + ] + Array[String] info_fields = [ + 'gnomad_af', + 'hprc_af', + 'gnomad_nhomalt', + 'hprc_nhomalt', + 'gnomad_ac', + 'hprc_ac' + ] + + String vcf_basename = basename(vcf, ".vcf.gz") + Int threads = 8 + Int disk_size = ceil((size(vcf, "GB") + size(reference, "GB") + size(gnomad_af, "GB") + size(hprc_af, "GB") + size(gff, "GB") + size(lof_lookup, "GB") + size(clinvar_lookup, "GB") + size(phrank_lookup, "GB")) * 2 + 20) + + command <<< + set -euo pipefail + + bcftools --version + + bcftools norm \ + --threads ~{threads - 1} \ + --multiallelics \ + - \ + --output-type b \ + --fasta-ref ~{reference} \ + ~{vcf} \ + | bcftools sort \ + --output-type b \ + --output ~{vcf_basename}.norm.bcf + + bcftools index \ + --threads ~{threads - 1} \ + ~{vcf_basename}.norm.bcf + + # slivar has no version option + slivar expr 2>&1 | grep -Eo 'slivar version: [0-9.]+ [0-9a-f]+' + + pslivar \ + --processes ~{threads} \ + --fasta ~{reference} \ + --pass-only \ + --js ~{slivar_js} \ + --info '~{sep=" && " info_expr}' \ + --family-expr '~{sep=" && " family_recessive_expr}' \ + --family-expr '~{sep=" && " family_x_recessive_expr}' \ + --family-expr '~{sep=" && " family_dominant_expr}' \ + --sample-expr '~{sep=" && " sample_expr}' \ + --gnotate ~{gnomad_af} \ + --gnotate ~{hprc_af} \ + --vcf 
~{vcf_basename}.norm.bcf \ + --ped ~{pedigree} \ + | bcftools csq \ + --local-csq \ + --samples - \ + --ncsq 40 \ + --gff-annot ~{gff} \ + --fasta-ref ~{reference} \ + - \ + --output-type z \ + --output ~{vcf_basename}.norm.slivar.vcf.gz + + bcftools index \ + --threads ~{threads - 1} \ + --tbi ~{vcf_basename}.norm.slivar.vcf.gz + + slivar \ + compound-hets \ + --skip ~{sep=',' skip_list} \ + --vcf ~{vcf_basename}.norm.slivar.vcf.gz \ + --sample-field comphet_side \ + --ped ~{pedigree} \ + --allow-non-trios \ + | add_comphet_phase.py \ + | bcftools view \ + --output-type z \ + --output ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz + + bcftools index \ + --threads ~{threads - 1} \ + --tbi ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz + + slivar tsv \ + --info-field ~{sep=' --info-field ' info_fields} \ + --sample-field dominant \ + --sample-field recessive \ + --sample-field x_recessive \ + --csq-field BCSQ \ + --gene-description ~{lof_lookup} \ + --gene-description ~{clinvar_lookup} \ + --gene-description ~{phrank_lookup} \ + --ped ~{pedigree} \ + --out /dev/stdout \ + ~{vcf_basename}.norm.slivar.vcf.gz \ + | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ + > ~{vcf_basename}.norm.slivar.tsv + + slivar tsv \ + --info-field ~{sep=' --info-field ' info_fields} \ + --sample-field slivar_comphet \ + --info-field slivar_comphet \ + --csq-field BCSQ \ + --gene-description ~{lof_lookup} \ + --gene-description ~{clinvar_lookup} \ + --gene-description ~{phrank_lookup} \ + --ped ~{pedigree} \ + --out /dev/stdout \ + ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz \ + | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ + > ~{vcf_basename}.norm.slivar.compound_hets.tsv + >>> + + output { + File filtered_vcf = "~{vcf_basename}.norm.slivar.vcf.gz" + File filtered_vcf_index = "~{vcf_basename}.norm.slivar.vcf.gz.tbi" + File compound_het_vcf = "~{vcf_basename}.norm.slivar.compound_hets.vcf.gz" + File compound_het_vcf_index = "~{vcf_basename}.norm.slivar.compound_hets.vcf.gz.tbi" + File filtered_tsv = "~{vcf_basename}.norm.slivar.tsv" + File compound_het_tsv = "~{vcf_basename}.norm.slivar.compound_hets.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/slivar@sha256:0a09289ccb760da310669906c675be02fd16b18bbedc971605a587275e34966c" + cpu: threads + memory: "16 GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task svpack_filter_annotated { + input { + File sv_vcf + + Array[File] population_vcfs + Array[File] population_vcf_indices + + File gff + + RuntimeAttributes runtime_attributes + } + + String sv_vcf_basename = basename(sv_vcf, ".vcf.gz") + Int disk_size = ceil(size(sv_vcf, "GB") * 2 + 20) + + command <<< + set -euo pipefail + + echo "svpack version:" + cat /opt/svpack/.git/HEAD + + svpack \ + filter \ + --pass-only \ + --min-svlen 50 \ + ~{sv_vcf} \ + ~{sep=' ' prefix('| svpack match -v - ', population_vcfs)} \ + | svpack \ + consequence \ + - \ + ~{gff} \ + | svpack \ + tagzygosity \ + - \ + > ~{sv_vcf_basename}.svpack.vcf + + bgzip --version + + bgzip ~{sv_vcf_basename}.svpack.vcf + + tabix --version + + tabix -p vcf ~{sv_vcf_basename}.svpack.vcf.gz + >>> + + output { + File svpack_vcf = 
"~{sv_vcf_basename}.svpack.vcf.gz" + File svpack_vcf_index = "~{sv_vcf_basename}.svpack.vcf.gz.tbi" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/svpack@sha256:a680421cb517e1fa4a3097838719a13a6bd655a5e6980ace1b03af9dd707dd75" + cpu: 2 + memory: "16 GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task slivar_svpack_tsv { + input { + File filtered_vcf + + File pedigree + File lof_lookup + File clinvar_lookup + File phrank_lookup + + RuntimeAttributes runtime_attributes + } + + Array[String] info_fields = [ + 'SVTYPE', + 'SVLEN', + 'SVANN', + 'CIPOS', + 'MATEID', + 'END' + ] + + String filtered_vcf_basename = basename(filtered_vcf, ".vcf.gz") + Int disk_size = ceil((size(filtered_vcf, "GB") + size(lof_lookup, "GB") + size(clinvar_lookup, "GB") + size(phrank_lookup, "GB")) * 2 + 20) + + command <<< + set -euo pipefail + + # slivar has no version option + slivar expr 2>&1 | grep -Eo 'slivar version: [0-9.]+ [0-9a-f]+' + + slivar tsv \ + --info-field ~{sep=' --info-field ' info_fields} \ + --sample-field hetalt \ + --sample-field homalt \ + --csq-field BCSQ \ + --gene-description ~{lof_lookup} \ + --gene-description ~{clinvar_lookup} \ + --gene-description ~{phrank_lookup} \ + --ped ~{pedigree} \ + --out /dev/stdout \ + ~{filtered_vcf} \ + | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ + > ~{filtered_vcf_basename}.tsv + >>> + + output { + File svpack_tsv = "~{filtered_vcf_basename}.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/slivar@sha256:0a09289ccb760da310669906c675be02fd16b18bbedc971605a587275e34966c" + cpu: 2 + memory: "4 GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +}