pull remote changes

broadinstitute · Sep 18, 2024 · a96570c · a96570c
2 parents 371c6ef + e41a7da
commit a96570c
Show file tree

Hide file tree

Showing 4 changed files with 185 additions and 54 deletions.
diff --git a/CNV_Array_Prober/README.md b/CNV_Array_Prober/README.md
@@ -0,0 +1,77 @@
+# cnvArrayProber
+
+## Overview
+
+The `cnvArrayProber` is designed to analyze CNV (Copy Number Variation) intervals from a BED file and map probe information from two array support files (CytoSNP-850K and GDA). It generates a comprehensive XLSX file containing the intervals and the number of probes they contain in each array. Additionally, the script outputs a PDF file with detailed plots illustrating the locations of these probes in the CytoSNP and GDA arrays.
+
+This script was developed upon a request from Greg Nakashian in [TAG1994](https://github.com/broadinstitute/TAG/issues/1994).
+
+## Features
+
+1. **Input Processing:**
+   - **BED File:** The script reads CNV intervals from a specified BED file.
+   - **Array Support Files:** It also processes two array support files, CytoSNP-850K and GDA, to gather probe information.
+
+2. **Data Analysis:**
+   - **Interval Analysis:** The [cnvArrayProber](https://dockstore.org/workflows/github.com/broadinstitute/TAG-public/cnvArrayProber:array_prober_yg?tab=info) WDL analyzes the CNV intervals to determine the number of probes from each array (CytoSNP-850K and GDA) that fall within each interval.
+
+3. **Output Generation:**
+   - **XLSX File:** An XLSX file is generated, containing the CNV intervals and the corresponding count of probes from each array.
+   - **PDF File:** A PDF file is produced, featuring plots that visually represent the locations of the probes within the CytoSNP and GDA arrays.
+
+## Usage
+
+To use the `cnvArrayProber` WDL, follow these steps:
+
+1. **Prepare Input Files:**
+   - Ensure you have a BED file containing the CNV intervals.
+```
+chr2	97220584	130400286
+chr4	143920938	144022444
+chr9	33140790	33261063
+  ```
+
+You can get the CytoSNP-850K and GDA array support files from the following Google Cloud Storage paths. (**Note: Ensure you use a consistent genome build for all of these inputs.**)
+
+
+| GDACyto_hg19_SupportFile                                                   | CytoSNP_850k_v1_4_hg38_SupportFile                                                                      | GDACyto_hg38_SupportFile                                                             | CytoSNP_850k_v1_4_hg19_SupportFile                                                                      |
+|:-------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------|
+| gs://fc-d2c7d48c-9433-4a1f-bdeb-100265b01a63/GDA_SupportFile/GDACyto_20047166_A1.csv | gs://fc-d2c7d48c-9433-4a1f-bdeb-100265b01a63/CytoSNP-850Kv1-4_SupportFile/CytoSNP-850Kv1-4_iScan_B2.csv | gs://fc-d2c7d48c-9433-4a1f-bdeb-100265b01a63/GDA_SupportFile/GDACyto_20047166_A2.csv | gs://fc-d2c7d48c-9433-4a1f-bdeb-100265b01a63/CytoSNP-850Kv1-4_SupportFile/CytoSNP-850Kv1-4_iScan_B1.csv |
+
+
+
+2. **Execute the Workflow:**
+   - Execute the `cnvArrayProber` WDL with inputs defined by the data table.
+   - The WDL will process the files and generate the output XLSX and PDF files.
+
+3. **Review Outputs:**
+
+- **XLSX File:** An Excel file with the following information in two separate sheets for the CytoSNP-850K and GDA arrays:
+
+CytoSNP850K:
+
+|                          |   left_padding |   interval |   right_padding |
+|:-------------------------|---------------:|-----------:|----------------:|
+| chr2:97220584-130400286  |            239 |       8051 |             171 |
+| chr4:143920938-144022444 |              0 |          4 |               1 |
+| chr9:33140790-33261063   |              2 |         38 |               2 |
+
+GDA:
+
+|                          |   left_padding |   interval |   right_padding |
+|:-------------------------|---------------:|-----------:|----------------:|
+| chr2:97220584-130400286  |            897 |      19284 |             683 |
+| chr4:143920938-144022444 |              2 |         41 |               1 |
+| chr9:33140790-33261063   |              9 |         81 |              18 |
+
+
+- **PDF File:** A PDF document with plots showing:
+  - The distribution of CytoSNP-850K probes within each interval.
+  - The distribution of GDA probes within each interval.
+
+
+## Development and Contributions
+
+
+The script was developed by Yueyao Gao ([email protected]) in response to a request from Greg Nakashian in [TAG1994](https://github.com/broadinstitute/TAG/issues/1994). Contributions and further improvements are welcome. Please refer to the TAG repo for more information.
+
diff --git a/CNV_Array_Prober/cnvArrayProber.inputs.json b/CNV_Array_Prober/cnvArrayProber.inputs.json
@@ -0,0 +1,11 @@
+{
+  "cnvArrayProber.cnvArrayProber.cpu": "Int (optional, default = 2)",
+  "cnvArrayProber.cnvProberDocker": "String (optional, default = \"us.gcr.io/tag-public/cnv-array-prober:0.0.1\")",
+  "cnvArrayProber.cnvBedFile": "File",
+  "cnvArrayProber.CytoSNP850K_Support_Csv": "File",
+  "cnvArrayProber.sampleName": "String",
+  "cnvArrayProber.GDA_Support_Csv": "File",
+  "cnvArrayProber.cnvArrayProber.disk_size_gb": "Int (optional, default = 500)",
+  "cnvArrayProber.cnvArrayProber.memory": "Int (optional, default = 32)"
+}
+
diff --git a/CNV_Array_Prober/cnvArrayProber.wdl b/CNV_Array_Prober/cnvArrayProber.wdl
@@ -0,0 +1,68 @@
+version 1.0
+
+# Workflow wrapper around the cnvArrayProber task.
+#
+# Inputs:
+#   sampleName               - prefix used when naming the output files
+#   cnvBedFile               - BED file of CNV intervals to annotate
+#   CytoSNP850K_Support_Csv  - CytoSNP-850K array support file (CSV)
+#   GDA_Support_Csv          - GDA array support file (CSV)
+#   cnvProberDocker          - Docker image the task runs in
+#
+# Outputs:
+#   cnvProbeAnnotation       - XLSX file produced by the task
+#   cnvProbePlots            - PDF file produced by the task
+workflow cnvArrayProber {
+    input{
+        String sampleName
+        File cnvBedFile
+        File CytoSNP850K_Support_Csv
+        File GDA_Support_Csv
+        String cnvProberDocker = "us.gcr.io/tag-public/cnv-array-prober:0.0.1"
+    }
+    # All workflow inputs are forwarded unchanged; task resource settings keep their task-level defaults.
+    call cnvArrayProber {
+        input:
+            sampleName = sampleName,
+            cnvBedFile = cnvBedFile,
+            CytoSNP850K_Support_Csv = CytoSNP850K_Support_Csv,
+            GDA_Support_Csv = GDA_Support_Csv,
+            cnvProberDocker = cnvProberDocker
+    }
+    output{
+        File cnvProbeAnnotation = cnvArrayProber.cnvProbeAnnotation
+        File cnvProbePlots = cnvArrayProber.cnvProbePlots
+    }
+    meta {
+        author: "Yueyao Gao"
+        email: "[email protected]"
+        description: "This workflow takes a CNV bed file and CytoSNP-850K and GDA support files as input and outputs an XLSX file with probe information for each CNV interval. Additionally, it outputs a PDF file with plots of the CytoSNP-850K and GDA array probes for each CNV interval."
+    }
+}
+
+# Task: run cnvArrayProber.py (inside the prober_env conda environment of the
+# Docker image) on a CNV BED file and the two array support CSVs.
+#
+# Inputs:
+#   sampleName               - prefix passed to the script via -o and used in the output file names
+#   cnvBedFile               - BED file of CNV intervals (-b)
+#   CytoSNP850K_Support_Csv  - CytoSNP-850K support file (-c)
+#   GDA_Support_Csv          - GDA support file (-g)
+#   cnvProberDocker          - Docker image to run in
+#   memory                   - memory request in GB
+#   cpu                      - number of CPUs
+#   disk_size_gb             - local disk size in GB
+#   use_ssd                  - request an SSD instead of an HDD local disk
+#   preemptible / maxRetries - retry settings passed to the backend
+#
+# Outputs:
+#   cnvProbeAnnotation       - output/<sampleName>CNV_Probe_Mappings.xlsx
+#   cnvProbePlots            - output/<sampleName>CNV_Probe_Mappings_Plots.pdf
+task cnvArrayProber {
+    input{
+        String sampleName
+        File cnvBedFile
+        File CytoSNP850K_Support_Csv
+        File GDA_Support_Csv
+        String cnvProberDocker
+        Int memory = 32
+        Int cpu = 2
+        Int disk_size_gb = 500
+        Boolean use_ssd = false
+        Int preemptible = 3
+        Int maxRetries = 3
+    }
+    command <<<
+    set -e
+    mkdir output
+
+    conda run --no-capture-output \
+            -n prober_env \
+            python3 /BaseImage/cnvArrayProber/scripts/cnvArrayProber.py \
+            -b ~{cnvBedFile} \
+            -c ~{CytoSNP850K_Support_Csv} \
+            -g ~{GDA_Support_Csv} \
+            -o output/~{sampleName}
+    >>>
+    # NOTE(review): there is no separator between sampleName and the file-name suffix;
+    # presumably the script appends the suffix directly to the -o prefix - confirm against cnvArrayProber.py.
+    output{
+        File cnvProbeAnnotation = "output/~{sampleName}CNV_Probe_Mappings.xlsx"
+        File cnvProbePlots = "output/~{sampleName}CNV_Probe_Mappings_Plots.pdf"
+    }
+    runtime {
+        docker: cnvProberDocker
+        # A bare Int is interpreted as bytes; append the unit so the request is in GB.
+        memory: memory + " GB"
+        cpu: cpu
+        disks: "local-disk " + disk_size_gb + if use_ssd then " SSD" else " HDD"
+        preemptible: preemptible
+        maxRetries: maxRetries
+    }
+}
diff --git a/PECGS-QUICviz/QUICviz.wdl b/PECGS-QUICviz/QUICviz.wdl
@@ -3,8 +3,9 @@ version 1.0
 workflow QUICviz {
     input {
         String sampleID
+        Boolean isPECGS = true
         String tumorType
-        String quicvizDocker = "us-central1-docker.pkg.dev/tag-team-160914/gptag-dockers/cmi_quicviz:0.3.1"
+        String quicvizDocker = "us-central1-docker.pkg.dev/tag-team-160914/gptag-dockers/cmi_quicviz:0.4.2"
         File allelicCountsNormal
         File allelicCountsTumor
         File denoisedCopyRatiosNormal
@@ -24,29 +25,23 @@ workflow QUICviz {
             calledCopyRatioSegTumor = calledCopyRatioSegTumor,
             oncotatedCalledTumor = oncotatedCalledTumor
     }
-
-    Array[File] QUICvizPlots = QUICviz.plot
-    call mergeImages {
-        input:
-            SampleID = sampleID,
-            TumorType = tumorType,
-            plot = QUICvizPlots,
-            quicvizDocker = quicvizDocker
-    }
     output {
-        File QUICvizPDF = mergeImages.chr_pdf
-        File AllChrPlot = mergeImages.allchr_plot
+        File QUICvizPDF = QUICviz.QUICvizPDF
+        File GeneLevelCNV = QUICviz.GeneLevelCNV
+        File AllChrPlot = QUICviz.AllChrPlot
     }
+
     meta {
         author: "Yueyao Gao"
         email: "[email protected]"
-        description: "QUICviz.wdl is based on the QUICviz_v0.3 R script developed by Alex Neil, which is a tool for visualizing CNV data"
+        description: "QUICviz.wdl is based on the QUICviz_v0.4 R script developed by Alex Neil, which is a tool for visualizing CNV data"
     }
 }
 
 task QUICviz {
     input {
         String sampleID
+        Boolean isPECGS
         String tumorType
         String quicvizDocker
         File allelicCountsNormal
@@ -57,13 +52,24 @@ task QUICviz {
         File oncotatedCalledTumor
         Int memory = 16
         Int cpu = 4
+        Int maxRetries = 3
     }
     command <<<
         set -e
         mkdir outputs
 
-        Rscript /BaseImage/CMI_QUICviz/scripts/CMI_QUICviz_v0.3.R \
-            --sample ~{sampleID} \
+        if ~{isPECGS}; then
+            IFS='-' read -r tumor_sample normal_sample <<< "~{sampleID}"
+            echo "Input Tumor Sample: $tumor_sample"
+            echo "Input Normal Sample: $normal_sample"
+        else
+            tumor_sample=~{sampleID}
+            echo "Input Tumor Sample: $tumor_sample"
+        fi
+
+
+        Rscript /BaseImage/CMI_QUICviz/scripts/CMI_QUICviz_v0.4.2.R \
+            --sample $tumor_sample \
             --tumor_type ~{tumorType} \
             --normal_acf ~{allelicCountsNormal} \
             --normal_cr ~{denoisedCopyRatiosNormal} \
@@ -73,52 +79,21 @@ task QUICviz {
             --tumor_seg_oncotated ~{oncotatedCalledTumor} \
             --output_dir outputs/
 
-    >>>
-    output {
-        Array[File] plot = glob("outputs/*.png")
-    }
-    runtime {
-        docker: quicvizDocker
-        memory: memory + " GB"
-        cpu: cpu
-        disks: "local-disk 100 HDD"
-    }
-}
-task mergeImages {
-    input {
-        String SampleID
-        String TumorType
-        Array[File] plot
-        String quicvizDocker
-        Int memory = 16
-        Int cpu = 4
-    }
-    command <<<
-        mkdir -p output/images
-        for i in `ls ~{sep=" " plot}`; do mv $i output/images/; done
+        mv outputs/*chromosome_plots.pdf outputs/~{sampleID}_chromosome_plots.pdf
+        mv outputs/*gene_level_calls.csv outputs/~{sampleID}_gene_level_calls.csv
+        mv outputs/*_all_chr.png outputs/~{sampleID}_All_chr.png
 
-        python3 <<CODE
-        import img2pdf
-        import glob
-        import os
-
-        # Get list of PNG files sorted
-        png_files = sorted(glob.glob("output/images/*.png"))
-        numeric_png_files = [file for file in png_files if os.path.basename(file).split('.')[0].isdigit()]
-        png_files = sorted(numeric_png_files, key=lambda x: int(os.path.basename(x).split('.')[0]))
-
-        with open(f"output/~{SampleID}_~{TumorType}_QUICviz.pdf","wb") as f:
-            f.write(img2pdf.convert(png_files))
-        CODE
     >>>
     output {
-        File chr_pdf = "output/~{SampleID}_~{TumorType}_QUICviz.pdf"
-        File allchr_plot = "output/images/All_chr.png"
+        File QUICvizPDF = "outputs/~{sampleID}_chromosome_plots.pdf"
+        File GeneLevelCNV = "outputs/~{sampleID}_gene_level_calls.csv"
+        File AllChrPlot = "outputs/~{sampleID}_All_chr.png"
     }
     runtime {
         docker: quicvizDocker
         memory: memory + " GB"
         cpu: cpu
         disks: "local-disk 100 HDD"
+        maxRetries: maxRetries
     }
-}
+}