Add a whole workflow to debug

Shixiang Wang (王诗翔) · Shixiang Wang (王诗翔) · commit cdfb1c744d2f · 2024-04-22T18:34:03.000+08:00
diff --git a/README.md b/README.md
@@ -123,7 +123,7 @@ rv
 
 ### Pipeline (WES bam data only)
 
-- for one tumor-normal pair, you can refer to [one-pair.R](https://github.com/ShixiangWang/gcap/blob/master/test-workflow/one-pair.R).
+- for one tumor-normal pair, you can refer to [one-pair.R](https://github.com/ShixiangWang/gcap/blob/master/test-workflow/one-pair.R). [test-workflow/debug](test-workflow/debug) contains a full workflow for data obtained from SRA.
 - for multiple tumor-normal pairs, you can refer to [two-pair.R](https://github.com/ShixiangWang/gcap/blob/master/test-workflow/two-pairs.R).
 
 To run **gcap** from bam files, a machine with **at least 80GB RAM** is required for
diff --git a/test-workflow/debug/0-dump-sra.sh b/test-workflow/debug/0-dump-sra.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+mkdir -p /data3/wsx/share/gcap_debug
+cd /data3/wsx/share
+
+export PATH=$HOME/soft/sratoolkit/bin:$PATH
+
+for i in ERR5242993 ERR5243012
+do
+  echo handling $i
+  parallel-fastq-dump -t 20 -O gcap_debug/ --split-3  --gzip -s $i
+done
diff --git a/test-workflow/debug/1-align.sh b/test-workflow/debug/1-align.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+source activate circlemap
+cd /data3/wsx/share/gcap_debug
+
+cores=24
+
+mkdir bam
+sn=$(ls *.fastq.gz | tr ' ' '\n' | sed 's/_[12].fastq.gz//' | sort | uniq)
+#sn=ERR5243012
+INDEX=/data1/database/human/hg38/bwa_index/hg38_p7
+
+for id in ${sn}; do
+
+  fastp -i ${id}_1.fastq.gz -I ${id}_2.fastq.gz -o ${id}_1.fq.gz -O ${id}_2.fq.gz -h ${id}.html -j ${id}.json --thread 16 --dont_overwrite
+  
+  if [ ! -f bam/${id}.bam ]
+  then
+    if [ ! -f bam/${id}.sam ]
+    then
+      fq1=${id}_1.fq.gz
+      fq2=${id}_2.fq.gz
+      echo "Start aligning for ${id}"
+      bwa mem -M -t $cores -R "@RG\tID:${id}\tSM:${id}\tLB:WXS\tPL:Illumina" ${INDEX} ${fq1} ${fq2} \
+          > bam/${id}.sam 2>bam/${id}_bwa.log
+    else
+        echo "BWA align for ${id} is done before, directly go to sam > bam step"
+    fi
+  
+    if [ $? -eq 0 ]
+    then
+        if [ ! -f bam/${id}.bam ]
+        then
+            samtools sort -@ $cores bam/${id}.sam -o bam/${id}.bam 2>bam/${id}_bam.log
+            samtools index bam/${id}.bam
+            if [ $? -eq 0 ]
+            then
+                echo "Removing sam files"
+                rm bam/${id}.sam
+            else
+                echo "Failed when using samtools sort, please check"
+                exit 1
+            fi
+        fi
+        echo "Done for ${id}." `date`
+    else
+        echo "Failed for ${id} in bwa." `date`
+    fi
+  
+  fi
+
+done
diff --git a/test-workflow/debug/2-prepare.sh b/test-workflow/debug/2-prepare.sh
@@ -0,0 +1,11 @@
+# Set up conda env
+# mamba create -n cancerit -c bioconda cancerit-allelecount 
+
+cd /data3/wsx/share/gcap_reference
+wget -c https://zenodo.org/records/6524005/files/1000G_loci_hg38.tar.gz
+wget -c https://zenodo.org/records/6524005/files/GC_correction_hg38.txt.gz
+wget -c https://zenodo.org/records/6524005/files/RT_correction_hg38.txt.gz
+
+tar zxvf 1000G_loci_hg38.tar.gz
+gunzip GC_correction_hg38.txt.gz
+gunzip RT_correction_hg38.txt.gz
diff --git a/test-workflow/debug/3-gcap.R b/test-workflow/debug/3-gcap.R
@@ -0,0 +1,33 @@
+# remotes::install_github("ShixiangWang/ascat@v3-for-gcap-v1", subdir = "ASCAT")
+# remotes::install_github("ShixiangWang/gcap")
+# install.packages("https://cran.r-project.org/src/contrib/Archive/xgboost/xgboost_1.5.2.1.tar.gz", repos = NULL)
+
+library(gcap)
+
+# id为PRJEB42904，wes_395LC是tumor，id：ERR5242993，wes_395N是normal，id：ERR5243012
+
+# hg38 ----------------
+gcap.workflow(
+  tumourseqfile = "~/share/gcap_debug/bam/ERR5242993.bam", 
+  normalseqfile = "~/share/gcap_debug/bam/ERR5243012.bam",
+  tumourname = "wes_395LC",
+  normalname = "wes_395N",
+  jobname = "wes_395",
+  outdir = "~/share/gcap_debug/gcap_result",
+  allelecounter_exe = "~/miniconda3/envs/cancerit/bin/alleleCounter", 
+  g1000allelesprefix = file.path(
+    "~/share/gcap_reference/1000G_loci_hg38/",
+    "1kg.phase3.v5a_GRCh38nounref_allele_index_chr"
+  ), 
+  g1000lociprefix = file.path("~/share/gcap_reference/1000G_loci_hg38/",
+                              "1kg.phase3.v5a_GRCh38nounref_loci_chrstring_chr"
+  ),
+  GCcontentfile = "~/share/gcap_reference/GC_correction_hg38.txt",
+  replictimingfile = "~/share/gcap_reference/RT_correction_hg38.txt",
+  skip_finished_ASCAT = TRUE,
+  skip_ascat_call = FALSE,
+  result_file_prefix = "wes_395",
+  genome_build = "hg38",
+  model = "XGB11"
+)
+