From 253f878a8cafa7b47b4737fbcec5072a5d483545 Mon Sep 17 00:00:00 2001
From: Yupei You <youyupei@gmail.com>
Date: Fri, 16 Aug 2024 13:39:47 +1000
Subject: [PATCH] Update README.md

---
 README.md       | 8 ++++----
 blaze/config.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index ae42616..6920f2f 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ Combining single-cell RNA sequencing with Nanopore long-read sequencing enables
 * **Identifies the putative UMI sequences for each read** The end position of the barcode, which is also the start position of the UMI sequence, will be corrected by taking into account the insertion and deletion errors in the putative barcode. The 10 (for 10x v2 kit) or 12nt (for 10x v3 kit) sequence immediately downstream will be used as UMI.
 * **Significant runtime improvement** (~5-10 times faster)
 * **Trim the bases before and included in UMI from the demultiplexed reads:** From version 2.2, The output format will be in fastq or fastq.gz. The header with be `@<16 nt BC>_<12 nt UMI>#read_id_<strand>`
-* **Adding more supported 10X kit.**  From version 2.4, BLAZE can take '3v4', '3v3'(default), '3v2', '3v1' for 10X 3' GEX kit v4 to v2 respectively, and '5v3', '5v2' for 10X 5' GEX kit v3 and v2
+* **Adding more supported 10X kits.**  From version 2.4, the option `--10x-kit-version` (or `--kit-version`) can take '3v4', '3v3' (default), '3v2', '3v1' for 10X 3' GEX kit v4 to v1 respectively, and '5v3', '5v2' for 10X 5' GEX kit v3 and v2
 
 ## Minor updates
 * `--emptydrop` option in v1.x is on by default and is no longer user-specified.
@@ -45,7 +45,7 @@ Combining single-cell RNA sequencing with Nanopore long-read sequencing enables
 ## **Required Input:** 
  * **Long-read fastq files**
  * **Expected number of cells**: The expected number of cells is a required input (specified by `--expect-cells=xx`). Note that the output is robust to the specified number, but a rough number is needed to determine the count threshold to output the barcode list. 
-* **Barcode whitelist (optional)**: A file containing all possible barcodes ([more details](https://kb.10xgenomics.com/hc/en-us/articles/115004506263-What-is-a-barcode-whitelist-)). Note: there is no need to specify the file if you are using 10x Single Cell 3' gene expression v2 or v3 chemistry. The corresponding whitelists are included with BLAZE. By default, BLAZE will assume the use of v3 chemistry and automatically choose the corresponding 10X whitelist. You may specify `--kit-version=v2` if the data were generated by the v2 chemistry. You can also provide your own whitelist by specifying `--full-white-list=<filename>` (e.g., if you used customised barcodes).
+* **Barcode whitelist (optional)**: A file containing all possible barcodes ([more details](https://kb.10xgenomics.com/hc/en-us/articles/115004506263-What-is-a-barcode-whitelist-)). Note: there is no need to specify the file if you are using 10x Single Cell gene expression chemistry. The corresponding whitelists are included with BLAZE. By default, BLAZE will assume the use of 3' v3 chemistry and automatically choose the corresponding 10X whitelist. You may specify `--kit-version` if the data were generated using a different chemistry (run `blaze -h` for more information). You can also provide your own whitelist by specifying `--full-white-list=<filename>` (e.g., if you used customised barcodes).
 
 ## Running BLAZE:
 
@@ -64,7 +64,7 @@ BLAZE first searches for putative barcodes (i.e. non-error corrected sequence at
     * col1: read id
     * col2: putative barcode (i.e. the basecalled barcode segment in each read, specifically the 16nt sequence after the identified 10X adaptor within each read **without correction for any basecalling errors**)
     * col3: minimum Phred score of the bases in the putative barcode
-    * col4: putative_umi (i.e. the UMI segment in each read, specifically the 10 (for 10x v2 kit) or 12nt (for 10x v3 kit) sequence after the identified putative barcode **without correction for any basecalling errors**)
+    * col4: putative_umi (i.e. the UMI segment in each read, specifically the 10nt (for 10x v2 kits) or 12nt (for 10x v3 and v4 kits) sequence after the identified putative barcode **without correction for any basecalling errors**)
     * col5: 0-based UMI end position in each read, a positive value indicates that the barcode and UMI were found at the forward strand of the read, and a negative value indicates the barcode and UMI were extracted (including the flanking sequencing in col 6 & 7) from the reverse strand.
     * col6: flanking sequence immediately upstream to the barcode in the reads
     * col7: flanking sequence immediately downstream to the UMI in the reads
@@ -132,7 +132,7 @@ blaze -h
 
 
 # Limitation:
-BLAZE has been tested on Chromium **Single Cell 3ʹ gene expression v3** and should also work on **Chromium Single Cell 3ʹ gene expression v2**. However, it doesn't yet support any 10X 5' gene expression kits.
+BLAZE is compatible with the 10X 3' v4 kit, but its performance has not yet been tested on it.
 
 # Citing BLAZE
 
diff --git a/blaze/config.py b/blaze/config.py
index 9a40633..309b89b 100644
--- a/blaze/config.py
+++ b/blaze/config.py
@@ -29,7 +29,7 @@
 # input
 DEFAULT_GRB_MIN_SCORE=15
 DEFAULT_GRB_KIT='3v3'
-DEFAULT_UMI_SIZE = 12 if DEFAULT_GRB_KIT=='3v3' else 10
+DEFAULT_UMI_SIZE = 10 if DEFAULT_GRB_KIT in ['3v2', '5v2'] else 12
 DEFAULT_BC_SIZE = 16
 
 # The 10X barcode whitelists has been packed in the package