From 30f04e4e30251d779bf9b17861d810c3d6ef84ae Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:18:34 +0100 Subject: [PATCH 1/5] test(takara): add findFirstSpacer test for takarabio --- tests/test_collapse.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_collapse.py b/tests/test_collapse.py index b6eb44e..36b77bd 100644 --- a/tests/test_collapse.py +++ b/tests/test_collapse.py @@ -170,6 +170,18 @@ def test_nebio(self): oligo, seq, oligo_start, oligo_end ) == [oligo["spcr1"]] + def test_takara(self): + oligo = { + "spcr1": "GTACGGG", + } + seq = "CTCGTTAGGTTCGTACGGGGATTGCA" + oligo_start = 0 + oligo_end = oligo_start + 19 + + assert collapse.findFirstSpacer( + oligo, seq, oligo_start, oligo_end + ) == [oligo["spcr1"]] + class TestReadInData: From 11ddc717ea05f112006ab2511da60fd0df97888e Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:08:18 +0100 Subject: [PATCH 2/5] feat(takara): add takarabio umi search --- src/decombinator/collapse.py | 73 +++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/src/decombinator/collapse.py b/src/decombinator/collapse.py index b672614..86e787d 100644 --- a/src/decombinator/collapse.py +++ b/src/decombinator/collapse.py @@ -11,7 +11,7 @@ This version is a modified version of KB's script collapsinator_20141126.py (That was itself an improved version of the CollapseTCRs.py script used in the Heather et al HIV TCR paper (DOI: 10.3389/fimmu.2015.00644)) Version 4.0.2 includes improved clustering routines measuring the similarity in both barcode and TCR sequence of TCR repertoire data - + NOTE - from version 4.1 this optionally looks for barcode 6NI86N at the beginning of the read; instead of M13_6N_I8_6N_I8 (i.e. only one spacer). 
This makes it compatible with the multiplex protocol in which the barcode is incorproated in the RT step @@ -19,18 +19,18 @@ ################## ###### INPUT ##### ################## - - Required inputs - -in/--infile : Defines input file. Takes as input .n12 files produced by Decombinator (v3 or higher), + + Required inputs + -in/--infile : Defines input file. Takes as input .n12 files produced by Decombinator (v3 or higher), assuming it has been run on suitably barcoded and demultiplexed data. -ol/--oligo : Specifies the spacer (protocol dependent) as M13, I8, I8_single. The I8 protocol is deprecated. - + Other optional flags: - - -s/--supresssummary: Supress the production of a summary file containing details of the run into a 'Logs' directory. - - -dz/--dontgzip: Suppress the automatic compression of output demultiplexed FASTQ files with gzip. - + + -s/--supresssummary: Suppress the production of a summary file containing details of the run into a 'Logs' directory. + + -dz/--dontgzip: Suppress the automatic compression of output demultiplexed FASTQ files with gzip. + -dc/--dontcount: Suppress the whether or not to show the running line count, every 100,000 reads. Helps in monitoring progress of large batches. The other optional flags are somewhat complex, and caution is advised in their alteration. 
@@ -41,30 +41,31 @@ V index, J index, V deletions, J deletions, insert, ID, inter-tag TCR sequence, inter-tag quality, barcode sequence, barcode quality ################## -##### OUTPUT ##### +##### OUTPUT ##### ################## - + A Decombinator index file, giving each error-corrected DCR index, and the frequency with which it appears in the final processed data, and an average UMI count, which can be used to estimate the robustness of the data for that particular sequence -#######################################################################################################################################################""" +####################################################################################################################################################### +""" import ast import collections as coll import gzip -from importlib import metadata import os +import sys import time import typing -from scipy import sparse -import sys +from importlib import metadata import networkx as nx import polyleven import pyrepseq.nn as prsnn import regex import scipy.sparse +from scipy import sparse ######################################################################################################################## # Functions @@ -97,7 +98,9 @@ def check_dcr_file(infile, opener): return False print(os.path.getsize(infile)) if os.path.getsize(infile) == 0: - raise ValueError("Input file appears to be empty; please double-check path.") + raise ValueError( + "Input file appears to be empty; please double-check path." 
+ ) # Check first few lines with opener(infile, "rt") as poss_dcr: @@ -174,6 +177,7 @@ def getOligo(oligo_name): oligos["i8"] = {"spcr1": "GTCGTGAT", "spcr2": "GTCGTGAT"} oligos["i8_single"] = {"spcr1": "ATCACGAC"} oligos["nebio"] = {"spcr1": "TACGGG"} + oligos["takara"] = {"spcr1": "GTACGGG"} if oligo_name.lower() not in oligos: print( @@ -365,14 +369,20 @@ def get_barcode_positions( ) -> list[int]: """ Given a barcode-region sequence, outputs the sequence of the do-docamer barcode. - This barcode (theoretically) consists of the concatentation of the two random hexamer sequences contained in the ligation oligo. + For m13 and i8 oligos, this barcode (theoretically) consists of the concatenation of the two random hexamer sequences contained in the ligation oligo. However errors in sequences and ligation oligo production can mean that the random nucleotides are not always at the expected position. This function uses the known sequence of the spacers (which bound each of the two N6s to their 5') to deduce the random sequences. Returns a list of four numbers, giving the start and stop positions of N1 and N2 respectively. """ - if str.lower(inputargs["oligo"]) not in ["i8", "i8_single", "m13", "nebio"]: + if str.lower(inputargs["oligo"]) not in [ + "i8", + "i8_single", + "m13", + "nebio", + "takara", + ]: raise ValueError( - "The flag for the -ol input must be one of M13, I8, I8_single, or NEBIO" + "The flag for the -ol input must be one of M13, I8, I8_single, NEBIO, or TAKARA." 
) if ( @@ -388,6 +398,9 @@ def get_barcode_positions( if str.lower(inputargs["oligo"]) == "nebio": oligo_start = 18 oligo_end = oligo_start + 10 + elif str.lower(inputargs["oligo"]) == "takara": + oligo_start = 0 + oligo_end = 19 else: oligo_start = 0 allowance = 10 @@ -401,7 +414,7 @@ def get_barcode_positions( # sets second spacer based on specified oligo (unless single oligo) - if str.lower(inputargs["oligo"]) not in ["i8_single", "nebio"]: + if str.lower(inputargs["oligo"]) not in ["i8_single", "nebio", "takara"]: spacers += findSecondSpacer(oligo, bcseq) # sequences which do not have two spacers are logged then removed from analysis if not len(spacers) == 2: @@ -409,9 +422,12 @@ def get_barcode_positions( return None spacer_positions = getSpacerPositions(bcseq, spacers) - if str.lower(inputargs["oligo"]) == "nebio": + if str.lower(inputargs["oligo"]) in ["nebio", "takara"]: # set expected barcode length - bclength = inputargs["bclength"] + if str.lower(inputargs["oligo"]) == "nebio": + bclength = 17 + else: + bclength = 12 # start and end of barcode positions are set b1start = 0 b1end = bclength @@ -588,7 +604,10 @@ def read_in_data( barcode + "|" + str(index[0]) + "|" + index[1] ] - barcode_lookup[barcode][index[0]] = [index[0], protoseq] + barcode_lookup[barcode][index[0]] = [ + index[0], + protoseq, + ] group_assigned = True # if assigned to a group, stop and move onto next read @@ -628,7 +647,7 @@ def read_in_data( def create_clustering_objs( - barcode_dcretc: dict[str, list[str]] + barcode_dcretc: dict[str, list[str]], ) -> tuple[int, list[tuple[str, str]], list[tuple[str, str]]]: # get number of initial groups @@ -745,7 +764,9 @@ def write_clusters(clusters): count += 1 os.mkdir(dirname) - print(" Writing clusters to directory: ", os.path.abspath(dirname), "...") + print( + " Writing clusters to directory: ", os.path.abspath(dirname), "..." 
+ ) # write data of each cluster to a separate file and store in clusters directory for k in clusters: with open( From 342dec5eb0cf8d51b74cdf450849e68b79d9a1e1 Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:08:46 +0100 Subject: [PATCH 3/5] test(takara): unit test takara umi position finding --- tests/test_collapse.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/test_collapse.py b/tests/test_collapse.py index 36b77bd..c43e633 100644 --- a/tests/test_collapse.py +++ b/tests/test_collapse.py @@ -108,12 +108,23 @@ def test_nebio(self, counter): inputargs = { "oligo": "nebio", "allowNs": False, - "bclength": 18, } assert collapse.get_barcode_positions(bcseq, inputargs, counter) == [ 0, - 18, + 17, + ] + + def test_takara(self, counter): + bcseq = "CTCGTTAGGTTCGTACGGGGATTGCA" + inputargs = { + "oligo": "takara", + "allowNs": False, + } + + assert collapse.get_barcode_positions(bcseq, inputargs, counter) == [ + 0, + 12, ] @@ -176,7 +187,7 @@ def test_takara(self): } seq = "CTCGTTAGGTTCGTACGGGGATTGCA" oligo_start = 0 - oligo_end = oligo_start + 19 + oligo_end = 19 assert collapse.findFirstSpacer( oligo, seq, oligo_start, oligo_end From f1d3f8cacba4b0e1611ecccf2f2c81039d927431 Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:21:32 +0100 Subject: [PATCH 4/5] docs(takara): add TAKARA to list of oligo choices --- README.md | 4 ++-- src/decombinator/collapse.py | 2 +- src/decombinator/io.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f444391..1d6b4ab 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ decombinator pipeline -in XXXX.fq -c b -br R2 -bl 42 -ol M13 | `-di`, `--dontcheckinput` | Override the input file sanity check | | `-bd`, `--barcodeduplication` | Optionally output a file containing the final list of clustered barcodes, and their 
frequencies | | `-pb`, `--positionalbarcodes` | Instead of inferring random barcode sequences from their context relative to spacer sequences, just take the sequence at the default positions. Useful to salvage runs when R2 quality is terrible. | -| `-ol OLIGO`, `--oligo OLIGO` | Choose experimental oligo for correct identification of spacers ["M13", "I8", "I8_single", "NEBIO"] (default: M13) | +| `-ol OLIGO`, `--oligo OLIGO` | Choose experimental oligo for correct identification of spacers ["M13", "I8", "I8_single", "NEBIO", "TAKARA"] (default: M13) | | `-wc`, `--writeclusters` | Write cluster data to separate cluster files | | `-uh`, `--UMIhistogram` | Creates histogram of average UMI cluster sizes | | `-npf`, `--nonproductivefilter` | Filter out non-productive reads from the output | @@ -202,7 +202,7 @@ decombinator collapse -in XXXX.n12 -c b -ol M13 | `-di`, `--dontcheckinput` | Override the input file sanity check | | `-bd`, `--barcodeduplication` | Optionally output a file containing the final list of clustered barcodes, and their frequencies | | `-pb`, `--positionalbarcodes` | Instead of inferring random barcode sequences from their context relative to spacer sequences, just take the sequence at the default positions. Useful to salvage runs when R2 quality is terrible. 
| -| `-ol OLIGO`, `--oligo OLIGO` | Choose experimental oligo for correct identification of spacers ["M13", "I8", "I8_single", "NEBIO"] (default: M13) | +| `-ol OLIGO`, `--oligo OLIGO` | Choose experimental oligo for correct identification of spacers ["M13", "I8", "I8_single", "NEBIO", "TAKARA"] (default: M13) | | `-wc`, `--writeclusters` | Write cluster data to separate cluster files | | `-uh`, `--UMIhistogram` | Creates histogram of average UMI cluster sizes | diff --git a/src/decombinator/collapse.py b/src/decombinator/collapse.py index 86e787d..541abf3 100644 --- a/src/decombinator/collapse.py +++ b/src/decombinator/collapse.py @@ -23,7 +23,7 @@ Required inputs -in/--infile : Defines input file. Takes as input .n12 files produced by Decombinator (v3 or higher), assuming it has been run on suitably barcoded and demultiplexed data. - -ol/--oligo : Specifies the spacer (protocol dependent) as M13, I8, I8_single. The I8 protocol is deprecated. + -ol/--oligo : Specifies the spacer (protocol dependent) as M13, I8, I8_single, NEBIO, or TAKARA. The I8 protocol is deprecated. 
Other optional flags: diff --git a/src/decombinator/io.py b/src/decombinator/io.py index 3d8c43c..d335250 100644 --- a/src/decombinator/io.py +++ b/src/decombinator/io.py @@ -312,7 +312,7 @@ def add_collapse_arguments(parser: argparse.ArgumentParser): type=str, required=True, default="m13", - help='Choose experimental oligo for correct identification of spacers ["M13", "I8", "I8_single", "NEBIO"] (default: M13)', + help='Choose experimental oligo for correct identification of spacers ["M13", "I8", "I8_single", "NEBIO", "TAKARA"] (default: M13)', ) parser.add_argument( "-wc", From 5ead5016a6460c52e2da3d4b8be724696b4dcb1d Mon Sep 17 00:00:00 2001 From: MVCowley <51127523+MVCowley@users.noreply.github.com> Date: Tue, 5 Aug 2025 14:24:56 +0100 Subject: [PATCH 5/5] feat(takara): add takara to single umi set_barcode condition --- src/decombinator/collapse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/decombinator/collapse.py b/src/decombinator/collapse.py index 541abf3..257c8d9 100644 --- a/src/decombinator/collapse.py +++ b/src/decombinator/collapse.py @@ -281,7 +281,7 @@ def set_barcode( fields: list[str], bc_locs: list[int], inputargs: dict ) -> tuple[str, str]: # account for N1 barcode being greater or shorter than 6 nt (due to manufacturing errors) - if str.lower(inputargs["oligo"]) == "nebio": + if str.lower(inputargs["oligo"]) in ["nebio", "takara"]: barcode = fields[8][bc_locs[0] : bc_locs[1]] barcode_qualstring = fields[9][bc_locs[0] : bc_locs[1]] else: