Skip to content

Commit

Permalink
Add 5' kit support (merge the support-5prim-kit branch)
Browse files Browse the repository at this point in the history
  • Loading branch information
youyupei committed Aug 15, 2024
2 parents ee14ca7 + 52997b8 commit 8436ebe
Show file tree
Hide file tree
Showing 9 changed files with 117 additions and 206 deletions.
Binary file added blaze/10X_bc/3M-3pgex-may-2023.zip
Binary file not shown.
Binary file added blaze/10X_bc/3M-5pgex-jan-2023.zip
Binary file not shown.
18 changes: 12 additions & 6 deletions blaze/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,30 @@
ADPT_WIN=200 #search adaptor in subsequence from both end of the reads with this size
ADPT_MAC_MATCH_ED=2 #minimum proportion of match required when searching

## format suffix
SEQ_SUFFIX_WIN=200 #search poly T/ TSO in subsequence from both end of the reads with this size
SEQ_SUFFIX_MIN_MATCH_PROP=1 #minimum proportion of match required when searching for poly T/TSO
SEQ_SUFFIX_AFT_ADPT=(20,50) #a poly T / TSO should locate within this range downstream an adaptor

## poly T searching
PLY_T_LEN=4 #length of searched poly T
PLY_T_WIN=200 #search poly T in subsequence from both end of the reads with this size
PLY_T_MIN_MATCH_PROP=1#minimum proportion of match required when searching
PLY_T_NT_AFT_ADPT=(20,50)#a poly T should locate within this range downstream an adaptor
## TSO searching
TSO_SEQ='TTTCTTATATGGG'

####################################################
####### DEFAULT in getting putative bc ######
####################################################
# input
DEFAULT_GRB_MIN_SCORE=15
DEFAULT_GRB_KIT='v3'
DEFAULT_UMI_SIZE = 12 if DEFAULT_GRB_KIT=='v3' else 10
DEFAULT_GRB_KIT='3v3'
DEFAULT_UMI_SIZE = 12 if DEFAULT_GRB_KIT=='3v3' else 10
DEFAULT_BC_SIZE = 16

# The 10X barcode whitelists has been packed in the package
DEFAULT_GRB_WHITELIST_V3=os.path.join(os.path.dirname(__file__), '10X_bc', '3M-february-2018.zip')
DEFAULT_GRB_WHITELIST_3V3=os.path.join(os.path.dirname(__file__), '10X_bc', '3M-february-2018.zip')
DEFAULT_GRB_WHITELIST_V2=os.path.join(os.path.dirname(__file__), '10X_bc', '737K-august-2016.txt')
DEFAULT_GRB_WHITELIST_5V3=os.path.join(os.path.dirname(__file__), '10X_bc', '3M-5pgex-jan-2023.zip')
DEFAULT_GRB_WHITELIST_3V4=os.path.join(os.path.dirname(__file__), '10X_bc', '3M-3pgex-may-2023.zip')

#output
DEFAULT_GRB_OUT_RAW_BC='putative_bc.csv'
Expand Down
15 changes: 8 additions & 7 deletions blaze/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

def get_raw_bc_from_reads(reads, min_q=0):
# Parse fastq -> polyT_adaptor_finder.Read class
def get_raw_bc_from_reads(reads, min_q=0, kit=None):
"""
Get putative BC from each reads from a batch of read (can be defined by batch_iterator function)
Expand Down Expand Up @@ -72,7 +73,8 @@ def get_raw_bc_from_reads(reads, min_q=0):

# create read object
read = polyT_adaptor_finder.Read(read_id = r.id, sequence=str(r.seq),
phred_score=r.q_letter)
phred_score=r.q_letter, kit=kit)


read.get_strand_and_raw_bc()
read_ids.append(read.id)
Expand Down Expand Up @@ -335,19 +337,18 @@ def main():
#for arg in vars(args):
# print(f"{arg}: {getattr(args, arg)}")




######################
###### Getting putative barcodes
######################
if args.do_bc_search:
logger.info(f'Getting putative barcodes from {len(args.fastq_fns)} FASTQ files...')
read_batchs = read_batch_generator(args.fastq_fns, batch_size=args.batch_size)



rst_futures = helper.multiprocessing_submit(get_raw_bc_from_reads,
read_batchs, n_process=args.threads,
min_q=args.minQ)
min_q=args.minQ, kit=args.kit_version)


raw_bc_pass_count = defaultdict(int)

Expand Down
16 changes: 10 additions & 6 deletions blaze/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,12 @@ def get_files_from_dir(fastq_dir):
whitelist_arg_opt.add_argument('--max-edit-distance', type=int, default=DEFAULT_ASSIGNMENT_ED,
help='Maximum edit distance allowed between a putative barcode and a barcode \nfor a read to be assigned to the barcode.')

whitelist_arg_opt.add_argument('--10x-kit-version', dest="kit_version", choices=['v2', 'v3'], default=DEFAULT_GRB_KIT,
help='Choose from 10X Single Cell 3ʹ gene expression v2 or v3. If using other protocols, \n'
whitelist_arg_opt.add_argument('--10x-kit-version', '--kit-version', dest="kit_version", choices=['3v4', '3v3', '3v2', '5v3', '5v2'], default=DEFAULT_GRB_KIT,
help='Choose from 10X Single Cell 3ʹ gene expression v4, v3, v2 (3v4, 3v3, 3v2) or 5ʹ gene expression v3, v2 (5v3, 5v2). If using other protocols, \n'
'please do not specify this option and specify --full-bc-whitelist instead.')

whitelist_arg_opt.add_argument('--full-bc-whitelist',
type=lambda x: x if helper.check_files_exist(fastq_dir) else None,
type=lambda x: x if helper.check_files_exist(x) else None,
default=None,
help='Filename of the full barcode whitelist. If not specified, the corresponding version of 10X whitelist will be used.')
whitelist_arg_opt.add_argument('--high-sensitivity-mode', action='store_true',
Expand Down Expand Up @@ -172,9 +172,13 @@ def get_files_from_dir(fastq_dir):
helper.warning_msg(textwrap.dedent(
f'You are using {os.path.basename(args.full_bc_whitelist)} as the full barcode '\
'whitelist. Note that the barcodes not listed in the file will never be found.'))
elif args.kit_version == 'v3':
args.full_bc_whitelist = DEFAULT_GRB_WHITELIST_V3
elif args.kit_version == 'v2':
elif args.kit_version == '3v4':
args.full_bc_whitelist = DEFAULT_GRB_WHITELIST_3V4
elif args.kit_version == '3v3':
args.full_bc_whitelist = DEFAULT_GRB_WHITELIST_3V3
elif args.kit_version == '5v3':
args.full_bc_whitelist = DEFAULT_GRB_WHITELIST_5V3
elif args.kit_version == '5v2' or args.kit_version == '3v2':
args.full_bc_whitelist = DEFAULT_GRB_WHITELIST_V2
else:
helper.err_msg("Error: Invalid value of --kit-version, please choose from v3 or v2 or specify --full-bc-whitelist.", printit=True)
Expand Down
117 changes: 84 additions & 33 deletions blaze/polyT_adaptor_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@ class Read(object):
"""
def __init__(self, read_id, sequence, phred_score=None, strand = None, **kwargs):
def __init__(self, read_id, sequence, phred_score=None, strand = None, kit=None, **kwargs):
self.id = read_id
self.seq = sequence
self.phred_score = phred_score
self._strand = strand
self._get_strand_and_raw_bc_flag = False # record whether the raw bc searching has been performed
self.kit = kit
for key in kwargs.keys():
self.__dict__[key] = kwargs[key]

Expand All @@ -39,35 +40,53 @@ def add(self, attr_name, attr_val, overwrite = True):
else:
self.__dict__[attr_name] = attr_val


def find_adaptor(self,read=None, strand=None, adaptor_seq=ADPT_SEQ,
num_nt=ADPT_WIN, max_ed=ADPT_MAC_MATCH_ED):
'''
find adaptor from a read
Parameters
----------
read : STR
num_nt : INT
The adaptor will be searched within first num_nt bases in the reads.
strand : '+' or '-'
Transcript strand, this function will find adaptor in '-' poly T strand
Will do a reverse complement if strand == '-'
adaptor_seq : STR
10X adaptor sequence, the full-length adaptor sequence is
'CTACACGACGCTCTTCCGATCT' (22nt). Last 12nt is used by default.
max_ed : float
max edit distance in the adaptor alignment. Default: 2
Returns
-------
Dict
'''
# check input
def find_adapter_5_prime(self, read=None, strand=None, adapter_seq=ADPT_SEQ,
num_nt=ADPT_WIN,max_ed=ADPT_MAC_MATCH_ED,
tso_seq=TSO_SEQ):

strand = self._strand if not strand else strand
read = self.seq if not read else read

if strand == '-':
seq = read[:num_nt]
d1, d2 = SEQ_SUFFIX_AFT_ADPT
_, tso_seq_end = sub_edit_distance(seq, tso_seq, max_ed)
if tso_seq_end > 0:
tso_seq_start = max(0, tso_seq_end-len(TSO_SEQ))

_, sub_seq_end = sub_edit_distance(
seq[max(0,tso_seq_start-d2):max(0,tso_seq_start-d1)],
adapter_seq, max_ed)

return {'-':[max(0,tso_seq_start-d2)+sub_seq_end+1]} if sub_seq_end > 0 else {}
else:
return {}

if strand == '+':
read = helper.reverse_complement(read[-num_nt:])
adpt_ends = self.find_adapter_5_prime(
read=read, strand='-', adapter_seq=adapter_seq, num_nt=num_nt,
max_ed=max_ed, tso_seq=tso_seq).get('-', [])

return {'+':adpt_ends} if len(adpt_ends) else {}
else:

fwd_strand = self.find_adapter_5_prime(
strand='-', adapter_seq=adapter_seq,
num_nt=num_nt, tso_seq=tso_seq)
rev_strand = self.find_adapter_5_prime(
strand='+', adapter_seq=adapter_seq,
num_nt=num_nt, tso_seq=tso_seq)

rst = {**{k:v for k,v in fwd_strand.items() if len(v)},
**{k:v for k,v in rev_strand.items() if len(v)}}
return rst

def find_adaptor_3_prime(self, read=None, strand=None, adaptor_seq=ADPT_SEQ,
num_nt=ADPT_WIN, max_ed=ADPT_MAC_MATCH_ED):

def find_poly_T(seq, poly_T_len=PLY_T_LEN,
min_match_prop=PLY_T_MIN_MATCH_PROP):
min_match_prop=SEQ_SUFFIX_MIN_MATCH_PROP):
'''Find poly T in seq
Parameters
----------
Expand All @@ -93,13 +112,14 @@ def find_poly_T(seq, poly_T_len=PLY_T_LEN,
T_prop = helper.sliding_window_mean(read_code, poly_T_len)
return np.where(T_prop >= min_match_prop)[0]


strand = self._strand if not strand else strand
read = self.seq if not read else read

if strand == '-':
seq = read[:num_nt]

# find polyT
ply_T_idx = find_poly_T(seq)
d1, d2 = PLY_T_NT_AFT_ADPT
d1, d2 = SEQ_SUFFIX_AFT_ADPT
ply_T_idx = ply_T_idx[ply_T_idx >= d1]
adpt_ends = []
searching_win = []
Expand Down Expand Up @@ -127,22 +147,53 @@ def find_poly_T(seq, poly_T_len=PLY_T_LEN,
# take reverse complement if read is coming from transcript strand (with ployA instead ployT)
if strand == '+':
read = helper.reverse_complement(read[-num_nt:])
adpt_ends = self.find_adaptor(
adpt_ends = self.find_adaptor_3_prime(
read, strand='-', adaptor_seq=adaptor_seq,
num_nt=num_nt).get('-',[])
return {'+':adpt_ends} if len(adpt_ends) else {}

else:
T_strand = self.find_adaptor(
T_strand = self.find_adaptor_3_prime(
strand='-', adaptor_seq=adaptor_seq,
num_nt=num_nt)
A_strand = self.find_adaptor(
A_strand = self.find_adaptor_3_prime(
strand='+', adaptor_seq=adaptor_seq,
num_nt=num_nt)
rst = {**{k:v for k,v in T_strand.items() if len(v)},
**{k:v for k,v in A_strand.items() if len(v)}}
return rst


def find_adaptor(self,read=None, strand=None, adaptor_seq=ADPT_SEQ,
num_nt=ADPT_WIN, max_ed=ADPT_MAC_MATCH_ED, tso_seq=TSO_SEQ):
'''
find adaptor from a read
Parameters
----------
read : STR
num_nt : INT
The adaptor will be searched within first num_nt bases in the reads.
strand : '+' or '-'
Transcript strand, this function will find adaptor in '-' poly T strand
Will do a reverse complement if strand == '-'
adaptor_seq : STR
10X adaptor sequence, the full-length adaptor sequence is
'CTACACGACGCTCTTCCGATCT' (22nt). Last 12nt is used by default.
max_ed : float
max edit distance in the adaptor alignment. Default: 2
Returns
-------
Dict
'''

if self.kit == '3v4' or self.kit == '3v3' or self.kit == '3v2':
return self.find_adaptor_3_prime(read=read, strand=strand,
adaptor_seq=adaptor_seq, num_nt=num_nt, max_ed=max_ed)
else:
return self.find_adapter_5_prime(read=read, strand=strand,
adapter_seq=adaptor_seq, num_nt=num_nt, max_ed=max_ed,
tso_seq=tso_seq)

def get_strand_and_raw_bc(self):
'''
Expand Down Expand Up @@ -373,4 +424,4 @@ def main():
if __name__ == '__main__':
main()



Loading

0 comments on commit 8436ebe

Please sign in to comment.