diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..16c728a --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +build +build_pipeline +Example_scripts_and_sequences +anarci-1.* +.idea +*.pyc +.DS_Store +lib/python/anarci/__pycache__ \ No newline at end of file diff --git a/INSTALL b/INSTALL index 361a1c3..1341402 100644 --- a/INSTALL +++ b/INSTALL @@ -20,3 +20,9 @@ https://docs.python.org/2/install/#alternate-installation-the-user-scheme For help see README or run: $ ANARCI -h + +To package source code: +echo anarci-1.3.1 | xargs -I {} sh -c "mkdir {} && cp -r lib {} && cp -r bin {} && cp static_setup.py {} && rm -rf {}/lib/python/anarci/__pycache__ && rm -rf {}/lib/python/anarci/*.csv && tar -czf {}\.tar\.gz {}" + +To build and install on a local machine using the pre-built HMMs (lib/python/anarci/dat/HMMs), run static_setup.py, which skips building the HMMs: +python static_setup.py install \ No newline at end of file diff --git a/bin/ANARCI b/bin/ANARCI index 578f439..47c21be 100755 --- a/bin/ANARCI +++ b/bin/ANARCI @@ -112,6 +112,7 @@ if __name__ == "__main__": parser.add_argument( '--assign_germline', action = 'store_true', default=False, help="Assign the v and j germlines to the sequence.
The most sequence identical germline is assigned.", dest="assign_germline") parser.add_argument( '--use_species', type=str, default=False, help="Use a specific species in the germline assignment.", choices=all_species, dest="use_species") parser.add_argument( '--bit_score_threshold', type=int, default=80, help="Change the bit score threshold used to confirm an alignment should be used.", dest="bit_score_threshold") + parser.add_argument( '--cdr_scheme','-cs', type=str, help="Use cdr scheme will cause scheme loss impact.", dest="cdr_scheme") args = parser.parse_args() @@ -187,7 +188,7 @@ if __name__ == "__main__": sequences, numbered, alignment_details, hit_tables = run_anarci(args.inputsequence, scheme=args.scheme, output=True, outfile=outfile, csv=args.csv, allow=allow, ncpu=args.ncpu, assign_germline=args.assign_germline, allowed_species=allowed_species, - bit_score_threshold=args.bit_score_threshold ) + bit_score_threshold=args.bit_score_threshold, cdr_scheme=args.cdr_scheme) if hitfile: with open( hitfile, "w") as outfile: diff --git a/lib/python/anarci/__pycache__/__init__.cpython-36.pyc b/lib/python/anarci/__pycache__/__init__.cpython-36.pyc deleted file mode 100644 index b0d6f65..0000000 Binary files a/lib/python/anarci/__pycache__/__init__.cpython-36.pyc and /dev/null differ diff --git a/lib/python/anarci/__pycache__/__init__.cpython-38.pyc b/lib/python/anarci/__pycache__/__init__.cpython-38.pyc deleted file mode 100644 index 478abd9..0000000 Binary files a/lib/python/anarci/__pycache__/__init__.cpython-38.pyc and /dev/null differ diff --git a/lib/python/anarci/__pycache__/anarci.cpython-36.pyc b/lib/python/anarci/__pycache__/anarci.cpython-36.pyc deleted file mode 100644 index 7f43a0a..0000000 Binary files a/lib/python/anarci/__pycache__/anarci.cpython-36.pyc and /dev/null differ diff --git a/lib/python/anarci/__pycache__/anarci.cpython-38.pyc b/lib/python/anarci/__pycache__/anarci.cpython-38.pyc deleted file mode 100644 index 10e6fe8..0000000 Binary files 
a/lib/python/anarci/__pycache__/anarci.cpython-38.pyc and /dev/null differ diff --git a/lib/python/anarci/__pycache__/germlines.cpython-36.pyc b/lib/python/anarci/__pycache__/germlines.cpython-36.pyc deleted file mode 100644 index 027e0c7..0000000 Binary files a/lib/python/anarci/__pycache__/germlines.cpython-36.pyc and /dev/null differ diff --git a/lib/python/anarci/__pycache__/germlines.cpython-38.pyc b/lib/python/anarci/__pycache__/germlines.cpython-38.pyc deleted file mode 100644 index 651240b..0000000 Binary files a/lib/python/anarci/__pycache__/germlines.cpython-38.pyc and /dev/null differ diff --git a/lib/python/anarci/__pycache__/schemes.cpython-36.pyc b/lib/python/anarci/__pycache__/schemes.cpython-36.pyc deleted file mode 100644 index 0cad6a1..0000000 Binary files a/lib/python/anarci/__pycache__/schemes.cpython-36.pyc and /dev/null differ diff --git a/lib/python/anarci/__pycache__/schemes.cpython-38.pyc b/lib/python/anarci/__pycache__/schemes.cpython-38.pyc deleted file mode 100644 index 065b2b1..0000000 Binary files a/lib/python/anarci/__pycache__/schemes.cpython-38.pyc and /dev/null differ diff --git a/lib/python/anarci/anarci.py b/lib/python/anarci/anarci.py index 01bc657..6cc2b63 100644 --- a/lib/python/anarci/anarci.py +++ b/lib/python/anarci/anarci.py @@ -69,7 +69,7 @@ all_species = list(all_germlines['V']['H'].keys()) -amino_acids = sorted(list("QWERTYIPASDFGHKLCVNM")) +amino_acids = sorted(list("QWERTYIPASDFGHKLCVNMUOBJZX")) set_amino_acids = set(amino_acids) anarci_path = os.path.split(__file__)[0] @@ -138,7 +138,10 @@ def validate_numbering(xxx_todo_changeme, name_seq=[]): Further validation could be done but at the moment we just check that the numbering indices are incremental (they should be) """ - (numbering, start, end) = xxx_todo_changeme + numbering = xxx_todo_changeme[0] + start = xxx_todo_changeme[1] + end = xxx_todo_changeme[2] + cdrs = xxx_todo_changeme[3] if len(xxx_todo_changeme) >=4 else [] name, seq = name_seq last = -1 nseq="" @@ 
-150,7 +153,7 @@ def validate_numbering(xxx_todo_changeme, name_seq=[]): assert nseq in seq.replace("-",""), "The algorithm did not number a contiguous segment for sequence %s. Please report"%name - return numbering, start, end + return numbering, start, end, cdrs def grouper(n, iterable): ''' @@ -189,10 +192,8 @@ def anarci_output(numbered, sequences, alignment_details, outfile, sequence_id=N print("# Most significant HMM hit", file=outfile) print("#|species|chain_type|e-value|score|seqstart_index|seqend_index|", file=outfile) alignment_details[i][j]["evalue"] = str( alignment_details[i][j]["evalue"] ) - print("#|%s|%s|%s|%.1f|%d|%d|"%tuple( [alignment_details[i][j][field] for field in - ["species","chain_type","evalue","bitscore"]] - +[ numbered[i][j][1], numbered[i][j][2]] ), file=outfile) - + print("#|%s|%s|%s|%.1f|%s|%s|"%tuple( [alignment_details[i][j][field] for field in + ["species","chain_type","evalue","bitscore","query_start","query_end"]]), file=outfile) if 'germlines' in alignment_details[i][j]: print('# Most sequence-identical germlines', file=outfile) print('#|species|v_gene|v_identity|j_gene|j_identity|', file=outfile) @@ -279,8 +280,8 @@ def csv_output(sequences, numbered, details, outfileroot): details[i][j].get('chain_type',''), str(details[i][j].get('evalue','')), str(details[i][j].get('bitscore','')), - str(numbered[i][j][1]), - str(numbered[i][j][2]), + str(details[i][j].get('query_start', '')), + str(details[i][j].get('query_end', '')), details[i][j].get('germlines',{}).get( 'v_gene',[['',''],0] )[0][0], details[i][j].get('germlines',{}).get( 'v_gene',[['',''],0] )[0][1], '%.2f'%details[i][j].get('germlines',{}).get( 'v_gene',[['',''],0] )[1], @@ -294,6 +295,89 @@ def csv_output(sequences, numbered, details, outfileroot): assert len( line ) == len( fields ) print(','.join( line ), file=out) +def csv_output_alignments(sequences, numbered, details, outfileroot, cdr_scheme): + with open(outfileroot + '_alignments.csv', 'w') as out: + fields = 
['query_no', 'query_name', 'id', 'description', 'evalue', 'bitscore', 'query_start', 'query_end', + 'species', 'chain_type', 'scheme', 'bias', 'cdr_1', 'cdr_2', 'cdr_3'] + print(','.join(fields), file=out) + + for i in range(len(sequences)): # Iterate over entries + if numbered[i] is None: continue + for j in range(len(numbered[i])): + if details is None: continue + item = details[i][j] + chain_type = item.get('chain_type', '') + single_numbered = numbered[i][j] + cdrs = enhanced_find_cdrs(single_numbered, cdr_scheme, chain_type) + line = [str(i), + item.get('query_name', ''), + item.get('id', ''), + item.get('description', ''), + str(item.get('evalue', '')), + str(item.get('bitscore', '')), + str(item.get('query_start', '')), + str(item.get('query_end', '')), + item.get('species', ''), + chain_type, + item.get('scheme', ''), + str(item.get('bias', '')), + '' if not cdrs else ''.join([str(p[1]) for p in cdrs[0]]).replace('-', ''), + '' if not cdrs else ''.join([str(p[1]) for p in cdrs[1]]).replace('-', ''), + '' if not cdrs else ''.join([str(p[1]) for p in cdrs[2]]).replace('-', '') + ] + print(','.join(line), file=out) + +def get_defaults_from_cdr_scheme(number_scheme, allow, cdr_scheme): + target_scheme = number_scheme + target_allow = allow + if cdr_scheme: + if cdr_scheme in ['contact', 'chothia', 'abm', 'kabat']: + target_scheme = 'chothia' + target_allow = ["H", "K", "L"] + elif cdr_scheme == 'imgt': + target_scheme = 'imgt' + target_allow = allow + return target_scheme, target_allow + +def enhanced_find_cdrs(single_numbered, cdr_scheme, chain_type): + if cdr_scheme is None: + return None + + if cdr_scheme == 'imgt': + return single_numbered[3] + + cdr_map = dict() + # 参考网站: + # http://www.bioinf.org.uk/abs/info.html (chothia的定义方案做了更新,跟其他的参考网站不一样,本方案依然采用老方案) + # https://plueckthun.bioc.uzh.ch/antibody/Numbering/Numbering.html (也是本程序参考的网站) + # https://www.novopro.cn/tools/cdr.html (chothia的cdr定义方案有bug,采用了kabat编码方案,没有对H32..H34进行逻辑判断) + # 
http://www.abysis.org/abysis/about/definitions/definitions.cgi (REGIONS对应定义方案,cdr定义方案是一致的) + #注:Contact,Chothia,AbM, Kabat直接采用Chothia编码方案,为的是统一, IMGT利用本程序直接返回CDR,暂时没有找到对应的方案 + cdr_map['chothia_heavy'] = [['26','32'],['52','56'],['95','102']], [[],[],[]] + cdr_map['chothia_light'] = [['24','34'],['50','56'],['89','97']],[[],[],[]] + cdr_map['kabat_light'] = [['24','34'],['50','56'],['89','97']], [[],[],[]] + cdr_map['kabat_heavy'] = [['31','35'],['50','65'],['95','102']],[[],[],[]] + cdr_map['contact_light'] = [['30','36'],['46','55'],['89','96']], [[],[],[]] + cdr_map['contact_heavy'] = [['30','35'],['47','58'],['93','101']], [[],[],[]] + cdr_map['abm_light'] = [['24','34'],['50','56'],['89','97']], [[],[],[]] + cdr_map['abm_heavy'] = [['26','35'],['50','58'],['95','102']], [[],[],[]] + + if chain_type == 'H': + cdr_key = cdr_scheme + '_heavy' + elif chain_type in 'KL': + cdr_key = cdr_scheme + '_light' + + if cdr_key is None or cdr_key not in cdr_map.keys() or single_numbered is None: + return None + + cdr_regions = cdr_map[cdr_key][0] + for index, range in enumerate(cdr_regions): + cdr_idx = [i for i, (x, y) in enumerate(single_numbered[0]) if + (str(x[0]) + x[1]).strip() == range[0] or (str(x[0]) + x[1]).strip() == range[1]] + if len(cdr_idx) == 2: + #末尾是个开区间,所以要+1 + cdr_map[cdr_key][1][index] = single_numbered[0][cdr_idx[0]:cdr_idx[1]+1] + return cdr_map[cdr_key][1] ## Parsing and recognising domain hits from hmmscan ## @@ -741,7 +825,7 @@ def check_for_j( sequences, alignments, scheme ): # Sandwich the presumed CDR3 region between the V and J regions. 
vRegion = ali[:cys_ai+1] - jRegion = [ (state, index+cys_si+1) for state, index in re_states[0] if state[0] >= 117 ] + jRegion = [ (state, index+cys_si+1) for state, index in re_states[0] if state[0] >= 117] cdrRegion = [] next = 105 for si in range( cys_si+1, jRegion[0][1] ): @@ -764,7 +848,8 @@ def check_for_j( sequences, alignments, scheme ): # Main function for ANARCI # Name conflict with function, module and package is kept for legacy unless issues are reported in future. def anarci(sequences, scheme="imgt", database="ALL", output=False, outfile=None, csv=False, allow=set(["H","K","L","A","B","G","D"]), - hmmerpath="", ncpu=None, assign_germline=False, allowed_species=None, bit_score_threshold=80): + hmmerpath="", ncpu=None, assign_germline=False, allowed_species=None, bit_score_threshold=80, + cdr_scheme=None): """ The main function for anarci. Identify antibody and TCR domains, number them and annotate their germline and species. @@ -833,15 +918,17 @@ def anarci(sequences, scheme="imgt", database="ALL", output=False, outfile=None, # Modify alignments in-place check_for_j( sequences, alignments, scheme ) + scheme, allow = get_defaults_from_cdr_scheme(scheme, allow, cdr_scheme) + # Apply the desired numbering scheme to all sequences - numbered, alignment_details, hit_tables = number_sequences_from_alignment(sequences, alignments, scheme=scheme, allow=allow, + numbered, alignment_details, hit_tables = number_sequences_from_alignment(sequences, alignments, scheme=scheme, allow=allow, assign_germline=assign_germline, allowed_species=allowed_species) # Output if necessary if output: if csv: - csv_output(sequences, numbered, details, outfile) + csv_output(sequences, numbered, alignment_details, outfile) else: outto, close=sys.stdout, False if outfile: @@ -938,7 +1025,8 @@ def run_anarci( seq, ncpu=1, **kwargs): # Output if necessary if output: if csv: - csv_output(sequences, numbered, alignment_details, outfile) + csv_output(sequences, numbered, alignment_details, 
outfile) + csv_output_alignments(sequences, numbered, alignment_details, outfile, kwargs.get('cdr_scheme')) else: outto, close=sys.stdout, False if outfile: @@ -993,21 +1081,27 @@ def number(sequence, scheme="imgt", database="ALL", allow=set(["H","K","L","A"," return False, False if __name__ == "__main__": - # Test and example useage of the anarci function. - sequences = [ ("12e8:H","EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLEWIGWIDPEIGDTEYVPKFQGKATMTADTSSNTAYLQLSSLTSEDTAVYYCNAGHDYDRGRFPYWGQGTLVTVSAAKTTPPSVYPLAP"), - ("12e8:L","DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKLMIYSASNRYTGVPDRFTGSGSGTDFTLTISNMQSEDLADYFCQQYSSYPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASV"), - ("scfv:A","DIQMTQSPSSLSASVGDRVTITCRTSGNIHNYLTWYQQKPGKAPQLLIYNAKTLADGVPSRFSGSGSGTQFTLTISSLQPEDFANYYCQHFWSLPFTFGQGTKVEIKRTGGGGSGGGGSGGGGSGGGGSEVQLVESGGGLVQPGGSLRLSCAASGFDFSRYDMSWVRQAPGKRLEWVAYISSGGGSTYFPDTVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARQNKKLTWFDYWGQGTLVTVSSHHHHHH"), - ("lysozyme:A","KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL")] - - results = anarci(sequences, scheme="imgt", output=True) - numbering, alignment_details, hit_tables = results - - expect_one_VH_domain_numbering, expect_one_VL_domain_numbering, expect_VH_then_VL_numbering, expect_None = numbering - assert len(expect_one_VH_domain_numbering) == 1 - assert len(expect_one_VL_domain_numbering) == 1 - assert len(expect_VH_then_VL_numbering) == 2 - assert expect_None == None - + # Test and example useage of the anarci function. 
+ + sequences = [("seq1", + "MASISIFIVVFAFFTQESSGQITVTQTPAVKAVLHGQTVTMSCKVSPAVHNNNYLAWYQQEPGEAPKLLIYYASNRNSGIPSRFSGSGSSTDFTLTISGVQAEDAGDYYCQSEHNIGSTFSPSWLLTQ")] + # fasta_file = '/Users/kongwenfei/Downloads/part-00016-c849a94d-2f5b-46c6-9765-213535c72f13-c000.txt' + # fasta_file = '/Users/kongwenfei/Downloads/part-00000-9146dde2-26a3-4c27-acc9-72224fed81a5-c000.txt' + results = run_anarci(sequences, scheme="imgt", output=True, outfile="abc", csv=True) + + # sequences = [ ("12e8:H","EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLEWIGWIDPEIGDTEYVPKFQGKATMTADTSSNTAYLQLSSLTSEDTAVYYCNAGHDYDRGRFPYWGQGTLVTVSAAKTTPPSVYPLAP"), + # ("12e8:L","DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKLMIYSASNRYTGVPDRFTGSGSGTDFTLTISNMQSEDLADYFCQQYSSYPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASV"), + # ("scfv:A","DIQMTQSPSSLSASVGDRVTITCRTSGNIHNYLTWYQQKPGKAPQLLIYNAKTLADGVPSRFSGSGSGTQFTLTISSLQPEDFANYYCQHFWSLPFTFGQGTKVEIKRTGGGGSGGGGSGGGGSGGGGSEVQLVESGGGLVQPGGSLRLSCAASGFDFSRYDMSWVRQAPGKRLEWVAYISSGGGSTYFPDTVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARQNKKLTWFDYWGQGTLVTVSSHHHHHH"), + # ("lysozyme:A","KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL")] + # + # results = anarci(sequences, scheme="imgt", output=True) + # numbering, alignment_details, hit_tables = results + # + # expect_one_VH_domain_numbering, expect_one_VL_domain_numbering, expect_VH_then_VL_numbering, expect_None = numbering + # assert len(expect_one_VH_domain_numbering) == 1 + # assert len(expect_one_VL_domain_numbering) == 1 + # assert len(expect_VH_then_VL_numbering) == 2 + # assert expect_None == None diff --git a/lib/python/anarci/schemes.py b/lib/python/anarci/schemes.py index 61f812a..f155361 100644 --- a/lib/python/anarci/schemes.py +++ b/lib/python/anarci/schemes.py @@ -423,7 +423,7 @@ def number_imgt(state_vector, sequence): # maximum length of 65 (13 positions, 26*2 insertions) . In practice ANARCI will not recognise CDR3s of this length. 
cdr3seq = "".join([ x[1] for x in _regions[5] if x[1] != "-" ]) cdr3length = len(cdr3seq) - if cdr3length > 117: return [], startindex, endindex # Too many insertions. Do not apply numbering. + if cdr3length > 117: return [], startindex, endindex # Too many insertions. Do not apply numbering. si = 0 previous_state_id = 104 for ann in get_imgt_cdr(cdr3length, 13, 105, 118): @@ -436,7 +436,7 @@ def number_imgt(state_vector, sequence): si+=1 # Return the full vector and the start and end indices of the numbered region of the sequence - return gap_missing( _numbering ), startindex, endindex + return gap_missing( _numbering ), startindex, endindex, [_numbering[1], _numbering[3], _numbering[5]] def get_imgt_cdr(length, maxlength, start, end): """ @@ -641,7 +641,7 @@ def number_aho(state_vector, sequence, chain_type): # longer than the number of positions. insertions = max( length-18 , 0 ) if insertions > 26: - return [], startindex, endindex # Too many insertions. Do not apply numbering. + return [], startindex, endindex # Too many insertions. Do not apply numbering. elif insertions > 0: # They are placed on residue 36 alphabetically. insertat = annotations.index( (36, ' ') )+1 # Always 12 @@ -677,7 +677,7 @@ def number_aho(state_vector, sequence, chain_type): # Insertions are not described in the AHo scheme but must be included. insertions = max( length-20 , 0 ) if insertions > 26: - return [], startindex, endindex # Too many insertions. Do not apply numbering. + return [], startindex, endindex # Too many insertions. Do not apply numbering. elif insertions > 0: # They are placed on residue 63 alphabetically. insertat = annotations.index( (63, ' ') )+1 # Always 6 @@ -698,7 +698,7 @@ def number_aho(state_vector, sequence, chain_type): # Insertions are not described in the AHo scheme but must be included. insertions = max( length-16 , 0 ) if insertions > 26: - return [], startindex, endindex # Too many insertions. Do not apply numbering. 
+ return [], startindex, endindex # Too many insertions. Do not apply numbering. elif insertions > 0: # They are placed on residue 85 alphabetically. insertat = annotations.index( (85, ' ') )+1 # Always 8 @@ -726,7 +726,7 @@ def number_aho(state_vector, sequence, chain_type): # Insertions are not described in the AHo scheme but must be included. insertions = max( length-32 , 0 ) if insertions > 26: - return [], startindex, endindex # Too many insertions. Do not apply numbering. + return [], startindex, endindex # Too many insertions. Do not apply numbering. elif insertions > 0: # They are placed on residue 123 alphabetically. insertat = annotations.index( (123, ' ') )+1 # Always 17 @@ -860,12 +860,12 @@ def number_chothia_heavy(state_vector, sequence): # Chothia H region 7 (index 6) # put insertions onto 100 length = len( _regions[6] ) - if length > 36: return [], startindex, endindex # Too many insertions. Do not apply numbering. + if length > 36: return [], startindex, endindex # Too many insertions. Do not apply numbering. annotations = get_cdr3_annotations(length, scheme="chothia", chain_type="heavy") _numbering[6] = [ (annotations[i], _regions[6][i][1]) for i in range(length) ] # Return the full vector and the start and end indices of the numbered region of the sequence - return gap_missing( _numbering ), startindex, endindex + return gap_missing( _numbering ), startindex, endindex # Light chains def number_chothia_light(state_vector, sequence): @@ -969,13 +969,12 @@ def number_chothia_light(state_vector, sequence): # put insertions onto 95 length = len( _regions[5] ) - if length > 35: return [], startindex, endindex # Too many insertions. Do not apply numbering. + if length > 35: return [], startindex, endindex # Too many insertions. Do not apply numbering. 
annotations = get_cdr3_annotations(length, scheme="chothia", chain_type="light") _numbering[5] = [ (annotations[i], _regions[5][i][1]) for i in range(length) ] # Return the full vector and the start and end indices of the numbered region of the sequence - - return gap_missing( _numbering ), startindex, endindex + return gap_missing(_numbering), startindex, endindex ######### @@ -1090,12 +1089,12 @@ def number_kabat_heavy(state_vector, sequence): # Chothia H region 7 (index 6) # put insertions onto 100 length = len( _regions[6] ) - if length > 36: return [], startindex, endindex # Too many insertions. Do not apply numbering. + if length > 36: return [], startindex, endindex # Too many insertions. Do not apply numbering. annotations = get_cdr3_annotations(length, scheme="kabat", chain_type="heavy") # Chothia and Kabat the same here _numbering[6] = [ (annotations[i], _regions[6][i][1]) for i in range(length) ] # Return the full vector and the start and end indices of the numbered region of the sequence - return gap_missing( _numbering ), startindex, endindex + return gap_missing( _numbering ), startindex, endindex # Light chains def number_kabat_light(state_vector, sequence): @@ -1192,7 +1191,7 @@ def number_kabat_light(state_vector, sequence): annotations = get_cdr3_annotations(length, scheme="kabat", chain_type="light") _numbering[5] = [ (annotations[i], _regions[5][i][1]) for i in range(length) ] - return gap_missing( _numbering ), startindex, endindex + return gap_missing( _numbering ), startindex, endindex @@ -1321,12 +1320,12 @@ def number_martin_heavy(state_vector, sequence): # Chothia H region 7 (index 6) # put insertions onto 100 length = len( _regions[6] ) - if length > 36: return [], startindex, endindex # Too many insertions. Do not apply numbering. + if length > 36: return [], startindex, endindex # Too many insertions. Do not apply numbering. 
annotations = get_cdr3_annotations(length, scheme="chothia", chain_type="heavy") _numbering[6] = [ (annotations[i], _regions[6][i][1]) for i in range(length) ] # Return the full vector and the start and end indices of the numbered region of the sequence - return gap_missing( _numbering ), startindex, endindex + return gap_missing( _numbering ), startindex, endindex # Light chains def number_martin_light(state_vector, sequence): @@ -1465,7 +1464,7 @@ def number_wolfguy_heavy(state_vector, sequence): _numbering[5] = [ ((annotations[i]," "), _regions[5][i][1]) for i in range(length) ] # Return the full vector and the start and end indices of the numbered region of the sequence - return sum( _numbering, [] ), startindex, endindex + return sum( _numbering, [] ), startindex, endindex def number_wolfguy_light(state_vector, sequence): @@ -1570,7 +1569,7 @@ def number_wolfguy_light(state_vector, sequence): _numbering[9] = [ ((annotations[i]," "), _regions[9][i][1]) for i in range(length) ] # Return the full vector and the start and end indices of the numbered region of the sequence - return sum( _numbering, [] ), startindex, endindex + return sum( _numbering, [] ), startindex, endindex def _get_wolfguy_L1(seq, length): @@ -1634,7 +1633,6 @@ def gap_missing( numbering ): num.append( (p,a) ) return num[1:] - ###################### # Annotation of CDR3 # ###################### diff --git a/static_setup.py b/static_setup.py new file mode 100644 index 0000000..3ee31b8 --- /dev/null +++ b/static_setup.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +import shutil, os +if os.path.isdir("build"): + shutil.rmtree("build/") + +from distutils.core import setup + +setup(name='anarci', + version='1.3', + description='Antibody Numbering and Receptor ClassIfication', + author='James Dunbar', + author_email='opig@stats.ox.ac.uk', + url='http://opig.stats.ox.ac.uk/webapps/ANARCI', + packages=['anarci'], + package_dir={'anarci': 'lib/python/anarci'}, + package_data={'anarci': 
['dat/HMMs/ALL.hmm', + 'dat/HMMs/ALL.hmm.h3f', + 'dat/HMMs/ALL.hmm.h3i', + 'dat/HMMs/ALL.hmm.h3m', + 'dat/HMMs/ALL.hmm.h3p']}, + scripts=['bin/ANARCI'], + data_files = [ ('bin', ['bin/muscle', 'bin/muscle_macOS']) ] + ) \ No newline at end of file