-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1_extraction.py
More file actions
56 lines (48 loc) · 2.2 KB
/
1_extraction.py
File metadata and controls
56 lines (48 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3
"""
extraction.py
-------------
Extract antigenic sites from an MSA FASTA file.
Usage:
python3 extraction.py --msa_file <MSA.fasta> --sites_file <sites.json> --output_dir <out_dir> --label <db|influx>
The JSON file should contain a dictionary mapping antigenic site names to lists of [start, end] pairs.
Example JSON:
{
"Site1": [[50,60], [70,75]],
"Site2": [[10,20]]
}
"""
import os
import json
import argparse
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def extract_multiple_sites(msa_file, sites_dict, output_dir, label):
records = list(SeqIO.parse(msa_file, "fasta"))
for site_name, antigenic_ranges in sites_dict.items():
site_folder = os.path.join(output_dir, site_name)
os.makedirs(site_folder, exist_ok=True)
output_file = os.path.join(site_folder, f"{label}_{site_name}_extracted.fasta")
with open(output_file, "w") as out_handle:
for record in records:
extracted_seq_str = ""
for (start, end) in antigenic_ranges:
extracted_seq_str += str(record.seq[start-1:end])
new_record = SeqRecord(Seq(extracted_seq_str),
id=record.id,
description=f"{site_name} extracted region")
SeqIO.write(new_record, out_handle, "fasta")
print(f"[INFO] {label}: Site '{site_name}' extracted to {output_file}")
def main():
parser = argparse.ArgumentParser(description="Extract antigenic sites from an MSA file")
parser.add_argument("--msa_file", required=True, help="Path to the MSA FASTA file")
parser.add_argument("--sites_file", required=True, help="Path to the JSON file with antigenic site definitions")
parser.add_argument("--output_dir", required=True, help="Output directory for extracted sequences")
parser.add_argument("--label", required=True, help="Label for the sequences (e.g., db or influx)")
args = parser.parse_args()
with open(args.sites_file, "r") as f:
sites_dict = json.load(f)
extract_multiple_sites(args.msa_file, sites_dict, args.output_dir, args.label)
if __name__ == "__main__":
main()