-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenbank_analyzer_procedural.py
More file actions
251 lines (197 loc) · 9.15 KB
/
genbank_analyzer_procedural.py
File metadata and controls
251 lines (197 loc) · 9.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import sys
from pathlib import Path
from typing import Optional
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
def parse_genbank_sequence(file_path: str) -> tuple[str, int]:
"""
Parse a GenBank format file and extract the sequence using BioPython.
Args:
file_path (str): Path to the GenBank format file
Returns:
tuple: (sequence, length) where sequence is the DNA/RNA sequence
and length is the number of nucleotides
"""
try:
record = SeqIO.read(file_path, "genbank")
sequence = str(record.seq).lower()
return sequence, len(sequence)
except FileNotFoundError:
raise FileNotFoundError(f"File not found: {file_path}")
except Exception as e:
raise Exception(f"Error reading GenBank file {file_path}: {e}")
def get_genbank_info(file_path: str) -> dict:
"""
Extract basic information from GenBank file header using BioPython.
Args:
file_path (str): Path to the GenBank format file
Returns:
dict: Dictionary with basic file information
"""
info = {}
try:
record: SeqRecord = SeqIO.read(file_path, "genbank")
# Basic identifiers
info['locus'] = record.name if record.name else record.id
info['accession'] = record.id
info['definition'] = record.description
info['declared_length'] = len(record.seq)
# Extract source organism from annotations
if 'source' in record.annotations:
info['source'] = record.annotations['source']
elif 'organism' in record.annotations:
info['source'] = record.annotations['organism']
# Extract additional useful information
if 'date' in record.annotations:
info['date'] = record.annotations['date']
if 'molecule_type' in record.annotations:
info['molecule_type'] = record.annotations['molecule_type']
if 'topology' in record.annotations:
info['topology'] = record.annotations['topology']
except Exception as e:
print(f"Warning: Could not extract header information: {e}")
return info
def get_genbank_features(file_path: str) -> list:
"""
Extract feature information from GenBank file using BioPython.
Args:
file_path (str): Path to the GenBank format file
Returns:
list: List of feature dictionaries with type, location, and qualifiers
"""
features = []
try:
record: SeqRecord = SeqIO.read(file_path, "genbank")
for feature in record.features:
feature_info = {
'type': feature.type,
'location': str(feature.location),
'strand': getattr(feature.location, 'strand', None)
}
# Extract common qualifiers
if feature.qualifiers:
for key in ['gene', 'product', 'protein_id', 'translation', 'note']:
if key in feature.qualifiers:
feature_info[key] = feature.qualifiers[key][0] # Take first value
features.append(feature_info)
except Exception as e:
print(f"Warning: Could not extract feature information: {e}")
return features
def main() -> None:
try:
print("GenBank Sequence Length Analyzer")
print("-" * 32)
print("This program reads GenBank format files and analyzes the sequence length.\n")
# Get file path from user or command line argument
if len(sys.argv) > 1:
file_path = sys.argv[1]
else:
file_path = input("Enter the path to the GenBank file: ").strip()
# Validate file exists
if not Path(file_path).exists():
print(f"Error: File '{file_path}' does not exist.")
return
# Validate file extension (optional warning)
if not file_path.lower().endswith(('.gb', '.gbk', '.genbank')):
print(f"Warning: File '{file_path}' doesn't have a typical GenBank extension (.gb, .gbk, .genbank)")
print(f"Analyzing file: {file_path}")
print("-" * 50)
# Extract header information
info = get_genbank_info(file_path)
if info:
print("File Information:")
if 'locus' in info:
print(f" Locus: {info['locus']}")
if 'accession' in info:
print(f" Accession: {info['accession']}")
if 'source' in info:
print(f" Source: {info['source']}")
if 'definition' in info:
print(f" Definition: {info['definition']}")
if 'declared_length' in info:
print(f" Declared length: {info['declared_length']} bp")
print()
# Parse the sequence
sequence, actual_length = parse_genbank_sequence(file_path)
if not sequence:
print("Warning: No sequence data found in the file.")
return
# Display results
print("Sequence Analysis:")
print(f" Actual sequence length: {actual_length} nucleotides")
# Verify against declared length if available
if info.get('declared_length'):
declared_length = info['declared_length']
if actual_length == declared_length:
print(f" ✓ Length matches declared length ({declared_length} bp)")
else:
print(f" ⚠ Length mismatch! Declared: {declared_length} bp, Actual: {actual_length} bp")
# Show sequence composition
if sequence:
# Count all possible nucleotides including ambiguous ones
nucleotides = ['a', 't', 'g', 'c', 'u', 'n'] # Include U for RNA
composition = {}
for nuc in nucleotides:
count = sequence.count(nuc)
if count > 0: # Only show nucleotides that are present
composition[nuc] = count
# Count any other characters (ambiguous nucleotides)
total_counted = sum(composition.values())
if total_counted < actual_length:
other_count = actual_length - total_counted
composition['other'] = other_count
print(f"\nNucleotide Composition:")
for nucleotide, count in composition.items():
percentage = (count / actual_length) * 100 if actual_length > 0 else 0
print(f" {nucleotide.upper()}: {count:,} ({percentage:.1f}%)")
# Calculate GC content (works for both DNA and RNA)
gc_count = composition.get('g', 0) + composition.get('c', 0)
gc_content = (gc_count / actual_length) * 100 if actual_length > 0 else 0
print(f"\nGC Content: {gc_content:.1f}%")
# Show features information
features = get_genbank_features(file_path)
if features:
print(f"\nFeatures Found ({len(features)} total):")
# Group features by type for better display
feature_types = {}
for feature in features:
feat_type = feature['type']
if feat_type not in feature_types:
feature_types[feat_type] = []
feature_types[feat_type].append(feature)
# Display feature summary
for feat_type, feat_list in feature_types.items():
print(f" {feat_type}: {len(feat_list)}")
# Show detailed information for important features
important_features = ['CDS', 'gene', 'mRNA', 'tRNA', 'rRNA']
shown_features = 0
max_features_to_show = 5
print(f"\nDetailed Feature Information (showing up to {max_features_to_show}):")
for feature in features:
if shown_features >= max_features_to_show:
remaining = len(features) - shown_features
if remaining > 0:
print(f" ... and {remaining} more features")
break
if feature['type'] in important_features or shown_features < 3:
print(f" {feature['type']} at {feature['location']}")
if 'gene' in feature:
print(f" Gene: {feature['gene']}")
if 'product' in feature:
print(f" Product: {feature['product']}")
shown_features += 1
# Show sequence preview
if len(sequence) > 0:
preview_length = min(60, len(sequence))
print(f"\nSequence Preview (first {preview_length} nucleotides):")
print(f" {sequence[:preview_length].upper()}")
if len(sequence) > preview_length:
print(" ...")
except FileNotFoundError as e:
print(f"Error: {e}")
except Exception as e:
print(f"Error: {e}")
except KeyboardInterrupt:
print("\nProgram interrupted by user.")
if __name__ == "__main__":
main()