-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprotprep.py
More file actions
2134 lines (1841 loc) · 89.4 KB
/
protprep.py
File metadata and controls
2134 lines (1841 loc) · 89.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
# PYTHON_ARGCOMPLETE_OK
"""
ProtPrep — Protein Preparation Pipeline
========================================
Inspired by the Schrödinger Protein Preparation Wizard.
Pipeline
--------
1. Fetch — download from RCSB (optional)
2. Clean — chain selection, altloc resolution, water / HETATM removal
3. Fix — missing residues, heavy atoms, selenomethionine → Met
4. Cap — ACE / NME terminus capping (optional)
5. Protonate — pH-aware H addition (PDB2PQR + PROPKA, or PDBFixer)
6. Minimize — restrained vacuum minimization (OpenMM, optional)
Outputs
-------
<stem>_prepared.pdb protonated structure (docking / scoring)
<stem>_minimized.pdb minimized structure (MD / GB-SA, --minimize)
<stem>_prep.log full preparation log
Dependencies
------------
Required: biopython, pdbfixer, openmm
Protonation: pdb2pqr (conda install -c conda-forge pdb2pqr)
Rich output: rich (pip install rich) — highly recommended
CLI extras: rich_argparse, argcomplete — optional
Written by Claude Sonnet 4.6, 2026-02-25
"""
import argparse
import io as _io
import re
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Dict, List, Optional, Tuple
try:
import numpy as np
_NUMPY = True
except ImportError:
np = None # type: ignore
_NUMPY = False
# ─── Optional imports ────────────────────────────────────────────────────────
try:
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from rich.text import Text
from rich import box
from rich.rule import Rule
_RICH = True
except ImportError:
_RICH = False
try:
from rich_argparse import RawDescriptionRichHelpFormatter as _HelpFmt
except ImportError:
from argparse import RawDescriptionHelpFormatter as _HelpFmt # type: ignore
try:
import argcomplete
from argcomplete.completers import FilesCompleter
_ARGCOMPLETE = True
except ImportError:
_ARGCOMPLETE = False
try:
from Bio.PDB import PDBParser, PDBIO, Select, NeighborSearch
from Bio.PDB.vectors import Vector
_BIOPYTHON = True
except ImportError:
_BIOPYTHON = False
Select = object
try:
from pdbfixer import PDBFixer
from openmm.app import PDBFile as OmmPDBFile
_PDBFIXER = True
except ImportError:
_PDBFIXER = False
try:
import openmm
import openmm.app as omm_app
import openmm.unit as unit
_OPENMM = True
except ImportError:
_OPENMM = False
# ─── Console / logging setup ─────────────────────────────────────────────────
# Every print helper mirrors a plain-text copy of its output into _log_lines,
# which the caller writes to the <stem>_prep.log file at the end of the run.
# Both branches define the same helper API: _print/_rule/_header/_step/_ok/
# _warn/_info/_err — rich-styled when rich is importable, plain print() else.
_log_lines: List[str] = []
if _RICH:
    console = Console(highlight=False)
    def _print(msg: str = "", style: str = ""):
        # Log a de-markup'd copy: strip rich [tag]/[/tag] sequences.
        console.print(msg, style=style)
        _log_lines.append(re.sub(r'\[/?[^\]]*\]', '', msg))
    def _rule(title: str = ""):
        # Horizontal rule with optional title.
        console.rule(f"[bold]{title}[/bold]" if title else "")
        _log_lines.append(f"{'─' * 60} {title}")
    def _header(title: str, subtitle: str = ""):
        # Banner panel announcing a major pipeline section.
        console.print(Panel(
            f"[bold white]{title}[/bold white]\n[dim]{subtitle}[/dim]" if subtitle else
            f"[bold white]{title}[/bold white]",
            style="bold blue", expand=False, padding=(0, 2)))
        _log_lines.append(f"\n{'═' * 60}")
        _log_lines.append(f" {title}")
        if subtitle:
            _log_lines.append(f" {subtitle}")
        _log_lines.append(f"{'═' * 60}")
    def _step(n: int, total: int, label: str):
        # "Step n/total" progress heading.
        console.print(f"\n[bold cyan]┌─ Step {n}/{total}: {label}[/bold cyan]")
        _log_lines.append(f"\n[Step {n}/{total}] {label}")
    def _ok(msg: str):
        console.print(f"[green]│ ✓ {msg}[/green]")
        _log_lines.append(f" ✓ {msg}")
    def _warn(msg: str):
        console.print(f"[yellow]│ ⚠ {msg}[/yellow]")
        _log_lines.append(f" ⚠ {msg}")
    def _info(msg: str):
        console.print(f"[dim]│ {msg}[/dim]")
        _log_lines.append(f" {msg}")
    def _err(msg: str):
        console.print(f"[bold red]│ ✗ {msg}[/bold red]")
        _log_lines.append(f" ✗ {msg}")
else:
    # Plain-print fallbacks when rich is unavailable; identical signatures.
    def _print(msg: str = "", style: str = ""):
        print(msg); _log_lines.append(msg)
    def _rule(title: str = ""):
        print(f"{'─' * 60} {title}" if title else "─" * 60)
        _log_lines.append(f"{'─' * 60} {title}")
    def _header(title: str, subtitle: str = ""):
        bar = "=" * 60
        print(f"\n{bar}\n {title}\n{bar}")
        _log_lines.extend([f"\n{bar}", f" {title}", bar])
    def _step(n: int, total: int, label: str):
        msg = f"\n[Step {n}/{total}] {label}"
        print(msg); _log_lines.append(msg)
    def _ok(msg: str): line = f" ✓ {msg}"; print(line); _log_lines.append(line)
    def _warn(msg: str): line = f" ⚠ {msg}"; print(line); _log_lines.append(line)
    def _info(msg: str): line = f" {msg}"; print(line); _log_lines.append(line)
    def _err(msg: str): line = f" ✗ {msg}"; print(line); _log_lines.append(line)
def _fatal(msg: str):
    # Print an error line and abort the whole run with exit status 1.
    _err(msg)
    sys.exit(1)
def _fmt_time(s: float) -> str:
if s < 60: return f"{s:.1f} s"
if s < 3600: return f"{s/60:.1f} min"
return f"{s/3600:.1f} h"
# ─── BioPython helpers ────────────────────────────────────────────────────────
# Standard amino acid residue names, including force-field protonation
# variants (AMBER-style HID/HIE/HIP etc. and CHARMM-style HSD/HSE/HSP —
# see _normalize_his_names below).
_STD_AA = {
    'ALA','ARG','ASN','ASP','CYS','GLN','GLU','GLY','HIS','ILE',
    'LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL',
    # protonation variants
    'HID','HIE','HIP','HSD','HSE','HSP','CYX','ASH','GLH','LYN',
    'MSE', # selenomethionine — treated as standard
}
# Titratable residues and their typical pKa ranges (lo, hi) — used only for
# the informational report in step_clean, not for assigning states.
_TITRATABLE = {
    'ASP': (3.5, 4.5), 'GLU': (4.0, 5.0),
    'HIS': (6.0, 7.0), 'LYS': (10.0, 11.0),
    'ARG': (12.0, 13.5), 'TYR': (9.5, 10.5),
    'CYS': (8.0, 9.5),
}
# Disulfide bond SG-SG distance threshold (Å)
_SSBOND_DIST = 2.5
# Backbone heavy atoms — restrained during minimization by default
_BACKBONE_ATOMS = frozenset({'N', 'CA', 'C', 'O', 'OXT'})
# Van der Waals radii (nm) used for the frozen-ligand repulsion model.
# Values are approximate Bondi radii; unmapped elements fall back to carbon.
_VDW_RADII_NM: Dict[str, float] = {
'C': 0.170, 'N': 0.155, 'O': 0.152, 'S': 0.180,
'P': 0.180, 'F': 0.147, 'CL': 0.175, 'BR': 0.185,
'I': 0.198, 'ZN': 0.139, 'MG': 0.130, 'CA': 0.197,
'B': 0.191,
'FE': 0.156, 'MN': 0.161, 'CU': 0.140, 'NA': 0.227,
}
_DEFAULT_VDW_NM = 0.170
def _element_vdw(element: str) -> float:
return _VDW_RADII_NM.get(element.upper(), _DEFAULT_VDW_NM)
def _rotate_around_axis(pos, p1, p2):
"""Rotate *pos* (numpy array) by 180° around the axis p1→p2.
Uses the 180° special case of Rodrigues' rotation formula:
R₁₈₀(v) = 2(v·n̂)n̂ − v
where v = pos − p1 and n̂ = normalise(p2 − p1).
"""
v = pos - p1
n = p2 - p1
n = n / np.linalg.norm(n)
return p1 + 2.0 * np.dot(v, n) * n - v
class _ChainSelector(Select):
    """Keep selected chains, resolve altlocs, remove waters and unwanted HETATM.

    Altloc resolution strategy:
      • Prefer altloc ' ' (no disorder) or 'A'.
      • If only higher-letter conformers exist for an atom (e.g. only 'B'),
        accept the one present rather than dropping the atom entirely.
    A pre-scan of the full structure is used to determine which altlocs exist
    for each (chain, residue, atom_name) triple.
    """
    def __init__(self, chains=None, keep_resnames=None, structure=None):
        # chains: iterable of chain IDs to keep; None keeps every chain.
        self.chains = set(chains) if chains else None
        # keep_resnames entries are either bare residue names ('HEM') or
        # specific instances spelled 'NAME/CHAIN:SEQ' ('HEM/A:201').
        self.keep_res: set = set()
        self.keep_res_instances: set = set() # (resname, chain_id, seqid) tuples
        for r in (keep_resnames or []):
            r = r.strip().upper()
            if '/' in r:
                resname, loc = r.split('/', 1)
                if ':' in loc:
                    ch, resnum_str = loc.split(':', 1)
                    try:
                        self.keep_res_instances.add((resname, ch, int(resnum_str)))
                        continue
                    except ValueError:
                        pass
                # Malformed locator — fall back to keeping every instance
                # of the residue name.
                self.keep_res.add(resname)
            else:
                self.keep_res.add(r)
        # Chains that contain a keep_res residue — accepted even when not in
        # the user-selected chain list (handles ligand-only chains, e.g. K8Q
        # living in chains F–J while the user keeps only chain A).
        self._keep_res_chains: set = set()
        # (chain_id, res_full_id, atom_name) → set of altloc chars present
        self._atom_altlocs: Dict[tuple, set] = {}
        if structure is not None:
            self._scan_altlocs(structure)
            if self.chains is not None and (self.keep_res or self.keep_res_instances):
                self._scan_keep_res_chains(structure)
    def _scan_altlocs(self, structure):
        """Pre-scan: record every altloc that exists for every atom."""
        for model in structure:
            for chain in model:
                cid = chain.get_id()
                for residue in chain:
                    rid = residue.get_id()
                    # get_unpacked_list() yields every disordered conformer,
                    # not just the currently selected one.
                    for atom in residue.get_unpacked_list():
                        key = (cid, rid, atom.get_name())
                        self._atom_altlocs.setdefault(key, set()).add(
                            atom.get_altloc()
                        )
    def _scan_keep_res_chains(self, structure):
        """Pre-scan: find which chains contain any keep_res residue."""
        for model in structure:
            for chain in model:
                cid = chain.get_id()
                for residue in chain:
                    resname = residue.get_resname().strip().upper()
                    seqid = residue.get_id()[1]
                    if (resname in self.keep_res or
                            (resname, cid, seqid) in self.keep_res_instances):
                        self._keep_res_chains.add(cid)
    def accept_chain(self, chain):
        # Keep user-selected chains plus any chain carrying a kept ligand.
        if self.chains is None:
            return True
        return chain.get_id() in self.chains or chain.get_id() in self._keep_res_chains
    def accept_residue(self, residue):
        # Hetflag ' ' → standard residue, 'W' → water, else HETATM group.
        hetflag, seqid, _ = residue.get_id()
        if hetflag == ' ': return True
        if hetflag == 'W': return False
        resname = residue.get_resname().strip().upper()
        if resname in self.keep_res:
            return True
        chain_id = residue.get_parent().get_id()
        return (resname, chain_id, seqid) in self.keep_res_instances
    def accept_atom(self, atom):
        altloc = atom.get_altloc()
        if altloc in (' ', 'A'):
            return True
        # Non-A altloc: accept only when no preferred conformer (' ' or 'A')
        # exists for this specific atom, so we never drop an atom entirely.
        if not self._atom_altlocs:
            return False
        key = (
            atom.get_parent().get_parent().get_id(),
            atom.get_parent().get_id(),
            atom.get_name(),
        )
        existing = self._atom_altlocs.get(key, set())
        return not (existing & {' ', 'A'})
def _inspect(pdb_path: Path) -> dict:
    """Extract structural statistics from a PDB file.

    Returns a dict with: chain IDs and per-chain standard-residue counts,
    counts of standard/water/HETATM residues, MSE and altloc occurrences,
    detected disulfide bonds, titratable-residue counts per type, and the
    total atom count. Only the first model of the structure is inspected.
    """
    parser = PDBParser(QUIET=True)
    s = parser.get_structure('p', str(pdb_path))[0]
    chains = list(s.get_chains())
    residues = list(s.get_residues())
    atoms = list(s.get_atoms())
    # Hetflag ' ' → standard residue, 'W' → water, anything else → HETATM.
    std_res = [r for r in residues if r.get_id()[0] == ' ']
    het_res = [r for r in residues if r.get_id()[0] not in (' ', 'W')]
    water = [r for r in residues if r.get_id()[0] == 'W']
    mse = [r for r in std_res if r.get_resname().strip() == 'MSE']
    altlocs = [a for a in atoms if a.get_altloc() not in (' ', 'A')]
    # Chain-level residue counts
    chain_info = {}
    for ch in chains:
        std = [r for r in ch.get_residues() if r.get_id()[0] == ' ']
        chain_info[ch.get_id()] = len(std)
    # Disulfide bond detection: SG–SG pairs within _SSBOND_DIST Å.
    cys_sg = []
    for r in std_res:
        if r.get_resname().strip() in ('CYS', 'CYX') and 'SG' in r:
            cys_sg.append(r['SG'])
    ssbonds = []
    if len(cys_sg) >= 2:
        ns = NeighborSearch(cys_sg)
        seen = set()
        for sg in cys_sg:
            neighbors = ns.search(sg.get_vector().get_array(), _SSBOND_DIST, 'A')
            for nb in neighbors:
                if nb is not sg:
                    # Deduplicate unordered pairs by object identity.
                    pair = tuple(sorted([id(sg), id(nb)]))
                    if pair not in seen:
                        seen.add(pair)
                        r1 = sg.get_parent()
                        r2 = nb.get_parent()
                        ssbonds.append((
                            r1.get_parent().get_id(),
                            r1.get_id()[1],
                            r2.get_parent().get_id(),
                            r2.get_id()[1],
                        ))
    # Titratable residue count per type
    titr_counts: Dict[str, int] = {}
    for r in std_res:
        name = r.get_resname().strip()
        if name in _TITRATABLE:
            titr_counts[name] = titr_counts.get(name, 0) + 1
    # HETATM instances keyed 'NAME/CHAIN:SEQ' → list of 'CHAIN:SEQ' locations.
    het_groups = {}
    for r in het_res:
        name = r.get_resname().strip()
        ch = r.get_parent().get_id()
        seqid = r.get_id()[1]
        key = f"{name}/{ch}:{seqid}"
        het_groups.setdefault(key, []).append(f"{ch}:{seqid}")
    return {
        'chains': [c.get_id() for c in chains],
        'chain_info': chain_info,
        'n_std': len(std_res),
        'n_water': len(water),
        'n_het': len(het_res),
        'n_mse': len(mse),
        'het_groups': het_groups,
        'n_altloc': len(altlocs),
        'ssbonds': ssbonds,
        'titr_counts': titr_counts,
        'n_atoms': len(atoms),
    }
# ─── Gap handling ─────────────────────────────────────────────────────────────
def _insert_ter_at_gaps(pdb_in: Path, pdb_out: Path, gap_threshold: int = 2,
rename_nterm_h: bool = False) -> int:
"""Insert TER records where residue numbers jump within the same chain.
Without TER records at sequence gaps, pdb2pqr and OpenMM/PDBFixer treat
structurally disconnected segments as covalently bonded across the gap.
When *rename_nterm_h* is True the backbone H atom of the first residue
immediately after each gap is renamed to H1. pdb2pqr protonates gap
N-terminal residues as mid-chain (outputting 'H' instead of 'H1/H2/H3');
renaming H→H1 lets OpenMM's addHydrogens() recognise the residue as
N-terminal and supply the missing H2/H3 with idealized geometry.
"""
lines = pdb_in.read_text().splitlines(keepends=True)
out_lines = []
prev_chain: Optional[str] = None
prev_resseq: Optional[int] = None
n_inserted = 0
_nterm_chain: Optional[str] = None # chain of gap-N-terminal residue
_nterm_resseq: Optional[int] = None # resseq of gap-N-terminal residue
for line in lines:
record = line[:6].strip()
if record in ('ATOM', 'HETATM'):
chain = line[21]
try:
resseq = int(line[22:26])
except ValueError:
out_lines.append(line)
continue
if (prev_chain is not None
and chain == prev_chain
and resseq - prev_resseq >= gap_threshold):
out_lines.append('TER\n')
n_inserted += 1
if rename_nterm_h:
_nterm_chain = chain
_nterm_resseq = resseq
# Rename backbone H → H1 on the first residue after a gap
if (rename_nterm_h
and chain == _nterm_chain
and resseq == _nterm_resseq
and record == 'ATOM'
and line[12:16].strip() == 'H'):
line = line[:12] + ' H1 ' + line[16:]
_nterm_chain = _nterm_resseq = None # one rename per gap
prev_chain = chain
prev_resseq = resseq
out_lines.append(line)
pdb_out.write_text(''.join(out_lines))
return n_inserted
# ─── Pipeline steps ───────────────────────────────────────────────────────────
def step_fetch(pdb_id: str, output_path: Path, assembly: Optional[int] = None):
    """Download a PDB entry from RCSB.

    When *assembly* is given (e.g. 1), the biological assembly PDB is fetched.
    RCSB provides assemblies in PDB format for most entries; for those that are
    CIF-only the mmCIF file is downloaded and converted to PDB via BioPython.

    All fetch failures are reported through _fatal (which exits the program).
    Fix: the assembly-PDB download previously caught only HTTPError, so a
    plain URLError (DNS failure, refused connection, timeout) escaped with a
    raw traceback while every other path here reported it cleanly.
    """
    pdb_id = pdb_id.upper()
    if assembly:
        pdb_url = (f'https://files.rcsb.org/download/'
                   f'{pdb_id}-assembly{assembly}.pdb')
        cif_url = (f'https://files.rcsb.org/download/'
                   f'{pdb_id}-assembly{assembly}.cif')
        _info(f"Biological assembly: {assembly}")
        _info(f"URL (PDB): {pdb_url}")
        try:
            urllib.request.urlretrieve(pdb_url, str(output_path))
            return
        except urllib.error.HTTPError as e:
            # 404 means the assembly simply isn't offered in PDB format;
            # anything else is a genuine failure.
            if e.code != 404:
                _fatal(f"Failed to fetch {pdb_id} assembly {assembly}: "
                       f"HTTP {e.code}")
            _info("PDB format unavailable for this assembly — trying mmCIF …")
        except urllib.error.URLError as e:
            _fatal(f"Network error: {e.reason}")
        # Fallback: download mmCIF and convert to PDB with BioPython
        _info(f"URL (mmCIF): {cif_url}")
        cif_tmp = output_path.with_suffix('.cif')
        try:
            urllib.request.urlretrieve(cif_url, str(cif_tmp))
        except urllib.error.HTTPError as e:
            _fatal(f"Failed to fetch {pdb_id} assembly {assembly}: HTTP {e.code}")
        except urllib.error.URLError as e:
            _fatal(f"Network error: {e.reason}")
        if not _BIOPYTHON:
            _fatal("BioPython is required to convert mmCIF → PDB. "
                   "pip install biopython")
        try:
            from Bio.PDB import MMCIFParser as _CIFParser
            cif_struct = _CIFParser(QUIET=True).get_structure(pdb_id, str(cif_tmp))
            # PDB format allows only single-character chain IDs. Biological
            # assemblies often have multi-char IDs like 'A-2'; remap them.
            _CHAIN_CHARS = (
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz'
                '0123456789'
            )
            chain_map: dict = {}
            for model in cif_struct:
                # Collect IDs already valid (single-char) — reserved.
                used = {ch.id for ch in model if len(ch.id) == 1}
                available = [c for c in _CHAIN_CHARS if c not in used]
                avail_iter = iter(available)
                for chain in model:
                    cid = chain.id
                    if len(cid) > 1:
                        if cid not in chain_map:
                            try:
                                chain_map[cid] = next(avail_iter)
                            except StopIteration:
                                _fatal("Too many chains to remap to single-char PDB IDs")
                        chain.id = chain_map[cid]
            if chain_map:
                _info(f"Remapped {len(chain_map)} long chain ID(s) to single chars: "
                      + ', '.join(f'{k}→{v}' for k, v in chain_map.items()))
            bio_io = PDBIO()
            bio_io.set_structure(cif_struct)
            bio_io.save(str(output_path))
            cif_tmp.unlink(missing_ok=True)
            _info("mmCIF → PDB conversion done")
        except Exception as ex:
            _fatal(f"mmCIF → PDB conversion failed: {ex}")
    else:
        url = f'https://files.rcsb.org/download/{pdb_id}.pdb'
        _info(f"URL: {url}")
        try:
            urllib.request.urlretrieve(url, str(output_path))
        except urllib.error.HTTPError as e:
            _fatal(f"Failed to fetch {pdb_id}: HTTP {e.code}")
        except urllib.error.URLError as e:
            _fatal(f"Network error: {e.reason}")
def step_clean(input_pdb: Path, output_pdb: Path,
               chains=None, keep_het=None) -> dict:
    """Inspect *input_pdb*, report its contents, and write a cleaned copy.

    Cleaning (via _ChainSelector) keeps the selected chains, resolves
    altlocs, drops waters, and removes HETATM groups except those listed in
    *keep_het* — either a bare residue name ('HEM') or a specific instance
    ('HEM/A:201'). Returns the inspection dict from _inspect().

    Fix: the [keep]/[remove] tag previously compared the full het_groups key
    ('HEM/A:201') against keep_het, which usually holds bare residue names,
    so kept groups were mislabelled [remove]. The tag now matches either the
    bare residue name or the full instance spec, mirroring _ChainSelector.
    """
    if not _BIOPYTHON:
        _fatal("BioPython is required: pip install biopython")
    info = _inspect(input_pdb)
    _info(f"Chains found: {', '.join(info['chains'])}")
    for ch, n in info['chain_info'].items():
        _info(f" Chain {ch}: {n} residues")
    _info(f"Total standard res: {info['n_std']}")
    _info(f"Water molecules: {info['n_water']}")
    _info(f"HETATM groups: {info['n_het']}")
    if info['het_groups']:
        keep_set = {h.strip().upper() for h in (keep_het or [])}
        for name, locs in info['het_groups'].items():
            # Match bare resname ('HEM') or full instance spec ('HEM/A:201').
            base = name.split('/', 1)[0].upper()
            kept = name.upper() in keep_set or base in keep_set
            tag = '[keep]' if kept else '[remove]'
            _info(f" {name:<6} {tag:<8} at {', '.join(locs[:4])}"
                  + (' …' if len(locs) > 4 else ''))
    if info['n_mse']:
        _warn(f"Selenomethionine (MSE): {info['n_mse']} residues — will be converted to MET by PDBFixer")
    if info['n_altloc']:
        _warn(f"Alternate locations: {info['n_altloc']} atoms — "
              "keeping conformer A (fallback: highest-occupancy conformer)")
    if info['ssbonds']:
        _ok(f"Disulfide bonds detected: {len(info['ssbonds'])}")
        for c1, r1, c2, r2 in info['ssbonds']:
            _info(f" CYS {c1}:{r1} — CYS {c2}:{r2}")
    else:
        _info("No disulfide bonds detected")
    if info['titr_counts']:
        _info("Titratable residues:")
        for res, count in sorted(info['titr_counts'].items()):
            lo, hi = _TITRATABLE[res]
            _info(f" {res}: {count} (typical pKa {lo}–{hi})")
    if chains:
        missing = [c for c in chains if c not in info['chains']]
        if missing:
            _fatal(f"Chain(s) not found: {', '.join(missing)}. "
                   f"Available: {', '.join(info['chains'])}")
    parser = PDBParser(QUIET=True)
    struct = parser.get_structure('p', str(input_pdb))
    pdb_io = PDBIO()
    pdb_io.set_structure(struct)
    pdb_io.save(str(output_pdb),
                _ChainSelector(chains=chains, keep_resnames=keep_het,
                               structure=struct))
    return info
def step_fix(input_pdb: Path, output_pdb: Path,
             convert_mse: bool = True,
             replace_nonstandard: bool = True) -> dict:
    """Repair the model with PDBFixer: missing residues/atoms, MSE → MET.

    Writes the fixed structure to *output_pdb* (original IDs preserved) and
    returns counts of missing residues/atoms plus the nonstandard-residue
    replacement list.

    NOTE(review): *convert_mse* is currently unused — MSE conversion happens
    implicitly via replaceNonstandardResidues(); confirm before removing it.
    """
    if not _PDBFIXER:
        _fatal("PDBFixer is required: pip install pdbfixer")
    fixer = PDBFixer(filename=str(input_pdb))
    # MSE → MET is handled automatically by PDBFixer's replaceNonstandardResidues
    fixer.findMissingResidues()
    n_miss_res = sum(len(v) for v in fixer.missingResidues.values())
    fixer.findNonstandardResidues()
    # nonstandardResidues holds (Residue, replacement_name) pairs.
    nonstandard = [(r.name, s) for r, s in fixer.nonstandardResidues]
    if replace_nonstandard:
        fixer.replaceNonstandardResidues()
    fixer.findMissingAtoms()
    n_miss_atoms = sum(len(v) for v in fixer.missingAtoms.values())
    n_term_atoms = sum(len(v) for v in fixer.missingTerminals.values())
    _info(f"Missing residues: {n_miss_res}")
    _info(f"Missing heavy atoms: {n_miss_atoms} (+{n_term_atoms} terminal atoms)")
    if nonstandard:
        for orig, repl in nonstandard:
            _warn(f"Non-standard residue: {orig} → {repl}")
    fixer.addMissingAtoms()
    with open(str(output_pdb), 'w') as fh:
        OmmPDBFile.writeFile(fixer.topology, fixer.positions, fh, keepIds=True)
    return {
        'n_missing_res': n_miss_res,
        'n_missing_atoms': n_miss_atoms,
        'nonstandard': nonstandard,
    }
def step_cap_termini(input_pdb: Path, output_pdb: Path) -> dict:
    """Complete missing terminal atoms (OXT, H, …) at chain breaks and real termini.

    Relies solely on PDBFixer's addMissingAtoms(); no ACE/NME cap residues
    are inserted. Returns {'n_caps': <missing terminal atoms found>}; when
    PDBFixer is unavailable the input is copied through and 0 is reported.
    """
    if not _PDBFIXER:
        _warn("PDBFixer not available — skipping terminus capping.")
        shutil.copy(input_pdb, output_pdb)
        return {'n_caps': 0}
    fixer = PDBFixer(filename=str(input_pdb))
    fixer.findMissingResidues()
    fixer.missingResidues = {}  # suppress insertion of gap residues
    fixer.findMissingAtoms()
    n_terminal = sum(len(atoms) for atoms in fixer.missingTerminals.values())
    fixer.addMissingAtoms()
    with output_pdb.open('w') as handle:
        OmmPDBFile.writeFile(fixer.topology, fixer.positions, handle, keepIds=True)
    return {'n_caps': n_terminal}
def step_protonate_pdb2pqr(input_pdb: Path, output_pdb: Path,
                           ph: float, ff: str = 'AMBER') -> Tuple[bool, dict]:
    """PDB2PQR + PROPKA: assign protonation states at target pH.

    Runs the external 'pdb2pqr' executable; returns (success, propka_info).
    On any failure (binary missing, non-zero exit) returns (False, {}) so
    the caller can fall back to PDBFixer.

    Fix: cleanup previously unlinked EVERY *.pqr / *.propka file in the
    output directory, which could delete files belonging to the user or to
    a concurrent run. Only this run's two temporary files are removed now.
    """
    pqr_out = output_pdb.with_suffix('.pqr')
    propka_out = output_pdb.with_suffix('.propka')
    cmd = [
        'pdb2pqr',
        '--ff', ff,
        '--titration-state-method', 'propka',
        '--with-ph', str(ph),
        '--pdb-output', str(output_pdb),
        '--keep-chain',
        '--drop-water',
        str(input_pdb),
        str(pqr_out),
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True,
                                cwd=str(output_pdb.parent))
    except FileNotFoundError:
        _warn("pdb2pqr not found in PATH — falling back to PDBFixer")
        return False, {}
    if result.returncode != 0:
        _warn(f"PDB2PQR exited with code {result.returncode}")
        # Show only the first few stderr lines to keep the report readable.
        for line in result.stderr.strip().splitlines()[:6]:
            _info(f" {line}")
        return False, {}
    # Parse PROPKA output for ionization report
    propka_info = _parse_propka(propka_out, ph)
    # Clean up this run's temporary files only.
    for f in (pqr_out, propka_out):
        f.unlink(missing_ok=True)
    return True, propka_info
def _parse_propka(propka_path: Path, ph: float) -> dict:
"""Parse PROPKA output to extract pKa predictions and protonation states."""
if not propka_path.exists():
return {}
try:
text = propka_path.read_text()
except Exception:
return {}
results = []
# Match lines like: ASP 18 A 3.80 (3.80) ...
pattern = re.compile(
r'^(ASP|GLU|HIS|LYS|ARG|TYR|CYS|NTR|CTR)\s+(\d+)\s+(\w)\s+([\d.]+)',
re.MULTILINE
)
for m in pattern.finditer(text):
resname, resid, chain, pka = m.groups()
pka_val = float(pka)
protonated = pka_val > ph # residue is protonated if pKa > pH
# for acids: protonated = neutral (COOH); for bases: protonated = charged (+NH3)
results.append({
'resname': resname, 'resid': int(resid), 'chain': chain,
'pka': pka_val, 'protonated': protonated,
})
return {'propka': results}
def _report_protonation(propka_info: dict, ph: float):
    """Print a PPW-style ionization table from parsed PROPKA results.

    Renders a rich Table when available, else a plain-text table, and warns
    about residues whose pKa lies within 1 unit of *ph* (ambiguous states).
    No-op when propka_info carries no entries.
    """
    entries = propka_info.get('propka', [])
    if not entries:
        return
    # NOTE(review): these three buckets are computed but never used below —
    # the table prints all entries; confirm before removing.
    acids = [e for e in entries if e['resname'] in ('ASP','GLU','CTR')]
    bases = [e for e in entries if e['resname'] in ('LYS','ARG','HIS','NTR')]
    other = [e for e in entries if e['resname'] in ('TYR','CYS')]
    special = []
    # Flag residues with unusual protonation at this pH
    for e in entries:
        rn = e['resname']
        pka = e['pka']
        # Flag if pKa is "near" pH (within 1 unit) — these could be ambiguous
        if abs(pka - ph) < 1.0 and rn not in ('NTR','CTR'):
            special.append(e)
    _info(f"Ionization report at pH {ph}:")
    if _RICH:
        tbl = Table(box=box.SIMPLE, show_header=True, header_style="bold magenta",
                    show_edge=False, padding=(0, 1))
        tbl.add_column("Residue", style="cyan", no_wrap=True)
        tbl.add_column("Chain")
        tbl.add_column("Seq#", justify="right")
        tbl.add_column("pKa", justify="right")
        tbl.add_column("State at pH " + str(ph))
        tbl.add_column("Note")
        # Sort by (chain, residue number) for a stable, readable listing.
        for e in sorted(entries, key=lambda x: (x['chain'], x['resid'])):
            rn = e['resname']
            state, color = _protonation_label(rn, e['protonated'])
            note = "[yellow]⚠ near pH[/yellow]" if e in special else ""
            tbl.add_row(rn, e['chain'], str(e['resid']),
                        f"{e['pka']:.2f}", f"[{color}]{state}[/{color}]", note)
        console.print(tbl)
        _log_lines.append(f" {len(entries)} titratable residues analyzed by PROPKA")
    else:
        # Plain-text fallback table.
        _info(f" {'Res':<6} {'Chain':>5} {'Seq#':>5} {'pKa':>6} State")
        for e in sorted(entries, key=lambda x: (x['chain'], x['resid'])):
            state, _ = _protonation_label(e['resname'], e['protonated'])
            flag = " ⚠" if e in special else ""
            _info(f" {e['resname']:<6} {e['chain']:>5} {e['resid']:>5} "
                  f"{e['pka']:>6.2f} {state}{flag}")
    if special:
        _warn(f"{len(special)} residue(s) have pKa within 1 unit of pH {ph} "
              f"— protonation state may be ambiguous; consider manual inspection.")
def _protonation_label(resname: str, protonated: bool) -> Tuple[str, str]:
"""Return human-readable state label and rich color."""
mapping = {
('ASP', True): ("neutral (–COOH)", "white"),
('ASP', False): ("anionic (–COO⁻)", "green"),
('GLU', True): ("neutral (–COOH)", "white"),
('GLU', False): ("anionic (–COO⁻)", "green"),
('HIS', True): ("protonated (HIP+)", "yellow"),
('HIS', False): ("neutral (HIE/HID)", "green"),
('LYS', True): ("protonated (+NH₃)", "green"),
('LYS', False): ("neutral (–NH₂)", "yellow"),
('ARG', True): ("protonated (+)", "green"),
('ARG', False): ("neutral", "yellow"),
('TYR', True): ("neutral (–OH)", "white"),
('TYR', False): ("anionic (–O⁻)", "yellow"),
('CYS', True): ("neutral (–SH)", "white"),
('CYS', False): ("thiolate (–S⁻)", "yellow"),
('NTR', True): ("N-term protonated", "green"),
('NTR', False): ("N-term neutral", "white"),
('CTR', True): ("C-term neutral", "white"),
('CTR', False): ("C-term anionic", "green"),
}
return mapping.get((resname, protonated), ("unknown", "dim"))
def step_protonate_pdbfixer(input_pdb: Path, output_pdb: Path, ph: float):
    """Fallback protonation via PDBFixer: add hydrogens at *ph* (no PROPKA)."""
    if not _PDBFIXER:
        _fatal("PDBFixer is required: pip install pdbfixer")
    fixer = PDBFixer(filename=str(input_pdb))
    fixer.addMissingHydrogens(ph)
    with output_pdb.open('w') as handle:
        OmmPDBFile.writeFile(fixer.topology, fixer.positions, handle, keepIds=True)
def _normalize_his_names(model) -> int:
    """Rename HIS-family residues to the correct HID/HIE/HIP variant.

    Inspects which imidazole ring hydrogen atoms are actually present:
        HD1 present, HE2 absent  → HID (neutral, δ-protonated)
        HE2 present, HD1 absent  → HIE (neutral, ε-protonated)
        both present             → HIP (protonated cation, +1)
        neither present          → HIE (default neutral; warns — may indicate
                                   a failed protonation step)
    Also handles CHARMM naming (HSD/HSE/HSP). Returns the number of
    residues whose name was changed.
    """
    _charmm_his = frozenset({'HSD', 'HSE', 'HSP'})
    _his_variants = frozenset({'HIS', 'HID', 'HIE', 'HIP', 'HSD', 'HSE', 'HSP'})
    n_renamed = 0
    for chain in model:
        for res in chain:
            rn = res.get_resname().strip().upper()
            if rn not in _his_variants:
                continue
            # Preserve the input's naming convention (CHARMM vs AMBER-style).
            use_charmm = rn in _charmm_his
            hd1 = 'HD1' in res
            he2 = 'HE2' in res
            if hd1 and he2:
                correct = 'HSP' if use_charmm else 'HIP'
            elif hd1:
                correct = 'HSD' if use_charmm else 'HID'
            elif he2:
                correct = 'HSE' if use_charmm else 'HIE'
            else:
                # No imidazole H found. A bare HIS (no HD1, no HE2) causes
                # force fields and viewers to apply an ambiguous charge model,
                # producing the characteristic "+N / −N" appearance on the ring.
                # Default to HIE (ε-tautomer — statistically the more common
                # neutral form) and warn so the user can inspect manually.
                correct = 'HSE' if use_charmm else 'HIE'
                rid = res.get_id()
                _warn(f"HIS {chain.id}{rid[1]}: no imidazole H found after "
                      f"protonation — defaulting to {correct}. "
                      f"Inspect manually (metal coordination? unusual environment?).")
            # Only count an actual change of name.
            if res.get_resname().strip() != correct:
                res.resname = correct
                n_renamed += 1
    return n_renamed
def step_normalize_his(input_pdb: Path, output_pdb: Path) -> int:
    """Load a protonated PDB, normalise HIS names, and write the result.

    Applies _normalize_his_names unconditionally so correct HID/HIE/HIP
    naming happens even when rotamer flipping is disabled (--no-flip).
    Returns the number of renamed residues; when BioPython is missing the
    file is copied through unchanged and 0 is returned.
    """
    if not _BIOPYTHON:
        shutil.copy(input_pdb, output_pdb)
        return 0
    structure = PDBParser(QUIET=True).get_structure('p', str(input_pdb))
    renamed = _normalize_his_names(structure[0])
    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(str(output_pdb))
    return renamed
_HIS_VAR_RE = re.compile(
r'^((?:ATOM |HETATM).{11})(HID|HIE|HIP)( )(.)(.{4})',
re.MULTILINE,
)
def _rename_his_to_his(pdb_path: Path) -> int:
"""Rename HID/HIE/HIP → HIS in a PDB file for viewer compatibility.
The protonation state is preserved in hydrogen atom positions regardless
of the residue name. Returns the number of unique HIS residues renamed.
"""
text = pdb_path.read_text()
seen: set = set()
def _sub(m: re.Match) -> str:
seen.add((m.group(4), m.group(5).strip())) # (chain, resseq)
return m.group(1) + 'HIS' + m.group(3) + m.group(4) + m.group(5)
new_text = _HIS_VAR_RE.sub(_sub, text)
if new_text != text:
pdb_path.write_text(new_text)
return len(seen)
def _count_clashes(pdb_path: Path, cutoff: float = 1.5) -> int:
    """Count severe steric clashes (heavy-atom pairs closer than cutoff Å).

    Best-effort diagnostic: returns -1 when BioPython is unavailable or the
    structure cannot be parsed, never raises. Intra-residue pairs and
    sequence-adjacent pairs in the same chain are excluded.
    """
    if not _BIOPYTHON:
        return -1
    try:
        parser = PDBParser(QUIET=True)
        s = parser.get_structure('p', str(pdb_path))[0]
        # Heavy atoms only, primary conformers only.
        heavy = [a for a in s.get_atoms()
                 if a.element not in (None, 'H') and a.get_altloc() in (' ', 'A')]
        if not heavy:
            return 0
        ns = NeighborSearch(heavy)
        clashes = 0
        seen = set()
        for atom in heavy:
            nbs = ns.search(atom.get_vector().get_array(), cutoff, 'A')
            for nb in nbs:
                if nb is atom:
                    continue
                # Deduplicate unordered pairs via object identity.
                pair = tuple(sorted([id(atom), id(nb)]))
                if pair in seen:
                    continue
                seen.add(pair)
                r1 = atom.get_parent()
                r2 = nb.get_parent()
                # Skip intra-residue pairs and adjacent-residue backbone bonds
                # (e.g. C(i)–N(i+1) ~1.33 Å is below the clash cutoff)
                if r1 is r2:
                    continue
                if (r1.get_parent() is r2.get_parent() and
                        abs(r1.get_id()[1] - r2.get_id()[1]) <= 1):
                    continue
                clashes += 1
        return clashes
    except Exception:
        # Diagnostics must never abort the pipeline.
        return -1
def _filter_nearest_hetatm(hetatm_lines: List[str], protein_pdb: Path) -> List[str]:
"""When there are multiple instances of the same HETATM residue (e.g. K8Q
in chains F, G, H …), return only the instance whose atoms are closest to
any protein atom. Falls back to all lines if parsing fails or only one
instance exists. Non-HETATM lines (REMARK, CONECT …) are always kept.
"""
from collections import defaultdict
groups: dict = defaultdict(list)
other_lines: List[str] = []
for line in hetatm_lines:
if line[:6].strip() != 'HETATM':
other_lines.append(line)
continue
chain_id = line[21] if len(line) > 21 else '?'
try:
seq_id = int(line[22:26].strip())
except (ValueError, IndexError):
other_lines.append(line)
continue
groups[(chain_id, seq_id)].append(line)
if len(groups) <= 1:
return hetatm_lines # Nothing to filter
# Collect protein atom positions
try:
prot_atoms = []
for pl in protein_pdb.read_text().splitlines():
if pl[:6].strip() in ('ATOM', 'HETATM'):
try:
prot_atoms.append([float(pl[30:38]), float(pl[38:46]), float(pl[46:54])])
except (ValueError, IndexError):
pass
except Exception:
return hetatm_lines # Fallback: return all
if not prot_atoms or not _NUMPY:
return hetatm_lines
prot_arr = np.array(prot_atoms)
best_key, best_dist = None, float('inf')