|
| 1 | +#!/usr/bin/env python |
| 2 | +# |
| 3 | +# This script can be used for any purpose without limitation subject to the |
| 4 | +# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx |
| 5 | +# |
| 6 | +# This permission notice and the following statement of attribution must be |
| 7 | +# included in all copies or substantial portions of this script. |
| 8 | +# |
| 9 | +# 2024-05-02: created by the Cambridge Crystallographic Data Centre |
| 10 | +# |
| 11 | + |
| 12 | +""" |
| 13 | +Credit - this code was written by Jason C. Cole, Natalie Johnson and Alex Moldovan |
| 14 | +""" |
| 15 | +import argparse |
| 16 | + |
| 17 | +from ccdc.io import EntryReader |
| 18 | +from ccdc.search import SubstructureSearch, SMARTSSubstructure |
| 19 | + |
| 20 | + |
class TrihydroxyIsoflavoneHitfinder(SubstructureSearch.HitProcessor):
    """Post process hits - pick out hits that are only tri-substituted
    and then retrieve information from them to tabulate.

    Every hit passed to :meth:`add_hit` is recorded; only hits whose matched
    atoms contain exactly three hydroxyl/hydroxylate oxygens are kept for
    tabulation.
    """

    def __init__(self, query_string, database='CSD'):
        """Build the SMARTS query and open the database to search.

        :param query_string: SMARTS string with labelled substituent atoms.
        :param database: value handed to ``EntryReader``; the literal
            ``'CSD'`` selects the CSD-specific entry fields when tabulating.
        """
        self._query = SMARTSSubstructure(query_string)
        self._query_string = query_string
        self._hits = []
        self._extracted_data = []
        self._entry_reader = EntryReader(database)
        self._hits_without_filtering = []
        self._database = database

    @staticmethod
    def _hydroxyl_or_hydroxylate(atom):
        """Return True if ``atom`` is a hydroxyl (O-H) or hydroxylate (O-) oxygen.

        The oxygen test guards both alternatives.  The previous expression
        ``A and B or C`` parsed as ``(A and B) or C``, so any two-coordinate
        atom bonded to a hydrogen was accepted regardless of its element.
        """
        if atom.atomic_number != 8:
            return False

        neighbours = atom.neighbours
        if len(neighbours) == 1:
            # Terminal oxygen: only counts when it carries a negative charge.
            return atom.formal_charge == -1.0
        if len(neighbours) == 2:
            # Two-coordinate oxygen: counts when one neighbour is hydrogen.
            return any(n.atomic_number == 1 for n in neighbours)
        return False

    def _find_hydroxyls(self, hit):
        """Return the matched atoms of ``hit`` that are hydroxyl(ate) oxygens."""
        return [atom for atom in hit.match_atoms() if self._hydroxyl_or_hydroxylate(atom)]

    def _count_bound_hydroxyls(self, hit):
        """Return how many matched atoms of ``hit`` are hydroxyl(ate) oxygens."""
        return len(self._find_hydroxyls(hit))

    def _get_entry_data_other(self, identifier):
        """Return the ``attributes`` of the entry for a non-CSD database."""
        entry = self._entry_reader.entry(identifier)
        return entry.attributes

    def _get_entry_data_csd(self, identifier):
        """Return the chemical name and synonyms of a CSD entry as a dict."""
        entry = self._entry_reader.entry(identifier)

        synonyms = entry.synonyms
        chemical_name = entry.chemical_name
        return {"Chemical Name": chemical_name, "Synonyms": synonyms}

    def _substitution_pattern(self, hit):
        """Return a comma-separated list of the query labels that matched a
        hydroxyl(ate) oxygen in ``hit``.
        """
        # Map each label in the query to the index of its atom in the match.
        # NOTE(review): relies on the private ``_matches`` mapping of the
        # query object - confirm there is no public accessor for the labels.
        label_index_lookup = {
            label: self._query.label_to_atom_index(label)
            for label in self._query._matches.keys()
        }
        match_atoms = hit.match_atoms()

        hydroxylated = [
            label for label, index in label_index_lookup.items()
            if self._hydroxyl_or_hydroxylate(match_atoms[index])
        ]
        # Sort the labels themselves before converting to text, so numeric
        # labels come out as "2,10" rather than the string order "10,2".
        return ",".join(str(label) for label in sorted(hydroxylated))

    def _get_entry_data(self, hit):
        """Return the tabulated data for ``hit``: the entry information plus
        its ``"Substitution Pattern"``.
        """
        # The attribute is stored as ``_database``; reading ``self.database``
        # raised AttributeError.
        if self._database == 'CSD':
            d = self._get_entry_data_csd(hit.identifier)
        else:
            d = self._get_entry_data_other(hit.identifier)

        # Copy so that the mapping returned by the entry is never mutated.
        data = dict(d)
        data["Substitution Pattern"] = self._substitution_pattern(hit)
        return data

    def add_hit(self, hit):
        """
        This is the key method that gets called in the search

        Every hit is remembered; only those with exactly three
        hydroxyl(ate) oxygens among the matched atoms are kept as
        filtered hits.
        """
        self._hits_without_filtering.append(hit)
        if self._count_bound_hydroxyls(hit) == 3:
            self._hits.append(hit)

    def tabulate(self):
        """
        Generate a dictionary of dictionaries of relevant data from the hits

        Keys are hit identifiers; a later hit with the same identifier
        replaces the data of an earlier one.
        """
        return {hit.identifier: self._get_entry_data(hit) for hit in self._hits}

    def run(self):
        """Run the substructure search over the configured database, feeding
        hits to :meth:`add_hit`.
        """
        searcher = SubstructureSearch()
        searcher.add_substructure(self._query)
        super().search(searcher, self._entry_reader)
| 105 | + |
| 106 | + |
if __name__ == "__main__":

    # Allowed substituents at each labelled position: hydrogen, hydroxyl,
    # a one-connected oxygen with no hydrogen, methoxy oxygen, or an oxygen
    # bonded to a six-membered aromatic carbon ring.
    sub = "$([#1]),$([OH1]),$([OX1H0]),$(O[CH3]),$(Oc1ccccc1)"
    # Core query with ten labelled, acyclically bonded substituent positions.
    query_string = (f"c(!@[{sub}:1])1c(!@[{sub}:2])c(!@[{sub}:3])c(!@[{sub}:4])c(OC(!@[{sub}:5])"
                    f"=C(c2c(!@[{sub}:6])c(!@[{sub}:7])c(!@[{sub}:8])c(!@[{sub}:9])c(!@[{sub}:10])2)C(=O)c1)")  # noqa

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--database', default='CSD',
                        help='Path to the file to search or "CSD" to use the CSD')
    args = parser.parse_args()

    filtered_search = TrihydroxyIsoflavoneHitfinder(query_string, args.database)
    filtered_search.run()

    data = filtered_search.tabulate()
    sorted_ids = sorted(data)
    # Use the union of keys over all entries: this copes with an empty result
    # (no IndexError) and with entries whose attribute sets differ.
    info_keys = sorted({key for datum in data.values() for key in datum})

    print(f"Without filtering, we have: {len(filtered_search._hits_without_filtering)} hits")
    print(f"After filtering we have: {len(filtered_search._hits)} hits")

    print(",".join(["Identifier"] + info_keys))
    for key in sorted_ids:
        datum = data[key]
        # Missing values are emitted as empty fields rather than raising.
        print(",".join([key] + [str(datum.get(x, "")) for x in info_keys]))
0 commit comments