Skip to content

Commit a98eb19

Browse files
committed
Initial commit of example scripts for API paper
1 parent 3be3593 commit a98eb19

27 files changed

+204477
-0
lines changed

api_paper_2024/example_1/ReadMe.md

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Extending Substructure Searching using the CSD Python API
2+
3+
This example searches the CSD using a SMARTS query through the CSD Python API and then further inspects the matching molecules as required.
4+
Using SMARTS, it is possible to express a complex Isoflavone query which is recursive in nature.
5+
6+
7+
## To Run:
8+
```bash
9+
python isoflavone_search.py
10+
```

api_paper_2024/example_1/data_files/demo_subset_of_pubchem.sdf

+191,668
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#!/usr/bin/env python
2+
#
3+
# This script can be used for any purpose without limitation subject to the
4+
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
5+
#
6+
# This permission notice and the following statement of attribution must be
7+
# included in all copies or substantial portions of this script.
8+
#
9+
# 2024-05-02: created by the Cambridge Crystallographic Data Centre
10+
#
11+
12+
"""
13+
Credit - this code was written by Jason C. Cole, Natalie Johnson and Alex Moldovan
14+
"""
15+
16+
from ccdc.io import EntryReader
17+
from ccdc.search import SubstructureSearch, SMARTSSubstructure
18+
19+
20+
class TrihydroxyIsoflavoneHitfinder(SubstructureSearch.HitProcessor):
    """Post process hits - pick out hits that are only tri-substituted
    and then retrieve information from them to tabulate.

    Every hit passed to :meth:`add_hit` is recorded; only hits whose matched
    atoms contain exactly three hydroxyl / hydroxylate oxygens are kept for
    tabulation.
    """

    def __init__(self, query_string, database='CSD'):
        """Build the SMARTS query and open a reader on the database.

        :param query_string: SMARTS string defining the substructure to search for.
        :param database: value passed to ``EntryReader``; the default ``'CSD'``
            selects the CSD-specific data extraction in ``_get_entry_data``.
        """
        self._query = SMARTSSubstructure(query_string)
        self._query_string = query_string
        self._hits = []  # hits that passed the three-hydroxyl filter
        self._extracted_data = []
        self._entry_reader = EntryReader(database)
        self._hits_without_filtering = []  # every hit seen by add_hit
        self._database = database

    @staticmethod
    def _hydroxyl_or_hydroxylate(atom):
        """Return True if ``atom`` is an oxygen that is either O- or O-H.

        An oxygen qualifies when it has a single neighbour and a formal
        charge of -1, or when it has two neighbours of which at least one is
        a hydrogen.
        """
        # The oxygen test must guard BOTH alternatives; without explicit
        # grouping ``a and b or c`` lets any two-connected atom bearing a
        # hydrogen through.
        if atom.atomic_number != 8:
            return False
        neighbours = atom.neighbours
        if len(neighbours) == 1:
            return atom.formal_charge == -1.0
        if len(neighbours) == 2:
            return any(n.atomic_number == 1 for n in neighbours)
        return False

    def _find_hydroxyls(self, hit):
        """Return the matched atoms of ``hit`` that are hydroxyl(ate) oxygens."""
        return [atom for atom in hit.match_atoms() if self._hydroxyl_or_hydroxylate(atom)]

    def _count_bound_hydroxyls(self, hit):
        """Return how many matched atoms of ``hit`` are hydroxyl(ate) oxygens."""
        return len(self._find_hydroxyls(hit))

    def _get_entry_data_other(self, identifier):
        """Return the attributes of the entry ``identifier`` (non-CSD sources)."""
        entry = self._entry_reader.entry(identifier)
        return entry.attributes

    def _get_entry_data_csd(self, identifier):
        """Return the chemical name and synonyms of the CSD entry ``identifier``."""
        entry = self._entry_reader.entry(identifier)
        return {"Chemical Name": entry.chemical_name, "Synonyms": entry.synonyms}

    def _substitution_pattern(self, hit):
        """Return the comma-separated query labels whose matched atom is a hydroxyl(ate).

        The labels are sorted as strings before joining.
        """
        # Map each label in the query to the index of its atom in the match.
        # NOTE(review): this reads the private ``_matches`` attribute of the
        # SMARTS substructure - confirm there is no public accessor.
        label_index_lookup = {
            label: self._query.label_to_atom_index(label)
            for label in self._query._matches.keys()
        }
        match_atoms = hit.match_atoms()
        pattern = [
            str(label)
            for label, index in label_index_lookup.items()
            if self._hydroxyl_or_hydroxylate(match_atoms[index])
        ]
        return ",".join(sorted(pattern))

    def _get_entry_data(self, hit):
        """Return the data row for ``hit``: entry data plus its substitution pattern."""
        # The database name is stored on ``_database``; there is no public
        # ``database`` attribute on this class.
        if self._database == 'CSD':
            d = self._get_entry_data_csd(hit.identifier)
        else:
            d = self._get_entry_data_other(hit.identifier)
        return d | {"Substitution Pattern": self._substitution_pattern(hit)}

    def add_hit(self, hit):
        """
        This is the key method that gets called in the search
        """
        self._hits_without_filtering.append(hit)
        if self._count_bound_hydroxyls(hit) == 3:
            self._hits.append(hit)

    def tabulate(self):
        """
        Generate a dictionary of dictionaries of relevant data from the hits
        """
        return {hit.identifier: self._get_entry_data(hit) for hit in self._hits}

    def run(self):
        """Run the substructure search, routing hits through :meth:`add_hit`."""
        searcher = SubstructureSearch()
        searcher.add_substructure(self._query)
        super().search(searcher, self._entry_reader)
104+
105+
106+
if __name__ == "__main__":
    import argparse

    # Allowed substituents at each labelled position, as recursive SMARTS:
    # hydrogen, hydroxyl oxygen, one-connected oxygen without hydrogen,
    # methoxy oxygen, or an oxygen bonded to a six-membered aromatic carbon ring.
    sub = "$([#1]),$([OH1]),$([OX1H0]),$(O[CH3]),$(Oc1ccccc1)"
    # Isoflavone core with ten labelled, non-ring-bonded substituent positions.
    query_string = f"c(!@[{sub}:1])1c(!@[{sub}:2])c(!@[{sub}:3])c(!@[{sub}:4])c(OC(!@[{sub}:5])=C(c2c(!@[{sub}:6])c(!@[{sub}:7])c(!@[{sub}:8])c(!@[{sub}:9])c(!@[{sub}:10])2)C(=O)c1)" # noqa

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--database', default='CSD',
                        help='Path to the file to search or "CSD" to use the CSD')
    args = parser.parse_args()

    filtered_search = TrihydroxyIsoflavoneHitfinder(query_string, args.database)
    filtered_search.run()

    data = filtered_search.tabulate()
    sorted_ids = sorted(data.keys())
    # Use the union of keys over all hits: this yields an empty column list
    # (instead of an IndexError) when nothing survives the filter, and copes
    # with entries from non-CSD files that carry differing attribute sets.
    info_keys = sorted({key for datum in data.values() for key in datum})

    print(f"Without filtering, we have: {len(filtered_search._hits_without_filtering)} hits")
    print(f"After filtering we have: {len(filtered_search._hits)} hits")

    # Emit the table as comma-separated rows, one per hit identifier.
    print(",".join(["Identifier"] + info_keys))
    for key in sorted_ids:
        datum = data[key]
        print(",".join([key] + [str(datum.get(x, "")) for x in info_keys]))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/usr/bin/env python
2+
#
3+
# This script can be used for any purpose without limitation subject to the
4+
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
5+
#
6+
# This permission notice and the following statement of attribution must be
7+
# included in all copies or substantial portions of this script.
8+
#
9+
# 2024-05-02: created by the Cambridge Crystallographic Data Centre
10+
#
11+
12+
import sys
13+
14+
from ccdc.entry import Entry
15+
from ccdc.io import EntryReader, EntryWriter
16+
17+
18+
def process_structures(input_file, output_file):
    """Copy every entry of ``input_file`` to ``output_file`` with bond types assigned.

    For each entry read, the entry's attributes and molecule are taken, all
    bond types are (re)assigned on the molecule, and a new entry built from
    that molecule and the attributes is written out.

    :param input_file: path handed to ``EntryReader``.
    :param output_file: path handed to ``EntryWriter``.
    """
    with EntryReader(input_file) as reader, EntryWriter(output_file) as writer:
        for source_entry in reader:
            # Grab the attributes before touching the molecule.
            entry_attributes = source_entry.attributes
            mol = source_entry.molecule
            mol.assign_bond_types('all')
            # NOTE(review): confirm Entry.from_molecule takes an ``attributes``
            # keyword rather than arbitrary keyword attributes.
            writer.write(Entry.from_molecule(mol, attributes=entry_attributes))
26+
27+
28+
if __name__ == "__main__":
    # Expect the input and output file paths as the first two arguments;
    # report usage instead of failing with a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <input_file> <output_file>")
    process_structures(sys.argv[1], sys.argv[2])

api_paper_2024/example_2/ReadMe.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Mining third-party resources for structural context
2+
3+
The CSD Python API, combined with other Python and REST APIs, can be very powerful for mining for such relationships.
4+
Each CCDC refcode has associated publication information, and many have an associated publication digital object
5+
identifier (DOI). ##TODO Add very quick summary of tool.
6+
7+
## Dependencies
8+
9+
- habanero
10+
11+
Optional:
12+
13+
- wordcloud
14+
- nltk (after install please consult https://www.nltk.org/data.html)
15+
16+
```conda install -c conda-forge habanero wordcloud nltk```
17+
18+
## To Run
19+
20+
Add the script folder through the CSD Python API drop-down menu.
21+
With a structure open, click `about_entry.py` in the drop-down menu.

0 commit comments

Comments
 (0)