Skip to content

Commit e5812f9

Browse files
authored
Merge pull request #55 from ccdc-opensource/api_paper_examples
Initial Commit of example scripts for the CSD Python API Paper
2 parents 3be3593 + 8fd49be commit e5812f9

27 files changed

+204484
-0
lines changed

api_paper_2024/example_1/ReadMe.md

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Extending Substructure Searching using the CSD Python API
2+
3+
This example searches the CSD using a SMARTS query through the CSD Python API and then further inspects the matching molecules as required.
4+
Using SMARTS, it is possible to express a complex Isoflavone query which is recursive in nature.
5+
6+
## To Run
7+
8+
```bash
9+
python isovflavone_search.py
10+
```

api_paper_2024/example_1/data_files/demo_subset_of_pubchem.sdf

+191,668
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
#!/usr/bin/env python
2+
#
3+
# This script can be used for any purpose without limitation subject to the
4+
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
5+
#
6+
# This permission notice and the following statement of attribution must be
7+
# included in all copies or substantial portions of this script.
8+
#
9+
# 2024-05-02: created by the Cambridge Crystallographic Data Centre
10+
#
11+
12+
"""
13+
Credit - this code was written by Jason C. Cole, Natalie Johnson and Alex Moldovan
14+
"""
15+
import argparse
16+
17+
from ccdc.io import EntryReader
18+
from ccdc.search import SubstructureSearch, SMARTSSubstructure
19+
20+
21+
class TrihydroxyIsoflavoneHitfinder(SubstructureSearch.HitProcessor):
    """Post process hits - pick out hits that are only tri-substituted
    and then retrieve information from them to tabulate.

    Typical use: construct with a SMARTS string, call :meth:`run` to perform
    the search (which feeds every hit to :meth:`add_hit`), then call
    :meth:`tabulate` to collect per-hit data.
    """

    def __init__(self, query_string, database='CSD'):
        """Build the SMARTS query and open the database to be searched.

        :param query_string: SMARTS string describing the substructure.
        :param database: ``'CSD'`` or a path to a structure file; passed
            straight to ``EntryReader``.
        """
        self._query = SMARTSSubstructure(query_string)
        self._query_string = query_string
        self._hits = []
        self._extracted_data = []
        self._entry_reader = EntryReader(database)
        self._hits_without_filtering = []
        self._database = database

    @staticmethod
    def _hydroxyl_or_hydroxylate(atom):
        """Return True if ``atom`` is a hydroxyl (O-H) or hydroxylate (O-) oxygen.

        An oxygen qualifies when it either has a single neighbour and a formal
        charge of -1, or has two neighbours of which at least one is hydrogen.
        """
        # The oxygen test must guard BOTH alternatives; without explicit
        # grouping, `and` binds tighter than `or` and any two-coordinate atom
        # bonded to a hydrogen would be accepted regardless of its element.
        if atom.atomic_number != 8:
            return False
        neighbours = atom.neighbours
        if len(neighbours) == 1:
            return atom.formal_charge == -1.0
        if len(neighbours) == 2:
            return any(n.atomic_number == 1 for n in neighbours)
        return False

    def _find_hydroxyls(self, hit):
        """Return the matched atoms of ``hit`` that are hydroxyl(ate) oxygens."""
        return [atom for atom in hit.match_atoms() if self._hydroxyl_or_hydroxylate(atom)]

    def _count_bound_hydroxyls(self, hit):
        """Return how many matched atoms of ``hit`` are hydroxyl(ate) oxygens."""
        return len(self._find_hydroxyls(hit))

    def _get_entry_data_other(self, identifier):
        """Return the attributes of the entry ``identifier`` (non-CSD databases)."""
        entry = self._entry_reader.entry(identifier)
        return entry.attributes

    def _get_entry_data_csd(self, identifier):
        """Return the chemical name and synonyms of the CSD entry ``identifier``."""
        entry = self._entry_reader.entry(identifier)

        synonyms = entry.synonyms
        chemical_name = entry.chemical_name
        return {"Chemical Name": chemical_name, "Synonyms": synonyms}

    def _substitution_pattern(self, hit):
        """Return a comma-separated string of the query labels whose matched
        atom in ``hit`` is a hydroxyl(ate) oxygen.
        """
        # Get the labelled atoms in the query.
        # NOTE(review): this relies on the private ``_matches`` mapping of the
        # query object - confirm whether a public accessor exists.
        label_index_lookup = {
            label: self._query.label_to_atom_index(label)
            for label in self._query._matches.keys()
        }
        match_atoms = hit.match_atoms()
        pattern = [
            str(label)
            for label, atom_index in label_index_lookup.items()
            if self._hydroxyl_or_hydroxylate(match_atoms[atom_index])
        ]
        # NOTE(review): labels are sorted as strings, so "10" precedes "2".
        return ",".join(sorted(pattern))

    def _get_entry_data(self, hit):
        """Return the entry data for ``hit`` plus its substitution pattern."""
        # The attribute set in __init__ is ``_database``; reading
        # ``self.database`` raised AttributeError.
        if self._database == 'CSD':
            d = self._get_entry_data_csd(hit.identifier)
        else:
            d = self._get_entry_data_other(hit.identifier)
        return d | {"Substitution Pattern": self._substitution_pattern(hit)}

    def add_hit(self, hit):
        """
        This is the key method that gets called in the search

        Every hit is recorded in the unfiltered list; only hits with exactly
        three hydroxyl(ate) oxygens among their matched atoms are kept.
        """
        self._hits_without_filtering.append(hit)
        if self._count_bound_hydroxyls(hit) == 3:
            self._hits.append(hit)

    def tabulate(self):
        """
        Generate a dictionary of dictionaries of relevant data from the hits

        Keys are hit identifiers; if several kept hits share an identifier the
        last one wins.
        """
        return {hit.identifier: self._get_entry_data(hit) for hit in self._hits}

    def run(self):
        """Run the substructure search over the opened database."""
        searcher = SubstructureSearch()
        searcher.add_substructure(self._query)
        super().search(searcher, self._entry_reader)
105+
106+
107+
if __name__ == "__main__":
    # Allowed substituents at each labelled position: hydrogen, hydroxyl,
    # one-coordinate oxygen with no hydrogens, methoxy, or phenoxy.
    sub = "$([#1]),$([OH1]),$([OX1H0]),$(O[CH3]),$(Oc1ccccc1)"
    query_string = (f"c(!@[{sub}:1])1c(!@[{sub}:2])c(!@[{sub}:3])c(!@[{sub}:4])c(OC(!@[{sub}:5])"
                    f"=C(c2c(!@[{sub}:6])c(!@[{sub}:7])c(!@[{sub}:8])c(!@[{sub}:9])c(!@[{sub}:10])2)C(=O)c1)")  # noqa

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--database', default='CSD',
                        help='Path to the file to search or "CSD" to use the CSD')
    args = parser.parse_args()

    filtered_search = TrihydroxyIsoflavoneHitfinder(query_string, args.database)
    filtered_search.run()

    data = filtered_search.tabulate()
    sorted_ids = sorted(data)
    # Column names are taken from the first hit. When nothing survives the
    # filter there is no first hit, so fall back to no extra columns instead
    # of failing with IndexError.
    info_keys = sorted(data[sorted_ids[0]]) if sorted_ids else []

    print(f"Without filtering, we have: {len(filtered_search._hits_without_filtering)} hits")
    print(f"After filtering we have: {len(filtered_search._hits)} hits")

    print(",".join(["Identifier"] + info_keys))
    for key in sorted_ids:
        datum = data[key]
        # Entries read from a file may not all carry the same attributes;
        # emit an empty field for a missing one rather than raising KeyError.
        print(",".join([key] + [str(datum.get(x, "")) for x in info_keys]))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/usr/bin/env python
2+
#
3+
# This script can be used for any purpose without limitation subject to the
4+
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
5+
#
6+
# This permission notice and the following statement of attribution must be
7+
# included in all copies or substantial portions of this script.
8+
#
9+
# 2024-05-02: created by the Cambridge Crystallographic Data Centre
10+
#
11+
12+
import sys
13+
14+
from ccdc.entry import Entry
15+
from ccdc.io import EntryReader, EntryWriter
16+
17+
18+
def process_structures(input_file, output_file):
    """Copy every entry of ``input_file`` to ``output_file`` with bond types assigned.

    For each entry read, the molecule has ``assign_bond_types('all')`` applied
    and is written out as a new entry carrying the original entry's attributes.

    :param input_file: path handed to ``EntryReader``.
    :param output_file: path handed to ``EntryWriter``.
    """
    with EntryReader(input_file) as reader, EntryWriter(output_file) as writer:
        for entry in reader:
            # Capture the attributes before touching the molecule, as the
            # new entry is rebuilt from the molecule alone.
            original_attributes = entry.attributes
            mol = entry.molecule
            mol.assign_bond_types('all')
            writer.write(Entry.from_molecule(mol, attributes=original_attributes))
26+
27+
28+
if __name__ == "__main__":
    # Two positional arguments are required: the input path, then the output
    # path. Any further arguments are ignored. Exit with a usage message
    # instead of an IndexError traceback when they are missing.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <input_file> <output_file>")
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    process_structures(input_file, output_file)

api_paper_2024/example_2/ReadMe.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Mining third-party resources for structural context
2+
3+
The CSD Python API, combined with other Python and REST APIs, can be very powerful for mining for such relationships.
4+
Each CCDC refcode has associated publication information, and many have an associated publication digital object
5+
identifier (DOI).
6+
7+
## Dependencies
8+
9+
- habanero
10+
11+
Optional:
12+
13+
- wordcloud
14+
- nltk (after install please consult <https://www.nltk.org/data.html>)
15+
16+
```conda install -c conda-forge habanero wordcloud nltk```
17+
18+
## To Run
19+
20+
Add script folder through CSD Python API drop down menu.
21+
With a structure opened, click `about_entry.py` in menu dropdown.

0 commit comments

Comments
 (0)