Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial Commit of example scripts for the CSD Python API Paper #55

Merged
merged 13 commits into from
May 13, 2024
7 changes: 7 additions & 0 deletions api_paper_2024/example_1/ReadMe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Extending Substructure Searching using the CSD Python API
By searching the CSD using a SMARTS query through the CSD Python API and further inspect molecules as required.
Using SMARTS, it is possible to express a complex Isoflavone query which is recursive in nature.
## To Run
```bash
python isovflavone_search.py
```
191,668 changes: 191,668 additions & 0 deletions api_paper_2024/example_1/data_files/demo_subset_of_pubchem.sdf

Large diffs are not rendered by default.

133 changes: 133 additions & 0 deletions api_paper_2024/example_1/isovflavone_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2024-05-02: created by the Cambridge Crystallographic Data Centre
#

"""
Credit - this code was written by Jason C. Cole, Natalie Johnson and Alex Moldovan
"""
import argparse

from ccdc.io import EntryReader
from ccdc.search import SubstructureSearch, SMARTSSubstructure


class TrihydroxyIsoflavoneHitfinder(SubstructureSearch.HitProcessor):
"""Post process hits - pick out hits that are only tri-substituted
and then retrieve information from them to tabulate.
"""

def __init__(self, query_string, database='CSD'):

self._query = SMARTSSubstructure(query_string)
self._query_string = query_string
self._hits = []
self._extracted_data = []
self._entry_reader = EntryReader(database)
self._hits_without_filtering = []
self._database = database

@staticmethod
def _hydroxyl_or_hydroxylate(atom):
return (atom.atomic_number == 8) and \
(len(atom.neighbours) == 1 and atom.formal_charge == -1.0) or \
(len(atom.neighbours) == 2 and (
atom.neighbours[0].atomic_number == 1 or atom.neighbours[1].atomic_number == 1))

def _find_hydroxyls(self, hit):
return [atom for atom in hit.match_atoms() if self._hydroxyl_or_hydroxylate(atom)]

def _count_bound_hydroxyls(self, hit):
return len(self._find_hydroxyls(hit))

def _get_entry_data_other(self, identifier):
entry = self._entry_reader.entry(identifier)
return entry.attributes

def _get_entry_data_csd(self, identifier):
entry = self._entry_reader.entry(identifier)

synonyms = entry.synonyms
chemical_name = entry.chemical_name
return {"Chemical Name": chemical_name, "Synonyms": synonyms}

def _substitution_pattern(self, hit):
# Get the labelled atoms in the query

pattern = []
label_index_lookup = {i: self._query.label_to_atom_index(i) for i in self._query._matches.keys()}
print(label_index_lookup)
match_atoms = hit.match_atoms()
for k in label_index_lookup.keys():

atom = match_atoms[label_index_lookup[k]]

if self._hydroxyl_or_hydroxylate(atom):
pattern.append(str(k))

return ",".join(sorted(pattern))

def _get_entry_data(self, hit):
if self.database == 'CSD':
d = self._get_entry_data_csd(hit.identifier)
else:
d = self._get_entry_data_other(hit.identifier)
return d | {"Substitution Pattern": self._substitution_pattern(hit)}

def add_hit(self, hit):
"""
This is the key method that gets called in the search
"""
self._hits_without_filtering.append(hit)
if self._count_bound_hydroxyls(hit) == 3:
self._hits.append(hit)

def tabulate(self):
"""
Generate a dictionary of dictionaries of relevant data from the hits
"""
data = {}
for hit in self._hits:
data[hit.identifier] = self._get_entry_data(hit)

return data

def run(self):
searcher = SubstructureSearch()
searcher.add_substructure(self._query)
super().search(searcher, self._entry_reader)


if __name__ == "__main__":

sub = "$([#1]),$([OH1]),$([OX1H0]),$(O[CH3]),$(Oc1ccccc1)"
query_string = (f"c(!@[{sub}:1])1c(!@[{sub}:2])c(!@[{sub}:3])c(!@[{sub}:4])c(OC(!@[{sub}:5])"
f"=C(c2c(!@[{sub}:6])c(!@[{sub}:7])c(!@[{sub}:8])c(!@[{sub}:9])c(!@[{sub}:10])2)C(=O)c1)") # noqa

parser = argparse.ArgumentParser()
parser.add_argument('-d', '--database', default='CSD',
help='Path to the file to search or "CSD" to use the CSD')
args = parser.parse_args()

database = 'tiny.sdf'
filtered_search = TrihydroxyIsoflavoneHitfinder(query_string, args.database)
filtered_search.run()

data = filtered_search.tabulate()
sorted_ids = sorted(data.keys())
info_keys = sorted(data[sorted_ids[0]].keys())

print(f"Without filtering, we have: {len(filtered_search._hits_without_filtering)} hits")
print(f"After filtering we have: {len(filtered_search._hits)} hits")

print(",".join(["Identifier"] + info_keys))
for key in sorted_ids:
datum = data[key]

print(",".join([key] + [str(datum[x]) for x in info_keys]))
31 changes: 31 additions & 0 deletions api_paper_2024/example_1/process_pubchem_structures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2024-05-02: created by the Cambridge Crystallographic Data Centre
#

import sys

from ccdc.entry import Entry
from ccdc.io import EntryReader, EntryWriter


def process_structures(input_file, output_file):
with EntryReader(input_file) as er, EntryWriter(output_file) as ew:
for e in er:
attribs = e.attributes
molecule = e.molecule
molecule.assign_bond_types('all')
ne = Entry.from_molecule(molecule, attributes=attribs)
ew.write(ne)


if __name__ == "__main__":
input_file = sys.argv[1]
output_file = sys.argv[2]
process_structures(input_file, output_file)
18 changes: 18 additions & 0 deletions api_paper_2024/example_2/ReadMe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Mining third-party resources for structural context
The CSD Python API, combined with other Python and REST APIs, can be very powerful for mining for such relationships.
Each CCDC refcode has associated publication information, and many have an associated publication document object
identifier (DOI).

## Dependencies
- habanero

Optional:

- wordcloud
- nltk (after install please consult <https://www.nltk.org/data.html>)

```conda install -c conda-forge habanero wordcloud nltk```

## To Run
Add script folder through CSD Python API drop down menu.
With a structure opened, click `about_entry.py` in menu dropdown.
Loading
Loading