Skip to content

Commit

Permalink
Merge branch 'granular_download' into staging
Browse files Browse the repository at this point in the history
  • Loading branch information
kaliif committed Nov 21, 2023
2 parents 72281dc + 8205e28 commit 323a234
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 57 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ RUN apt-get update -y && \
nginx \
pandoc \
texlive-latex-base \
texlive-latex-recommended \
lmodern \
texlive-fonts-recommended && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
Expand Down
121 changes: 70 additions & 51 deletions viewer/download_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@
import datetime
import uuid
import shutil
import fnmatch
import logging
import copy
import json

from pathlib import Path

import pandoc



from django.conf import settings

from viewer.models import DownloadLinks
Expand All @@ -32,10 +36,10 @@
'cif_info': ('aligned'),
'mtz_info': ('aligned'),
'map_info': ('aligned'),
'sigmaa_info': ('aligned'),
'diff_info': ('aligned'),
# 'sigmaa_info': ('aligned'),
# 'diff_info': ('aligned'),
'event_info': ('aligned'),
'trans_matrix_info': ('aligned'),
# 'trans_matrix_info': ('aligned'),
'sdf_info': ('aligned'),
'single_sdf_file': (''),
'metadata_info': (''),
Expand All @@ -46,14 +50,14 @@

# Dictionary containing all references needed to create the zip file
# NB you may need to add a version number to this at some point...
zip_template = {'proteins': {'pdb_info': {},
'bound_info': {},
'cif_info': {},
'mtz_info': {},
'diff_info': {},
'event_info': {},
'sigmaa_info': {},
'trans_matrix_info': {}
zip_template = {'proteins': {'pdb_info': {}, # from experiment
'bound_file': {}, # x
'cif_info': {}, # from experiment
'mtz_info': {}, # from experiment
'event_file': {}, # x
# 'diff_info': {}, # next 3, not collected anymore
# 'sigmaa_info': {},
# 'trans_matrix_info': {}
},
'molecules': {'sdf_files': {},
'sdf_info': False,
Expand Down Expand Up @@ -90,7 +94,7 @@ def _replace_missing_sdf(molecule, code):

    # We shouldn't be called if molecule.ligand_mol_file is blank,
    # but check anyway.
sdf_info = molecule['sdf_info']
sdf_info = molecule.ligand_mol_file
if not sdf_info:
return None
sdf_lines = sdf_info.splitlines(True)[1:]
Expand Down Expand Up @@ -150,11 +154,7 @@ def _is_mol_or_sdf(path):
"""Returns True if the file and path look like a MOL or SDF file.
It does this by simply checking the file's extension.
"""
path_parts = os.path.splitext(os.path.basename(path))
if path_parts[1].lower() in ('.sdf', '.mol'):
return True
# Doesn't look like a MOL or SDF if we get here.
return False
return Path(path).suffix.lower() in ('.sdf', '.mol')


def _read_and_patch_molecule_name(path, molecule_name=None):
Expand Down Expand Up @@ -215,24 +215,28 @@ def _add_file_to_zip_aligned(ziparchive, code, filepath):
Returns:
[boolean]: [True of record added to error file]
"""
media_root = settings.MEDIA_ROOT
if not filepath:
return False

fullpath = os.path.join(media_root, filepath)
logger.debug('+ _add_file_to_zip_aligned, filepath: %s', filepath)

if os.path.isfile(fullpath):
cleaned_filename = clean_filename(fullpath)
archive_path = os.path.join('aligned', code, cleaned_filename)
if _is_mol_or_sdf(fullpath):
    # Apparently the incoming filepath can be either a str or a FieldFile
try:
filepath = filepath.path
except AttributeError:
filepath = str(Path(settings.MEDIA_ROOT).joinpath(filepath))

if Path(filepath).is_file():
archive_path = str(Path('aligned').joinpath(code).joinpath(filepath))
if _is_mol_or_sdf(filepath):
# It's a MOL or SD file.
# Read and (potentially) adjust the file
# and add to the archive as a string.
content = _read_and_patch_molecule_name(fullpath, molecule_name=code)
content = _read_and_patch_molecule_name(filepath, molecule_name=code)
ziparchive.writestr(archive_path, content)
else:
# Copy the file without modification
ziparchive.write(fullpath, archive_path)
ziparchive.write(filepath, archive_path)
return False

return True
Expand Down Expand Up @@ -273,10 +277,11 @@ def _protein_files_zip(zip_contents, ziparchive, error_file):
if not files:
continue

for prot, file in files.items():
if _add_file_to_zip_aligned(ziparchive, prot.split(":")[0], file):
for prot, prot_file in files.items():
logger.debug('%s: %s', prot, prot_file)
if _add_file_to_zip_aligned(ziparchive, prot.split(":")[0], prot_file):
error_file.write(
'{},{},{}\n'.format(param, prot, file))
'{},{},{}\n'.format(param, prot, prot_file))
prot_errors += 1
return prot_errors

Expand Down Expand Up @@ -349,7 +354,7 @@ def _extra_files_zip(ziparchive, target):
logger.info('Processing extra files (%s)...', extra_files)

if os.path.isdir(extra_files):
for dirpath, dummy, files in os.walk(extra_files):
for dirpath, _, files in os.walk(extra_files):
for file in files:
filepath = os.path.join(dirpath, file)
logger.info('Adding extra file "%s"...', filepath)
Expand Down Expand Up @@ -536,8 +541,7 @@ def _protein_garbage_filter(proteins):
Returns:
[list]: [update protein list]
"""
return [p for p in proteins
if not fnmatch.fnmatch(p['code'], 'references_*')]
return proteins.exclude(code__startswith=r'references_')


def _create_structures_dict(target, site_obvs, protein_params, other_params):
Expand All @@ -562,7 +566,15 @@ def _create_structures_dict(target, site_obvs, protein_params, other_params):
for so in site_obvs:
for param in protein_params:
if protein_params[param] is True:
zip_contents['proteins'][param].update({so['code']: so[param]})
try:
# getting the param from experiment. more data are
                    # coming from there, which is why this is in a try
# block
                    # getattr retrieves FieldFile object, hence the .name
zip_contents['proteins'][param][so.code] = getattr(so.experiment, param).name
except AttributeError:
# on the off chance that the data are in site_observation model
zip_contents['proteins'][param][so.code] = getattr(so, param).name

if other_params['single_sdf_file'] is True:
zip_contents['molecules']['single_sdf_file'] = True
Expand Down Expand Up @@ -609,7 +621,7 @@ def _create_structures_dict(target, site_obvs, protein_params, other_params):
# The smiles at molecule level may not be unique.
if other_params['smiles_info'] is True:
for molecule in site_obvs:
zip_contents['molecules']['smiles_info'].update({molecule['smiles']: None})
zip_contents['molecules']['smiles_info'].update({molecule.smiles: None})

# Add the metadata file from the target
if other_params['metadata_info'] is True:
Expand All @@ -633,11 +645,9 @@ def get_download_params(request):
Returns:
protein_params, other_params
"""
protein_param_flags = ['pdb_info', 'bound_file', 'cif_info',
'mtz_info', 'event_file',]

protein_param_flags = ['pdb_info', 'bound_info',
'cif_info', 'mtz_info',
'diff_info', 'event_info',
'sigmaa_info', 'trans_matrix_info']
other_param_flags = ['sdf_info', 'single_sdf_file',
'metadata_info', 'smiles_info']

Expand Down Expand Up @@ -679,7 +689,7 @@ def get_download_params(request):

def check_download_links(request,
target,
proteins):
site_observations):
"""Check/create the download zip file for dynamic links
Args:
Expand All @@ -695,17 +705,22 @@ def check_download_links(request,
host = request.get_host()

protein_params, other_params, static_link = get_download_params(request)
logger.debug('proteins_params: %s', protein_params)
logger.debug('other_params: %s', other_params)
logger.debug('static_link: %s', static_link)

# Remove 'references_' from protein list if present.
num_given_proteins = len(proteins)
proteins = _protein_garbage_filter(proteins)
num_removed = num_given_proteins - len(proteins)
num_given_proteins = site_observations.count()
site_observations = _protein_garbage_filter(site_observations)
num_removed = num_given_proteins - site_observations.count()
if num_removed:
logger.warning('Removed %d "references_" proteins from download', num_removed)

# Save the list of protein codes - this is the ispybsafe set for
# this user.
proteins_list = [p['code'] for p in proteins]
# proteins_list = [p.code for p in site_observations]
proteins_list = list(site_observations.values_list('code', flat=True))
logger.debug('proteins_list: %s', proteins_list)

# Remove the token so the original search can be stored
original_search = copy.deepcopy(request.data)
Expand All @@ -719,13 +734,15 @@ def check_download_links(request,
# from the search, to contain the latest information.
    # If the record is not there at all, then this is a new link.

existing_link = DownloadLinks.objects.filter(target_id=target.id)\
.filter(proteins=proteins_list) \
.filter(protein_params=protein_params)\
.filter(other_params=other_params) \
.filter(static_link=False)
existing_link = DownloadLinks.objects.filter(
target_id=target.id,
proteins=proteins_list,
protein_params=protein_params,
other_params=other_params,
static_link=False,
)

if existing_link:
if existing_link.exists():
if (existing_link[0].zip_file
and os.path.isfile(existing_link[0].file_url)
and not static_link):
Expand All @@ -744,7 +761,7 @@ def check_download_links(request,
# If so the missing file will have a file reference of 'MISSING/<filename>'
# in the corresponding ['molecules']['sdf_files'] entry.
zip_contents = _create_structures_dict(target,
proteins,
site_observations,
protein_params,
other_params)
_create_structures_zip(target,
Expand All @@ -770,7 +787,7 @@ def check_download_links(request,
file_url = os.path.join(media_root, 'downloads', str(uuid.uuid4()), filename)

zip_contents = _create_structures_dict(target,
proteins,
site_observations,
protein_params,
other_params)

Expand All @@ -780,6 +797,8 @@ def check_download_links(request,
original_search,
host)

logger.debug('zip_contents: %s', zip_contents)

download_link = DownloadLinks()
download_link.file_url = file_url
if request.user.is_authenticated:
Expand Down
12 changes: 6 additions & 6 deletions viewer/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,16 +745,16 @@ class Meta:
class DownloadStructuresSerializer(serializers.Serializer):
target_name = serializers.CharField(max_length=200)
proteins = serializers.CharField(max_length=5000)
pdb_info = serializers.BooleanField(default=False)
bound_info = serializers.BooleanField(default=False)
apo_file = serializers.BooleanField(default=False)
bound_file = serializers.BooleanField(default=False)
cif_info = serializers.BooleanField(default=False)
mtz_info = serializers.BooleanField(default=False)
diff_info = serializers.BooleanField(default=False)
event_info = serializers.BooleanField(default=False)
sigmaa_info = serializers.BooleanField(default=False)
# diff_info = serializers.BooleanField(default=False)
event_file = serializers.BooleanField(default=False)
# sigmaa_info = serializers.BooleanField(default=False)
sdf_info = serializers.BooleanField(default=False)
single_sdf_file = serializers.BooleanField(default=False)
trans_matrix_info = serializers.BooleanField(default=False)
# trans_matrix_info = serializers.BooleanField(default=False)
metadata_info = serializers.BooleanField(default=False)
smiles_info = serializers.BooleanField(default=False)
static_link = serializers.BooleanField(default=False)
Expand Down
1 change: 1 addition & 0 deletions viewer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def add_prop_to_mol(mol_field, mol_file_out, value):
Chem.MolToMolFile(rd_mol, mol_file_out)


# TODO: this method may be deprecated, not an issue with new uploads
def clean_filename(filepath):
"""Return the "clean" version of a Django filename without the '_abcdefg_' that is
created when a file is overwritten.
Expand Down

0 comments on commit 323a234

Please sign in to comment.