Skip to content

Commit

Permalink
New format to download zip (issue 1326) (#530)
Browse files Browse the repository at this point in the history
* stashing

* stashing

* feat: download structure fixed

TODO: add all the yamls

* All yaml files added to download
  • Loading branch information
kaliif authored Feb 15, 2024
1 parent 654f87c commit 417126f
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 38 deletions.
179 changes: 142 additions & 37 deletions viewer/download_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import shutil
import uuid
import zipfile
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from io import BytesIO
from pathlib import Path
Expand Down Expand Up @@ -49,6 +50,13 @@
'readme': (''),
}


@dataclass(frozen=True)
class ArchiveFile:
path: str
archive_path: str


# Dictionary containing all references needed to create the zip file
# NB you may need to add a version number to this at some point...
zip_template = {
Expand Down Expand Up @@ -216,7 +224,7 @@ def _read_and_patch_molecule_name(path, molecule_name=None):
return content


def _add_file_to_zip_aligned(ziparchive, code, filepath):
def _add_file_to_zip_aligned(ziparchive, code, archive_file):
"""Add the requested file to the zip archive.
If the file is an SDF or MOL we insert the name of the molecule
Expand All @@ -230,39 +238,32 @@ def _add_file_to_zip_aligned(ziparchive, code, filepath):
Returns:
[boolean]: [True of record added to archive]
"""
logger.debug('+_add_file_to_zip_aligned: %s, %s', code, filepath)
if not filepath:
logger.debug('+_add_file_to_zip_aligned: %s, %s', code, archive_file)
if not archive_file:
# Odd - assume success
logger.error('No filepath value')
return True

# Incoming filepath can be both str and FieldFile
try:
filepath = filepath.path
except AttributeError:
filepath = str(Path(settings.MEDIA_ROOT).joinpath(filepath))

# strip off the leading parts of path
archive_path = str(Path(*Path(filepath).parts[7:]))
filepath = str(Path(settings.MEDIA_ROOT).joinpath(archive_file.path))
if Path(filepath).is_file():
if _is_mol_or_sdf(filepath):
# It's a MOL or SD file.
# Read and (potentially) adjust the file
# and add to the archive as a string.
content = _read_and_patch_molecule_name(filepath, molecule_name=code)
ziparchive.writestr(archive_path, content)
ziparchive.writestr(archive_file.archive_path, content)
else:
# Copy the file without modification
ziparchive.write(filepath, archive_path)
ziparchive.write(filepath, archive_file.archive_path)
return True
else:
logger.warning('filepath "%s" is not a file', filepath)
_add_empty_file(ziparchive, archive_path)
_add_empty_file(ziparchive, archive_file.archive_path)

return False


def _add_file_to_sdf(combined_sdf_file, filepath):
def _add_file_to_sdf(combined_sdf_file, archive_file):
"""Append the requested sdf file to the single sdf file provided.
Args:
Expand All @@ -274,19 +275,19 @@ def _add_file_to_sdf(combined_sdf_file, filepath):
"""
media_root = settings.MEDIA_ROOT

if not filepath:
if not archive_file.path:
# Odd - assume success
logger.error('No filepath value')
return True

fullpath = os.path.join(media_root, filepath)
fullpath = os.path.join(media_root, archive_file.path)
if os.path.isfile(fullpath):
with open(combined_sdf_file, 'a', encoding='utf-8') as f_out:
patched_sdf_content = _read_and_patch_molecule_name(fullpath)
f_out.write(patched_sdf_content)
return True
else:
logger.warning('filepath "%s" is not a file', filepath)
logger.warning('filepath "%s" is not a file', archive_file.path)

return False

Expand All @@ -301,11 +302,8 @@ def _protein_files_zip(zip_contents, ziparchive, error_file):
continue

for prot, prot_file in files.items():
# if it's a list of files (map_info) instead of single file
if not isinstance(prot_file, list):
prot_file = [prot_file]
for f in prot_file:
if not _add_file_to_zip_aligned(ziparchive, prot.split(":")[0], f):
if not _add_file_to_zip_aligned(ziparchive, prot, f):
error_file.write(f'{param},{prot},{f}\n')
prot_errors += 1

Expand Down Expand Up @@ -333,14 +331,14 @@ def _molecule_files_zip(zip_contents, ziparchive, combined_sdf_file, error_file)
] is True and not _add_file_to_zip_aligned(
ziparchive, prot.split(":")[0], file
):
error_file.write(f'sdf_info,{prot},{file}\n')
error_file.write(f'sdf_info,{prot},{file.path}\n')
mol_errors += 1

# Append sdf file on the Molecule record to the combined_sdf_file.
if zip_contents['molecules'][
'single_sdf_file'
] is True and not _add_file_to_sdf(combined_sdf_file, file):
error_file.write(f'single_sdf_file,{prot},{file}\n')
error_file.write(f'single_sdf_file,{prot},{file.path}\n')
mol_errors += 1

return mol_errors
Expand Down Expand Up @@ -448,6 +446,46 @@ def _extra_files_zip(ziparchive, target):
logger.info('Processed %s extra files', num_processed)


def _yaml_files_zip(ziparchive, target):
"""Add all yaml files (except transforms) from upload to ziparchive"""

for experiment_upload in target.experimentupload_set.order_by('commit_datetime'):
yaml_paths = (
Path(settings.MEDIA_ROOT)
.joinpath(settings.TARGET_LOADER_MEDIA_DIRECTORY)
.joinpath(experiment_upload.task_id)
)

transforms = [
Path(f.name).name
for f in (
experiment_upload.neighbourhood_transforms,
experiment_upload.neighbourhood_transforms,
experiment_upload.neighbourhood_transforms,
)
]
# taking the latest upload for now
# add unpacked zip directory
yaml_paths = [d for d in list(yaml_paths.glob("*")) if d.is_dir()][0]

# add upload_[d] dir
yaml_paths = next(yaml_paths.glob("upload_*"))

archive_path = Path('yaml_files').joinpath(yaml_paths.parts[-1])

yaml_files = [
f
for f in list(yaml_paths.glob("*.yaml"))
if f.is_file() and f.name not in transforms
]

logger.info('Processing yaml files (%s)...', yaml_files)

for file in yaml_files:
logger.info('Adding yaml file "%s"...', file)
ziparchive.write(file, str(Path(archive_path).joinpath(file.name)))


def _document_file_zip(ziparchive, download_path, original_search, host):
"""Create the document file
This consists of a template plus an added contents description.
Expand Down Expand Up @@ -583,6 +621,8 @@ def _create_structures_zip(target, zip_contents, file_url, original_search, host

_extra_files_zip(ziparchive, target)

_yaml_files_zip(ziparchive, target)

_document_file_zip(ziparchive, download_path, original_search, host)

error_file.close()
Expand Down Expand Up @@ -625,21 +665,79 @@ def _create_structures_dict(target, site_obvs, protein_params, other_params):
for so in site_obvs:
for param in protein_params:
if protein_params[param] is True:
try:
# getting the param from experiment. more data are
# coming from there, that's why this is in try
# block
if param in ['pdb_info', 'mtz_info', 'cif_info', 'map_info']:
# experiment object
model_attr = getattr(so.experiment, param)
# getattr retrieves FieldFile object, hence the .name
if isinstance(model_attr, list):
# except map_files, this returns a list of files
zip_contents['proteins'][param][so.code] = model_attr
logger.debug(
'Adding param to zip: %s, value: %s', param, model_attr
)
if param != 'map_info':
# treat all params as list
model_attr = (
[model_attr.name]
# None - some weird glitch in storing the values
if model_attr and not str(model_attr).find('None') > -1
else [param]
)

afile = []
for f in model_attr:
# here the model_attr is already stringified
if model_attr and model_attr != 'None':
archive_path = str(
Path('crystallographic_files')
.joinpath(so.code)
.joinpath(
Path(f)
.parts[-1]
.replace(so.experiment.code, so.code)
)
)
else:
archive_path = param
afile.append(ArchiveFile(path=f, archive_path=archive_path))

elif param in [
'bound_file',
'apo_solv_file',
'apo_desolv_file',
'apo_file',
'sigmaa_file',
'event_file',
'artefacts_file',
'pdb_header_file',
'diff_file',
]:
# siteobservation object

model_attr = getattr(so, param)
logger.debug(
'Adding param to zip: %s, value: %s', param, model_attr
)
if model_attr and model_attr != 'None':
archive_path = str(
Path('aligned_files')
.joinpath(so.code)
.joinpath(
Path(model_attr.name)
.parts[-1]
.replace(so.longcode, so.code)
)
)
else:
zip_contents['proteins'][param][so.code] = model_attr.name
archive_path = param

afile = [
ArchiveFile(
path=model_attr.name,
archive_path=archive_path,
)
]
else:
logger.warning('Unexpected param: %s', param)
continue

except AttributeError:
# on the off chance that the data are in site_observation model
zip_contents['proteins'][param][so.code] = getattr(so, param).name
zip_contents['proteins'][param][so.code] = afile

if other_params['single_sdf_file'] is True:
zip_contents['molecules']['single_sdf_file'] = True
Expand All @@ -666,7 +764,14 @@ def _create_structures_dict(target, site_obvs, protein_params, other_params):

if rel_sd_file:
logger.debug('rel_sd_file=%s code=%s', rel_sd_file, so.code)
zip_contents['molecules']['sdf_files'].update({rel_sd_file: so.code})
zip_contents['molecules']['sdf_files'].update(
{
ArchiveFile(
path=rel_sd_file,
archive_path=rel_sd_file,
): so.code
}
)
num_molecules_collected += 1

# Report (in the log) anomalies
Expand Down
2 changes: 1 addition & 1 deletion viewer/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -1492,7 +1492,7 @@ def create(self, request):
# prot = models.Protein.objects.filter(code__contains=code_first_part).values()
# I don't see why I need to drop out of django objects here
prot = models.SiteObservation.objects.filter(
code__contains=code_first_part
experiment__experiment_upload__target=target, code=code_first_part
)
if prot.exists():
# even more than just django object, I need an
Expand Down

0 comments on commit 417126f

Please sign in to comment.