diff --git a/docker-compose.celery.yml b/docker-compose.celery.yml index fe8d6875..179fc651 100644 --- a/docker-compose.celery.yml +++ b/docker-compose.celery.yml @@ -38,6 +38,43 @@ services: - .env image: ${BE_NAMESPACE:-xchem}/fragalysis-backend:${BE_IMAGE_TAG:-latest} restart: on-failure + volumes: + - ./data/logs:/code/logs/ + - ./data/media:/code/media/ + - .:/code/ + environment: + AUTHENTICATE_UPLOAD: ${AUTHENTICATE_UPLOAD:-True} + DEPLOYMENT_MODE: 'development' + POSTGRESQL_USER: postgres + # Comma-separated dforced errors (infections?) + INFECTIONS: '' + # Celery tasks need to run synchronously + CELERY_TASK_ALWAYS_EAGER: 'False' + # Error reporting and default/root log-level + FRAGALYSIS_BACKEND_SENTRY_DNS: ${FRAGALYSIS_BACKEND_SENTRY_DNS} + LOGGING_FRAMEWORK_ROOT_LEVEL: ${LOGGING_FRAMEWORK_ROOT_LEVEL:-INFO} + # Keycloak configuration + OIDC_KEYCLOAK_REALM: ${OIDC_KEYCLOAK_REALM} + OIDC_RP_CLIENT_ID: ${OIDC_RP_CLIENT_ID:-fragalysis-local} + OIDC_RP_CLIENT_SECRET: ${OIDC_RP_CLIENT_SECRET} + OIDC_AS_CLIENT_ID: ${OIDC_AS_CLIENT_ID:-account-server-api} + OIDC_DM_CLIENT_ID: ${OIDC_DM_CLIENT_ID:-data-manager-api} + OIDC_RENEW_ID_TOKEN_EXPIRY_MINUTES: '210' + # Public target access strings? + # A comma-separated list of Project titles. + PUBLIC_TAS: ${PUBLIC_TAS:-lb18145-1} + # Squonk configuration + SQUONK2_VERIFY_CERTIFICATES: 'No' + SQUONK2_UNIT_BILLING_DAY: 3 + SQUONK2_PRODUCT_FLAVOUR: BRONZE + SQUONK2_SLUG: fs-local + SQUONK2_ORG_OWNER: ${SQUONK2_ORG_OWNER} + SQUONK2_ORG_OWNER_PASSWORD: ${SQUONK2_ORG_OWNER_PASSWORD} + SQUONK2_ORG_UUID: ${SQUONK2_ORG_UUID} + SQUONK2_UI_URL: ${SQUONK2_UI_URL} + SQUONK2_DMAPI_URL: ${SQUONK2_DMAPI_URL} + SQUONK2_ASAPI_URL: ${SQUONK2_ASAPI_URL} + PROXY_FORWARDED_PROTO_HEADER: ${PROXY_FORWARDED_PROTO_HEADER:-http} celery_beat: command: sh -c "celery -A fragalysis beat -l info --scheduler django_celery_beat.schedulers:DatabaseScheduler" diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py index 8a4264bf..8030495e 100644 --- a/viewer/cset_upload.py +++ b/viewer/cset_upload.py @@ -15,6 +15,7 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "fragalysis.settings") import django +from django.db import IntegrityError, transaction django.setup() @@ -40,6 +41,8 @@ ) from viewer.utils import add_props_to_sdf_molecule, alphanumerator, is_url, word_count +from .sdf_check import add_warning + logger = logging.getLogger(__name__) @@ -47,62 +50,62 @@ _DIST_LIMIT = 0.5 EMPTY_VALUES = ( - 'nan', - '', + "nan", + "", None, np.nan, ) HEADER_MOL_FIELDS = ( - 'ref_url', - 'method', - 'submitter_name', - 'submitter_institution', - 'submitter_email', - 'generation_date', + "ref_url", + "method", + "submitter_name", + "submitter_institution", + "submitter_email", + "generation_date", ) def dataType(a_str: str) -> str: lean_str = a_str.strip() if not lean_str: - return 'BLANK' + return "BLANK" try: t = ast.literal_eval(lean_str) except (ValueError, SyntaxError): - return 'TEXT' + return "TEXT" else: if type(t) in [int, int, float, bool]: if t in [ True, False, - 'TRUE', - 'FALSE', - 'true', - 'false', - 'yes', - 'no', - 'YES', - 'NO', - 'Yes', - 'No', + "TRUE", + "FALSE", + "true", + "false", + "yes", + "no", + "YES", + "NO", + "Yes", + "No", "Y", "N", "y", "n", ]: - return 'BIT' + return "BIT" if type(t) is int or type(t) is int: - return 'INT' + return "INT" if type(t) is float: - return 'FLOAT' + return "FLOAT" # Can't get here? assert False else: - return 'TEXT' + return "TEXT" class PdbOps: @@ -118,10 +121,10 @@ def save_pdb_zip( zfile_hashvals = {} for filename in zip_lst: # only handle pdb files - if filename.split('.')[-1] == 'pdb': + if filename.split(".")[-1] == "pdb": # Test if Protein object already exists - code = filename.split('/')[-1].replace('.pdb', '') - test_pdb_code = filename.split('/')[-1].replace('.pdb', '') + code = filename.split("/")[-1].replace(".pdb", "") + test_pdb_code = filename.split("/")[-1].replace(".pdb", "") test_site_obvs_objs = SiteObservation.objects.filter( code=test_pdb_code ) @@ -129,12 +132,12 @@ def save_pdb_zip( if len(test_site_obvs_objs) != 0: # make a unique pdb code as not to overwrite existing object rand_str = uuid.uuid4().hex - test_pdb_code = f'{code}#{rand_str}' + test_pdb_code = f"{code}#{rand_str}" zfile_hashvals[code] = rand_str - fn = f'{test_pdb_code}.pdb' + fn = f"{test_pdb_code}.pdb" pdb_path = default_storage.save( - f'tmp/{fn}', ContentFile(zf.read(filename)) + f"tmp/{fn}", ContentFile(zf.read(filename)) ) zfile[test_pdb_code] = pdb_path @@ -145,7 +148,7 @@ def save_pdb_zip( return zfile, zfile_hashvals def run(self, params) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, str]]]: - return self.save_pdb_zip(params['pdb_zip']) + return self.save_pdb_zip(params["pdb_zip"]) class MolOps: @@ -171,10 +174,17 @@ def __init__( self.zfile_hashvals = zfile_hashvals self.computed_set_name = computed_set_name + # using the same mechanism to pass messages as validation + self.messages: dict[str, Any] = { + "molecule_name": [], + "field": [], + "warning_string": [], + } + def process_pdb(self, pdb_code, zfile, zfile_hashvals) -> str | None: for key in zfile_hashvals.keys(): if key == pdb_code: - pdb_code = f'{pdb_code}#{zfile_hashvals[pdb_code]}' + pdb_code = f"{pdb_code}#{zfile_hashvals[pdb_code]}" try: pdb_fp = zfile[pdb_code] @@ -182,7 +192,7 @@ def process_pdb(self, pdb_code, zfile, zfile_hashvals) -> str | None: return None # ensure filename uniqueness - pdb_fn = '_'.join([zfile[pdb_code].split('/')[-1], uuid.uuid4().hex]) + pdb_fn = "_".join([zfile[pdb_code].split("/")[-1], uuid.uuid4().hex]) pdb_field = Path(settings.COMPUTED_SET_MEDIA_DIRECTORY).joinpath(pdb_fn) new_filename = Path(settings.MEDIA_ROOT).joinpath(pdb_field) @@ -214,11 +224,11 @@ def get_site_observation( ) return None - pdb_fn = mol.GetProp(property_name).split('/')[-1] + pdb_fn = mol.GetProp(property_name).split("/")[-1] if zfile: # pdb archive uploaded. referenced pdb file may or may not be included - pdb_code = pdb_fn.replace('.pdb', '') + pdb_code = pdb_fn.replace(".pdb", "") pdb_file = self.process_pdb( pdb_code=pdb_code, zfile=zfile, @@ -228,7 +238,7 @@ def get_site_observation( return pdb_file else: logger.info( - 'No protein pdb (%s) found in zipfile', + "No protein pdb (%s) found in zipfile", pdb_fn, ) @@ -243,7 +253,7 @@ def get_site_observation( except SiteObservation.DoesNotExist: # Initial SiteObservation lookup failed. logger.warning( - 'Failed to get SiteObservation object (target=%s name=%s)', + "Failed to get SiteObservation object (target=%s name=%s)", compound_set.target.title, name, ) @@ -255,25 +265,25 @@ def get_site_observation( ) if qs.exists(): logger.info( - 'Found SiteObservation containing name=%s qs=%s', + "Found SiteObservation containing name=%s qs=%s", name, qs, ) else: - alt_name = name.split(':')[0].split('_')[0] + alt_name = name.split(":")[0].split("_")[0] qs = SiteObservation.objects.filter( code__contains=alt_name, experiment__experiment_upload__target__pk=target, ) if qs.exists(): logger.info( - 'Found SiteObservation containing alternative name=%s qs=%s', + "Found SiteObservation containing alternative name=%s qs=%s", alt_name, qs, ) if qs.count() > 0: logger.debug( - 'Found alternative (target=%s name=%s)', + "Found alternative (target=%s name=%s)", compound_set.target.title, name, ) @@ -281,7 +291,7 @@ def get_site_observation( if not site_obvs: logger.warning( - 'No SiteObservation found (target=%s pdb_fn=%s)', + "No SiteObservation found (target=%s pdb_fn=%s)", compound_set.target.title, pdb_fn, ) @@ -299,7 +309,7 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]: qs = Compound.objects.filter( computedmolecule__computed_set__target=target, ) - cpd_number = '1' + cpd_number = "1" try: # NB! Max said there could be thousands of compounds per # target so this distinct() here may become a problem @@ -328,23 +338,17 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]: # occur. However there's nothing in the db to prevent # this, so adding a catch clause and writing a meaningful # message - logger.error( - 'Duplicate compounds for target %s with inchi key %s.', - target.title, - inchi_key, - ) - raise MultipleObjectsReturned from exc + msg = f"Duplicate compounds for target {target.title} with inchi key {inchi_key}." + logger.error(msg) + raise IntegrityError(msg) from exc return cpd, cpd_number def set_props(self, cpd, props, score_descriptions) -> List[ScoreDescription]: - if 'ref_mols' and 'ref_pdb' not in list(props.keys()): - raise Exception('ref_mols and ref_pdb not set!') - for sd, val in score_descriptions.items(): - logger.debug('sd: %s', sd) - logger.debug('sd.name, val: %s: %s', sd.name, val) - if dataType(str(props[sd.name])) == 'TEXT': + logger.debug("sd: %s", sd) + logger.debug("sd.name, val: %s: %s", sd.name, val) + if dataType(str(props[sd.name])) == "TEXT": score_value = TextScoreValues() else: score_value = NumericalScoreValues() @@ -369,7 +373,7 @@ def set_mol( smiles = Chem.MolToSmiles(mol) inchi = Chem.inchi.MolToInchi(mol) - molecule_name = mol.GetProp('_Name') + molecule_name = mol.GetProp("_Name") flattened_copy = copy.deepcopy(mol) Chem.RemoveStereochemistry(mol) @@ -379,8 +383,8 @@ def set_mol( inchi, compound_set.target, name=molecule_name ) - insp = mol.GetProp('ref_mols') - insp = insp.split(',') + insp = mol.GetProp("ref_mols") + insp = insp.split(",") insp = [i.strip() for i in insp] insp_frags = [] for i in insp: @@ -393,19 +397,19 @@ def set_mol( ref = site_obvs except SiteObservation.DoesNotExist: qs = SiteObservation.objects.filter( - code=str(i.split(':')[0].split('_')[0]), + code=str(i.split(":")[0].split("_")[0]), experiment__experiment_upload__target=compound_set.target, ) if not qs.exists(): - raise Exception( # pylint: disable=raise-missing-from - 'No matching molecules found for inspiration frag ' + i + raise IntegrityError( # pylint: disable=raise-missing-from + "No matching molecules found for inspiration frag " + i ) - ref = qs.order_by('-cmpd_id').first() + ref = qs.order_by("-cmpd_id").first() insp_frags.append(ref) - ref_property = 'ref_pdb' + ref_property = "ref_pdb" # data in ref ref_pdb field may be one of 2 things: # - siteobservation's short code (code field) # - pdb file in uploaded zipfile @@ -419,7 +423,7 @@ def set_mol( ) if not ref_so: logger.warning( - 'Failed to get a Reference SiteObservation (%s) for %s, %s, %s', + "Failed to get a Reference SiteObservation (%s) for %s, %s, %s", ref_property, mol, target, @@ -431,27 +435,29 @@ def set_mol( # I think, realistically, I only need to check compound # update: I used to annotate name components, with the new - # format, this is not necessary. or possible fmt: off + # format, this is not necessary. or possible qs = ComputedMolecule.objects.filter( compound=compound, - ).order_by('name') + ).order_by("name") if qs.exists(): # not actually latest, just last according to sorting above latest = qs.last() # regex pattern - split name like 'v1a' # ('(letters)(digits)(letters)' to components - groups = re.search(r'()(\d+)(\D+)', qs.last().name) + groups = re.search(r"()(\d+)(\D+)", qs.last().name) if groups is None or len(groups.groups()) != 3: # just a quick sanity check - raise ValueError(f'Non-standard ComputedMolecule.name: {latest.name}') + raise IntegrityError( + f"Non-standard ComputedMolecule.name: {latest.name}" + ) number = groups.groups()[1] # type: ignore [index] suffix = next(alphanumerator(start_from=groups.groups()[2])) # type: ignore [index] else: - suffix = 'a' + suffix = "a" # number = 1 - name = f'v{number}{suffix}' + name = f"v{number}{suffix}" existing_computed_molecules = [] for k in qs: @@ -471,11 +477,11 @@ def set_mol( ) except RuntimeError as exc: msg = ( - f'Failed to find alignment between {k.molecule_name} ' + f"Failed to find alignment between {k.molecule_name} " + f'and {mol.GetProp("original ID")}' ) logger.error(msg) - raise RuntimeError(msg) from exc + raise IntegrityError(msg) from exc molconf = mol.GetConformer() kmolconf = kmol.GetConformer() @@ -492,18 +498,18 @@ def set_mol( if len(existing_computed_molecules) == 1: logger.warning( - 'Using existing ComputedMolecule %s and overwriting its metadata', + "Using existing ComputedMolecule %s and overwriting its metadata", existing_computed_molecules[0], ) computed_molecule = existing_computed_molecules[0] elif len(existing_computed_molecules) > 1: - logger.warning('Deleting existing ComputedMolecules (more than 1 found') + logger.warning("Deleting existing ComputedMolecules (more than 1 found") for exist in existing_computed_molecules: - logger.info('Deleting ComputedMolecule %s', exist) + logger.info("Deleting ComputedMolecule %s", exist) exist.delete() computed_molecule = ComputedMolecule(name=name) else: - logger.info('Creating new ComputedMolecule') + logger.info("Creating new ComputedMolecule") computed_molecule = ComputedMolecule(name=name) if isinstance(ref_so, SiteObservation): @@ -531,11 +537,11 @@ def set_mol( # Extract possible reference URL and Rationale # URLs have to be valid URLs and rationals must contain more than one word ref_url: Optional[str] = ( - mol.GetProp('ref_url') if mol.HasProp('ref_url') else None + mol.GetProp("ref_url") if mol.HasProp("ref_url") else None ) computed_molecule.ref_url = ref_url if is_url(ref_url) else None rationale: Optional[str] = ( - mol.GetProp('rationale') if mol.HasProp('rationale') else None + mol.GetProp("rationale") if mol.HasProp("rationale") else None ) computed_molecule.rationale = rationale if word_count(rationale) > 1 else None # To avoid the error... @@ -561,18 +567,19 @@ def set_mol( return computed_molecule def get_submission_info(self, description_mol) -> ComputedSetSubmitter: - datestring = description_mol.GetProp('generation_date') + datestring = description_mol.GetProp("generation_date") try: date = parse(datestring, dayfirst=True) except ValueError as exc: - logger.error('"%s" is not a valid date', datestring) - raise ValueError from exc + msg = f'"{datestring}" is not a valid date' + logger.error(msg) + raise IntegrityError(msg) from exc return ComputedSetSubmitter.objects.get_or_create( - name=description_mol.GetProp('submitter_name'), - method=description_mol.GetProp('method'), - email=description_mol.GetProp('submitter_email'), - institution=description_mol.GetProp('submitter_institution'), + name=description_mol.GetProp("submitter_name"), + method=description_mol.GetProp("method"), + email=description_mol.GetProp("submitter_email"), + institution=description_mol.GetProp("submitter_institution"), generation_date=date, )[0] @@ -585,10 +592,55 @@ def process_mol( score_descriptions, zfile=None, zfile_hashvals=None, - ) -> List[ScoreDescription]: - cpd = self.set_mol(mol, target, compound_set, filename, zfile, zfile_hashvals) + ) -> None: + molecule_name = mol.GetProp("_Name") + logger.debug("+ process_mol %s", molecule_name) + other_props = mol.GetPropsAsDict() - return self.set_props(cpd, other_props, score_descriptions) + skip_mol = False + + # if ref_mols or ref_pdb is missing skip the molecule + for prop in ["ref_mols", "ref_pdb"]: + if prop not in other_props.keys(): + self.messages = add_warning( + molecule_name=molecule_name, + field=prop, + warning_string=f"Property {prop} missing. Skipping molecule!", + validate_dict=self.messages, + ) + skip_mol = True + elif other_props[prop] in EMPTY_VALUES: + self.messages = add_warning( + molecule_name=molecule_name, + field=prop, + warning_string=f"Property {prop} undefined. Skipping molecule!", + validate_dict=self.messages, + ) + skip_mol = True + + # if any header mol fields are defined on non-header molecules those values are ignored and a warning shown + for prop in HEADER_MOL_FIELDS: + if prop not in other_props.keys(): + # non-header molecules don't need header fields + continue + + if other_props[prop] not in EMPTY_VALUES: + # header fields in non-header molecules have values ignored + self.messages = add_warning( + molecule_name=molecule_name, + field=prop, + warning_string=f"Property {prop} value {other_props[prop]} ignored.", + validate_dict=self.messages, + ) + + # get rid of the header field property on the non-header molecule + del other_props[prop] + + if not skip_mol: + cpd = self.set_mol( + mol, target, compound_set, filename, zfile, zfile_hashvals + ) + self.set_props(cpd, other_props, score_descriptions) def set_descriptions( self, filename, computed_set: ComputedSet @@ -606,23 +658,23 @@ def set_descriptions( ) computed_set.submitter = self.get_submission_info(description_mol) - if description_mol.HasProp('ref_url'): - computed_set.method_url = description_mol.GetProp('ref_url') + if description_mol.HasProp("ref_url"): + computed_set.method_url = description_mol.GetProp("ref_url") computed_set.save() description_dict = description_mol.GetPropsAsDict() - logger.debug('index mol original values: %s', description_dict) + logger.debug("index mol original values: %s", description_dict) # score descriptions for this upload, doesn't matter if # created or existing score_descriptions = {} errors = [] for key in description_dict.keys(): if key in descriptions_needed and key not in [ - 'ref_mols', - 'ref_pdb', - 'index', - 'Name', - 'original SMILES', + "ref_mols", + "ref_pdb", + "index", + "Name", + "original SMILES", ]: description, _ = ScoreDescription.objects.get_or_create( computed_set=computed_set, @@ -634,132 +686,149 @@ def set_descriptions( if key in HEADER_MOL_FIELDS: if value in EMPTY_VALUES: - msg = f'Empty value for {key} in header molecule' + msg = f"Empty value for {key} in header molecule" errors.append(msg) logger.error(msg) - if key == 'submitter_email': + if key == "submitter_email": try: validate_email(value) - except ValidationError as exc: + except ValidationError: msg = f'"{value}" is not a valid email' logger.error(msg) errors.append(msg) - raise ValidationError(msg) from exc score_descriptions[description] = value - logger.debug('index mol values: %s', score_descriptions.values()) + logger.debug("index mol values: %s", score_descriptions.values()) if errors: - raise ValueError(errors) + raise IntegrityError(errors) return mols, score_descriptions - def task(self) -> ComputedSet: + def task(self) -> tuple[ComputedSet, dict]: # Truncate submitted method (lower-case)? - truncated_submitter_method: str = 'unspecified' - if self.submitter_method: - truncated_submitter_method = self.submitter_method[ - : ComputedSet.LENGTH_METHOD_IN_NAME - ] - if len(self.submitter_method) > len(truncated_submitter_method): - logger.warning( - 'ComputedSet submitter method is too long (%s). Truncated to "%s"', - self.submitter_method, - truncated_submitter_method, - ) - else: - logger.warning( - 'ComputedSet submitter method is not set. Using "%s"', - truncated_submitter_method, - ) - - # Do we have any existing ComputedSets? + truncated_submitter_method: str = "unspecified" try: - computed_set = ComputedSet.objects.get(name=self.computed_set_name) - # refresh some attributes - computed_set.md_ordinal = F('md_ordinal') + 1 - computed_set.upload_date = datetime.date.today() - computed_set.save() - except ComputedSet.DoesNotExist: - # no, create new + with transaction.atomic(): + if self.submitter_method: + truncated_submitter_method = self.submitter_method[ + : ComputedSet.LENGTH_METHOD_IN_NAME + ] + if len(self.submitter_method) > len(truncated_submitter_method): + logger.warning( + 'ComputedSet submitter method is too long (%s). Truncated to "%s"', + self.submitter_method, + truncated_submitter_method, + ) + else: + logger.warning( + 'ComputedSet submitter method is not set. Using "%s"', + truncated_submitter_method, + ) - today: datetime.date = datetime.date.today() - new_ordinal: int = 1 + # Do we have any existing ComputedSets? + try: + computed_set = ComputedSet.objects.get(name=self.computed_set_name) + # refresh some attributes + computed_set.md_ordinal = F("md_ordinal") + 1 + computed_set.upload_date = datetime.date.today() + computed_set.save() + except ComputedSet.DoesNotExist: + # no, create new + + today: datetime.date = datetime.date.today() + new_ordinal: int = 1 + + try: + target = Target.objects.get(pk=self.target_id) + except Target.DoesNotExist as exc: + # target's existance should be validated in the view, + # this could hardly happen + msg = f"Target {self.target_id} does not exist" + logger.error(msg) + raise IntegrityError(msg) from exc + + cs_name: str = ( + f"{truncated_submitter_method}-{str(today)}-" + + f"{get_column_letter(new_ordinal)}" + ) + logger.info('Creating new ComputedSet "%s"', cs_name) + + computed_set = ComputedSet( + name=cs_name, + md_ordinal=new_ordinal, + upload_date=today, + method=self.submitter_method[: ComputedSet.LENGTH_METHOD], + target=target, + spec_version=float(self.version.strip("ver_")), + ) + if self.user_id: + try: + computed_set.owner_user = User.objects.get(id=self.user_id) + except User.DoesNotExist as exc: + msg = f"User {self.user_id} does not exist" + logger.error(msg) + raise IntegrityError(msg) from exc + else: + # The User ID may only be None if AUTHENTICATE_UPLOAD is False. + # Here the ComputedSet owner will take on a default (anonymous) value. + assert settings.AUTHENTICATE_UPLOAD is False + + computed_set.save() + + # Set descriptions in return for the Molecules. + # This also sets the submitter and method URL properties of the computed set + # while also saving it. + sdf_filename = str(self.sdf_filename) + mols_to_process, score_descriptions = self.set_descriptions( + filename=sdf_filename, computed_set=computed_set + ) + + # Process the molecules + logger.info("%s mols_to_process=%s", computed_set, len(mols_to_process)) + for i in range(len(mols_to_process)): + logger.debug( + "processing mol %s: %s", i, mols_to_process[i].GetProp("_Name") + ) + self.process_mol( + mols_to_process[i], + self.target_id, + computed_set, + sdf_filename, + score_descriptions, + self.zfile, + self.zfile_hashvals, + ) + except IntegrityError as exc: + # clean up previously written files. this is not ideal, + # they should be written to a tempdir or something, like + # in target loader. TODO for later try: - target = Target.objects.get(pk=self.target_id) - except Target.DoesNotExist as exc: - # target's existance should be validated in the view, - # this could hardly happen - logger.error('Target %s does not exist', self.target_id) - raise Target.DoesNotExist from exc - - cs_name: str = ( - f'{truncated_submitter_method}-{str(today)}-' - + f'{get_column_letter(new_ordinal)}' - ) - logger.info('Creating new ComputedSet "%s"', cs_name) - - computed_set = ComputedSet( - name=cs_name, - md_ordinal=new_ordinal, - upload_date=today, - method=self.submitter_method[: ComputedSet.LENGTH_METHOD], - target=target, - spec_version=float(self.version.strip('ver_')), - ) - if self.user_id: - try: - computed_set.owner_user = User.objects.get(id=self.user_id) - except User.DoesNotExist as exc: - logger.error('User %s does not exist', self.user_id) - raise User.DoesNotExist from exc + for p in self.zfile.values(): + Path(p).unlink() + except AttributeError: + # zfile is None, nothing to do + pass - else: - # The User ID may only be None if AUTHENTICATE_UPLOAD is False. - # Here the ComputedSet owner will take on a default (anonymous) value. - assert settings.AUTHENTICATE_UPLOAD is False + raise ValueError(exc.args[0]) from exc - computed_set.save() + # assuming no errors, write the files # check compound set folder exists. cmp_set_folder = os.path.join( settings.MEDIA_ROOT, settings.COMPUTED_SET_MEDIA_DIRECTORY ) if not os.path.isdir(cmp_set_folder): - logger.info('Making ComputedSet folder (%s)', cmp_set_folder) + logger.info("Making ComputedSet folder (%s)", cmp_set_folder) os.mkdir(cmp_set_folder) - # Set descriptions in return for the Molecules. - # This also sets the submitter and method URL properties of the computed set - # while also saving it. - sdf_filename = str(self.sdf_filename) - mols_to_process, score_descriptions = self.set_descriptions( - filename=sdf_filename, computed_set=computed_set - ) - - # Process the molecules - logger.info('%s mols_to_process=%s', computed_set, len(mols_to_process)) - for i in range(len(mols_to_process)): - logger.debug( - 'processing mol %s: %s', i, mols_to_process[i].GetProp('_Name') - ) - _ = self.process_mol( - mols_to_process[i], - self.target_id, - computed_set, - sdf_filename, - score_descriptions, - self.zfile, - self.zfile_hashvals, - ) - # move and save the compound set new_filename = ( Path(settings.MEDIA_ROOT) .joinpath(settings.COMPUTED_SET_MEDIA_DIRECTORY) .joinpath( - f'{computed_set.name}_upload_{computed_set.md_ordinal}_{Path(sdf_filename).name}' + f"{computed_set.name}_upload_{computed_set.md_ordinal}_{Path(sdf_filename).name}" ) ) os.rename(sdf_filename, new_filename) @@ -767,9 +836,9 @@ def task(self) -> ComputedSet: computed_set.written_sdf_filename = new_filename computed_set.save() - logger.info('Created %s', computed_set) + logger.info("Created %s", computed_set) - return computed_set + return computed_set, self.messages def blank_mol_vals(sdf_file) -> Tuple[str, str, str]: @@ -778,23 +847,23 @@ def blank_mol_vals(sdf_file) -> Tuple[str, str, str]: """ suppl = Chem.SDMolSupplier(sdf_file) if not suppl: - return '', '', '' + return "", "", "" # print('%d mols detected (including blank mol)' % (len(suppl),)) blank_mol = suppl[0] if not blank_mol: - return '', '', '' + return "", "", "" # Get submitter name/info for passing into upload to get unique name - submitter_name = '' - if blank_mol.HasProp('submitter_name'): - submitter_name = blank_mol.GetProp('submitter_name') + submitter_name = "" + if blank_mol.HasProp("submitter_name"): + submitter_name = blank_mol.GetProp("submitter_name") - submitter_method = '' - if blank_mol.HasProp('method'): - submitter_method = blank_mol.GetProp('method') + submitter_method = "" + if blank_mol.HasProp("method"): + submitter_method = blank_mol.GetProp("method") - version = '' - if blank_mol.HasProp('_Name'): - version = blank_mol.GetProp('_Name') + version = "" + if blank_mol.HasProp("_Name"): + version = blank_mol.GetProp("_Name") return submitter_name, submitter_method, version diff --git a/viewer/migrations/0085_target_alias_order.py b/viewer/migrations/0085_target_alias_order.py new file mode 100644 index 00000000..fe7f0d96 --- /dev/null +++ b/viewer/migrations/0085_target_alias_order.py @@ -0,0 +1,19 @@ +# Generated by Django 3.2.25 on 2024-12-04 16:03 + +import django.contrib.postgres.fields +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('viewer', '0084_auto_20241108_1119'), + ] + + operations = [ + migrations.AddField( + model_name='target', + name='alias_order', + field=django.contrib.postgres.fields.ArrayField(base_field=models.TextField(), null=True, size=None), + ), + ] diff --git a/viewer/models.py b/viewer/models.py index ac01da39..7438be24 100644 --- a/viewer/models.py +++ b/viewer/models.py @@ -1,3 +1,4 @@ +import contextlib import logging import uuid from dataclasses import dataclass @@ -121,6 +122,7 @@ class Target(models.Model): upload_datetime = models.DateTimeField( null=True, help_text='The datetime the upload was completed' ) + alias_order = ArrayField(models.TextField(), null=True) def __str__(self) -> str: return f"{self.title}" @@ -574,6 +576,16 @@ def __repr__(self) -> str: self.cmpd, ) + def get_ligand_mol_file(self): + contents = '' + if self.ligand_mol: + path = Path(settings.MEDIA_ROOT).joinpath(self.ligand_mol.name) + with contextlib.suppress(TypeError, FileNotFoundError): + with open(path, "r", encoding="utf-8") as f: + contents = f.read() + + return contents + class CompoundIdentifierType(models.Model): name = models.TextField(primary_key=True) diff --git a/viewer/sdf_check.py b/viewer/sdf_check.py index aa362239..6b2d96c0 100755 --- a/viewer/sdf_check.py +++ b/viewer/sdf_check.py @@ -237,7 +237,7 @@ def check_blank_prop(blank_mol, validate_dict): # Properties to ignore prop_ignore_list = ['ref_mols', 'ref_pdb'] - for key, value in zip(list(property_dict.keys()), list(property_dict.values())): + for key, value in property_dict.items(): if value == '' and key not in prop_ignore_list: validate_dict = add_warning( molecule_name=blank_mol.GetProp('_Name'), @@ -269,11 +269,12 @@ def check_field_populated(mol, validate_dict): :return: Updates validate dictionary with pass/fail message """ - # Compuslory fields - compulsory_fields = ['ref_pdb', 'ref_mols', 'original SMILES'] + # Compuslory fields (after 1589) + # compulsory_fields = ['ref_pdb', 'ref_mols', 'original SMILES'] + compulsory_fields = ['original SMILES'] property_dict = mol.GetPropsAsDict() - for key, value in zip(list(property_dict.keys()), list(property_dict.values())): + for key, value in property_dict.items(): if value == '' and key in compulsory_fields: validate_dict = add_warning( molecule_name=mol.GetProp('_Name'), diff --git a/viewer/serializers.py b/viewer/serializers.py index e002840c..3242573e 100644 --- a/viewer/serializers.py +++ b/viewer/serializers.py @@ -1,4 +1,3 @@ -import contextlib import logging from pathlib import Path from urllib.parse import urljoin @@ -218,6 +217,7 @@ class Meta: "metadata", "zip_archive", "upload_status", + "alias_order", ) extra_kwargs = { "id": {"read_only": True}, @@ -228,6 +228,7 @@ class Meta: "metadata": {"read_only": True}, "zip_archive": {"read_only": True}, "upload_status": {"read_only": True}, + "alias_order": {"read_only": False}, } @@ -479,18 +480,15 @@ class VectorsSerializer(serializers.ModelSerializer): vectors = serializers.SerializerMethodField() def get_vectors(self, obj): + ligand_mol_file = obj.get_ligand_mol_file() out_data = {} - if obj.ligand_mol_file: + if ligand_mol_file: try: - out_data["3d"] = get_3d_vects_for_mol( - obj.ligand_mol_file, iso_labels=False - ) + out_data["3d"] = get_3d_vects_for_mol(ligand_mol_file, iso_labels=False) # temporary patch except IndexError: - out_data["3d"] = get_3d_vects_for_mol( - obj.ligand_mol_file, iso_labels=True - ) - out_data["indices"] = get_vect_indices_for_mol(obj.ligand_mol_file) + out_data["3d"] = get_3d_vects_for_mol(ligand_mol_file, iso_labels=True) + out_data["indices"] = get_vect_indices_for_mol(ligand_mol_file) return out_data class Meta: @@ -953,14 +951,7 @@ class SiteObservationReadSerializer(serializers.ModelSerializer): ligand_mol_file = serializers.SerializerMethodField() def get_ligand_mol_file(self, obj): - contents = '' - if obj.ligand_mol: - path = Path(settings.MEDIA_ROOT).joinpath(obj.ligand_mol.name) - with contextlib.suppress(TypeError, FileNotFoundError): - with open(path, "r", encoding="utf-8") as f: - contents = f.read() - - return contents + return obj.get_ligand_mol_file() class Meta: model = models.SiteObservation diff --git a/viewer/target_loader.py b/viewer/target_loader.py index 167ed825..e456b41f 100644 --- a/viewer/target_loader.py +++ b/viewer/target_loader.py @@ -2079,13 +2079,20 @@ def import_compound_identifiers(self, alias_file_path): extended_key_cols = key_cols + ["compound_code"] non_idf_cols = extended_key_cols + ["compound_code_update"] - identifiers_from_file = set([k for k in df.columns if k not in non_idf_cols]) + identifiers_from_file = [k for k in df.columns if k not in non_idf_cols] + + # mypy is doing it's thing again + if not self.target.alias_order: # type: ignore[attr-defined] + self.target.alias_order = identifiers_from_file # type: ignore[attr-defined] + self.target.save() # type: ignore[attr-defined] + + identifiers_from_file = set(identifiers_from_file) # type: ignore[assignment] # I think this is a bad idea, but it was explicitly in the spec identifier_types = set( CompoundIdentifierType.objects.values_list("name", flat=True) ) - new_identifiers = identifiers_from_file.difference(identifier_types) + new_identifiers = identifiers_from_file.difference(identifier_types) # type: ignore[attr-defined] for identifier in new_identifiers: CompoundIdentifierType(name=identifier).save() diff --git a/viewer/tasks.py b/viewer/tasks.py index ef2471ec..049358cb 100644 --- a/viewer/tasks.py +++ b/viewer/tasks.py @@ -25,7 +25,7 @@ from .cset_upload import MolOps, PdbOps, blank_mol_vals from .models import ComputedSet, JobFileTransfer, JobRequest, SiteObservation -from .sdf_check import ( +from .sdf_check import ( # check_refmol, add_warning, check_blank_mol_props, check_blank_prop, @@ -33,7 +33,6 @@ check_field_populated, check_mol_props, check_name_characters, - check_refmol, check_SMILES, check_ver_name, ) @@ -103,12 +102,12 @@ def process_compound_set(validate_output): zfile_hashvals=zfile_hashvals, computed_set_name=computed_set_name, ) - compound_set = save_mols.task() + compound_set, process_messages = save_mols.task() logger.info( 'process_compound_set() EXIT (CompoundSet.name="%s")', compound_set.name ) - return 'process', compound_set.name + return 'process', compound_set.name, process_messages @shared_task @@ -310,7 +309,7 @@ def validate_compound_set(task_params): molecule_name = m.GetProp('_Name') validate_dict = check_name_characters(molecule_name, validate_dict) # validate_dict = check_pdb(m, validate_dict, target, zfile) - validate_dict = check_refmol(m, validate_dict, target) + # validate_dict = check_refmol(m, validate_dict, target) validate_dict = check_field_populated(m, validate_dict) validate_dict = check_SMILES(m, validate_dict) diff --git a/viewer/templates/viewer/upload-cset.html b/viewer/templates/viewer/upload-cset.html index 41fa626b..9f51de20 100644 --- a/viewer/templates/viewer/upload-cset.html +++ b/viewer/templates/viewer/upload-cset.html @@ -74,6 +74,7 @@

Compound Set Upload

+

{% if error_message %} @@ -161,8 +162,8 @@

Compound Set Upload

} if (validatedStatus === 'Validated') { - clearTimer('Your files were uploaded! The download links are:'); + clearTimer('Your files were uploaded! The download links are:'); var url_a = response.data.results.cset_download_url; var content = document.getElementById('links'); var a = document.createElement("a"); @@ -182,6 +183,10 @@

Compound Set Upload

b.title = 'protein set'; b.href = url_b; content.appendChild(b); + if (response.data.html) { + var content = document.getElementById('upload_warnings'); + content.innerHTML = response.data.html; + } } var moleculesProcessed = response.data.processed; @@ -220,6 +225,6 @@

Compound Set Upload

progressTitle.innerHTML = message; } - {% endif %} + {% endif %} {% endblock %} diff --git a/viewer/views.py b/viewer/views.py index a5b1cb7b..54fd883e 100644 --- a/viewer/views.py +++ b/viewer/views.py @@ -717,7 +717,7 @@ def get(self, request, validate_task_id): ): return Response( { - 'error': "You are not a member of the proposal {project_name}" + 'error': f"You are not a member of the proposal {project_name}" }, status=status.HTTP_403_FORBIDDEN, ) @@ -840,14 +840,63 @@ def get(self, request, upload_task_id): response_data['html'] = html_table return JsonResponse(response_data) + + elif results[0] == 'process': + # section added to pass warnings to user after successful processing + logger.debug('processed warnings: %s', results[2]) + # Upload/Update output tasks send back a tuple + response_data['results'] = {} + response_data['validated'] = 'Validated' + cset_name = results[1] + cset = models.ComputedSet.objects.get(name=cset_name) + + if ( + settings.AUTHENTICATE_UPLOAD + and not _ISPYB_SAFE_QUERY_SET.user_is_member_of_target( + request.user, cset.target + ) + ): + # TODO: this will break if wrong user accesses + # it, but with the given flow, this is very + # unlikely to happen + logger.debug('User not recognised') + return JsonResponse( + { + 'error_message': "You are not a member of the Target's proposal" + }, + ) + + name = cset.name + + response_data['results']['cset_download_url'] = ( + '/viewer/compound_set/%s' % name + ) + response_data['results']['pset_download_url'] = ( + '/viewer/protein_set/%s' % name + ) + + process_messages = results[2] + # set pandas options to display all column data + if len(next(iter(process_messages.values()))) > 0: + pd.set_option('display.max_colwidth', -1) + table = pd.DataFrame.from_dict(process_messages) + html_table = '''

Your data was uploaded with warnings

''' + html_table += table.to_html() + response_data['html'] = html_table + + return JsonResponse(response_data) + else: # Upload/Update output tasks send back a tuple response_data['validated'] = 'Validated' cset_name = results[1] cset = models.ComputedSet.objects.get(name=cset_name) - if not _ISPYB_SAFE_QUERY_SET.user_is_member_of_target( - request.user, cset.target + if ( + settings.AUTHENTICATE_UPLOAD + and not _ISPYB_SAFE_QUERY_SET.user_is_member_of_target( + request.user, cset.target + ) ): return Response( {'error': "You are not a member of the Target's proposal"},