From 280d87f046f457600b6ea113553b7c7c7452d7b7 Mon Sep 17 00:00:00 2001
From: Kalev Takkis <ktakkis@informaticsmatters.com>
Date: Thu, 28 Nov 2024 10:41:18 +0000
Subject: [PATCH 1/9] feat: added mechanism to pass warnings to user on
 successful CSET upload

TODO:
- added just the mechanism at the moment, but they're not being populated yet
- actual changes required by this ticket, prevent validation fail on certain attributes

Another changes and random fixes that happened along the way:
- Cset upload now in atomic transaction block (on fail, the db won't be
  polluted with partial data)
- docker-compose.celery.yml now actually works
---
 docker-compose.celery.yml                |  37 +++++
 viewer/cset_upload.py                    | 190 +++++++++++++----------
 viewer/sdf_check.py                      |   4 +-
 viewer/tasks.py                          |   4 +-
 viewer/templates/viewer/upload-cset.html |   9 +-
 viewer/views.py                          |  55 ++++++-
 6 files changed, 205 insertions(+), 94 deletions(-)

diff --git a/docker-compose.celery.yml b/docker-compose.celery.yml
index fe8d6875..179fc651 100644
--- a/docker-compose.celery.yml
+++ b/docker-compose.celery.yml
@@ -38,6 +38,43 @@ services:
       - .env
     image: ${BE_NAMESPACE:-xchem}/fragalysis-backend:${BE_IMAGE_TAG:-latest}
     restart: on-failure
+    volumes:
+    - ./data/logs:/code/logs/
+    - ./data/media:/code/media/
+    - .:/code/
+    environment:
+      AUTHENTICATE_UPLOAD: ${AUTHENTICATE_UPLOAD:-True}
+      DEPLOYMENT_MODE: 'development'
+      POSTGRESQL_USER: postgres
+      # Comma-separated dforced errors (infections?)
+      INFECTIONS: ''
+      # Celery tasks need to run synchronously
+      CELERY_TASK_ALWAYS_EAGER: 'False'
+      # Error reporting and default/root log-level
+      FRAGALYSIS_BACKEND_SENTRY_DNS: ${FRAGALYSIS_BACKEND_SENTRY_DNS}
+      LOGGING_FRAMEWORK_ROOT_LEVEL: ${LOGGING_FRAMEWORK_ROOT_LEVEL:-INFO}
+      # Keycloak configuration
+      OIDC_KEYCLOAK_REALM: ${OIDC_KEYCLOAK_REALM}
+      OIDC_RP_CLIENT_ID: ${OIDC_RP_CLIENT_ID:-fragalysis-local}
+      OIDC_RP_CLIENT_SECRET: ${OIDC_RP_CLIENT_SECRET}
+      OIDC_AS_CLIENT_ID: ${OIDC_AS_CLIENT_ID:-account-server-api}
+      OIDC_DM_CLIENT_ID: ${OIDC_DM_CLIENT_ID:-data-manager-api}
+      OIDC_RENEW_ID_TOKEN_EXPIRY_MINUTES: '210'
+      # Public target access strings?
+      # A comma-separated list of Project titles.
+      PUBLIC_TAS: ${PUBLIC_TAS:-lb18145-1}
+      # Squonk configuration
+      SQUONK2_VERIFY_CERTIFICATES: 'No'
+      SQUONK2_UNIT_BILLING_DAY: 3
+      SQUONK2_PRODUCT_FLAVOUR: BRONZE
+      SQUONK2_SLUG: fs-local
+      SQUONK2_ORG_OWNER: ${SQUONK2_ORG_OWNER}
+      SQUONK2_ORG_OWNER_PASSWORD: ${SQUONK2_ORG_OWNER_PASSWORD}
+      SQUONK2_ORG_UUID: ${SQUONK2_ORG_UUID}
+      SQUONK2_UI_URL: ${SQUONK2_UI_URL}
+      SQUONK2_DMAPI_URL: ${SQUONK2_DMAPI_URL}
+      SQUONK2_ASAPI_URL: ${SQUONK2_ASAPI_URL}
+      PROXY_FORWARDED_PROTO_HEADER: ${PROXY_FORWARDED_PROTO_HEADER:-http}
 
   celery_beat:
     command: sh -c "celery -A fragalysis beat -l info --scheduler django_celery_beat.schedulers:DatabaseScheduler"
diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py
index 8a4264bf..56703ee0 100644
--- a/viewer/cset_upload.py
+++ b/viewer/cset_upload.py
@@ -15,6 +15,7 @@
 
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "fragalysis.settings")
 import django
+from django.db import IntegrityError, transaction
 
 django.setup()
 
@@ -171,6 +172,13 @@ def __init__(
         self.zfile_hashvals = zfile_hashvals
         self.computed_set_name = computed_set_name
 
+        # using the same mechanism to pass messages as validation
+        self.messages: dict[str, Any] = {
+            'molecule_name': [],
+            'field': [],
+            'warning_string': [],
+        }
+
     def process_pdb(self, pdb_code, zfile, zfile_hashvals) -> str | None:
         for key in zfile_hashvals.keys():
             if key == pdb_code:
@@ -654,73 +662,109 @@ def set_descriptions(
 
         return mols, score_descriptions
 
-    def task(self) -> ComputedSet:
+    def task(self) -> tuple[ComputedSet, dict]:
         # Truncate submitted method (lower-case)?
         truncated_submitter_method: str = 'unspecified'
-        if self.submitter_method:
-            truncated_submitter_method = self.submitter_method[
-                : ComputedSet.LENGTH_METHOD_IN_NAME
-            ]
-            if len(self.submitter_method) > len(truncated_submitter_method):
-                logger.warning(
-                    'ComputedSet submitter method is too long (%s). Truncated to "%s"',
-                    self.submitter_method,
-                    truncated_submitter_method,
-                )
-        else:
-            logger.warning(
-                'ComputedSet submitter method is not set. Using "%s"',
-                truncated_submitter_method,
-            )
-
-        # Do we have any existing ComputedSets?
         try:
-            computed_set = ComputedSet.objects.get(name=self.computed_set_name)
-            # refresh some attributes
-            computed_set.md_ordinal = F('md_ordinal') + 1
-            computed_set.upload_date = datetime.date.today()
-            computed_set.save()
-        except ComputedSet.DoesNotExist:
-            # no, create new
-
-            today: datetime.date = datetime.date.today()
-            new_ordinal: int = 1
+            with transaction.atomic():
+                if self.submitter_method:
+                    truncated_submitter_method = self.submitter_method[
+                        : ComputedSet.LENGTH_METHOD_IN_NAME
+                    ]
+                    if len(self.submitter_method) > len(truncated_submitter_method):
+                        logger.warning(
+                            'ComputedSet submitter method is too long (%s). Truncated to "%s"',
+                            self.submitter_method,
+                            truncated_submitter_method,
+                        )
+                else:
+                    logger.warning(
+                        'ComputedSet submitter method is not set. Using "%s"',
+                        truncated_submitter_method,
+                    )
 
-            try:
-                target = Target.objects.get(pk=self.target_id)
-            except Target.DoesNotExist as exc:
-                # target's existance should be validated in the view,
-                # this could hardly happen
-                logger.error('Target %s does not exist', self.target_id)
-                raise Target.DoesNotExist from exc
-
-            cs_name: str = (
-                f'{truncated_submitter_method}-{str(today)}-'
-                + f'{get_column_letter(new_ordinal)}'
-            )
-            logger.info('Creating new ComputedSet "%s"', cs_name)
-
-            computed_set = ComputedSet(
-                name=cs_name,
-                md_ordinal=new_ordinal,
-                upload_date=today,
-                method=self.submitter_method[: ComputedSet.LENGTH_METHOD],
-                target=target,
-                spec_version=float(self.version.strip('ver_')),
-            )
-            if self.user_id:
+                # Do we have any existing ComputedSets?
                 try:
-                    computed_set.owner_user = User.objects.get(id=self.user_id)
-                except User.DoesNotExist as exc:
-                    logger.error('User %s does not exist', self.user_id)
-                    raise User.DoesNotExist from exc
+                    computed_set = ComputedSet.objects.get(name=self.computed_set_name)
+                    # refresh some attributes
+                    computed_set.md_ordinal = F('md_ordinal') + 1
+                    computed_set.upload_date = datetime.date.today()
+                    computed_set.save()
+                except ComputedSet.DoesNotExist:
+                    # no, create new
+
+                    today: datetime.date = datetime.date.today()
+                    new_ordinal: int = 1
+
+                    try:
+                        target = Target.objects.get(pk=self.target_id)
+                    except Target.DoesNotExist as exc:
+                        # target's existance should be validated in the view,
+                        # this could hardly happen
+                        logger.error('Target %s does not exist', self.target_id)
+                        raise Target.DoesNotExist from exc
+
+                    cs_name: str = (
+                        f'{truncated_submitter_method}-{str(today)}-'
+                        + f'{get_column_letter(new_ordinal)}'
+                    )
+                    logger.info('Creating new ComputedSet "%s"', cs_name)
+
+                    computed_set = ComputedSet(
+                        name=cs_name,
+                        md_ordinal=new_ordinal,
+                        upload_date=today,
+                        method=self.submitter_method[: ComputedSet.LENGTH_METHOD],
+                        target=target,
+                        spec_version=float(self.version.strip('ver_')),
+                    )
+                    if self.user_id:
+                        try:
+                            computed_set.owner_user = User.objects.get(id=self.user_id)
+                        except User.DoesNotExist as exc:
+                            logger.error('User %s does not exist', self.user_id)
+                            raise User.DoesNotExist from exc
+
+                    else:
+                        # The User ID may only be None if AUTHENTICATE_UPLOAD is False.
+                        # Here the ComputedSet owner will take on a default (anonymous) value.
+                        assert settings.AUTHENTICATE_UPLOAD is False
+
+                    computed_set.save()
+
+                # Set descriptions in return for the Molecules.
+                # This also sets the submitter and method URL properties of the computed set
+                # while also saving it.
+                sdf_filename = str(self.sdf_filename)
+                mols_to_process, score_descriptions = self.set_descriptions(
+                    filename=sdf_filename, computed_set=computed_set
+                )
 
-            else:
-                # The User ID may only be None if AUTHENTICATE_UPLOAD is False.
-                # Here the ComputedSet owner will take on a default (anonymous) value.
-                assert settings.AUTHENTICATE_UPLOAD is False
+                # Process the molecules
+                logger.info('%s mols_to_process=%s', computed_set, len(mols_to_process))
+                for i in range(len(mols_to_process)):
+                    logger.debug(
+                        'processing mol %s: %s', i, mols_to_process[i].GetProp('_Name')
+                    )
+                    _ = self.process_mol(
+                        mols_to_process[i],
+                        self.target_id,
+                        computed_set,
+                        sdf_filename,
+                        score_descriptions,
+                        self.zfile,
+                        self.zfile_hashvals,
+                    )
+        except IntegrityError as exc:
+            # clean up previously written files. this is not ideal,
+            # they should be written to a tempdir or something, like
+            # in target loader. TODO for later
+            for p in self.zfile.values():
+                Path(p).unlink()
 
-            computed_set.save()
+            raise ValueError(exc.args[0]) from exc
+
+        # assuming no errors, write the files
 
         # check compound set folder exists.
         cmp_set_folder = os.path.join(
@@ -730,30 +774,6 @@ def task(self) -> ComputedSet:
             logger.info('Making ComputedSet folder (%s)', cmp_set_folder)
             os.mkdir(cmp_set_folder)
 
-        # Set descriptions in return for the Molecules.
-        # This also sets the submitter and method URL properties of the computed set
-        # while also saving it.
-        sdf_filename = str(self.sdf_filename)
-        mols_to_process, score_descriptions = self.set_descriptions(
-            filename=sdf_filename, computed_set=computed_set
-        )
-
-        # Process the molecules
-        logger.info('%s mols_to_process=%s', computed_set, len(mols_to_process))
-        for i in range(len(mols_to_process)):
-            logger.debug(
-                'processing mol %s: %s', i, mols_to_process[i].GetProp('_Name')
-            )
-            _ = self.process_mol(
-                mols_to_process[i],
-                self.target_id,
-                computed_set,
-                sdf_filename,
-                score_descriptions,
-                self.zfile,
-                self.zfile_hashvals,
-            )
-
         # move and save the compound set
         new_filename = (
             Path(settings.MEDIA_ROOT)
@@ -769,7 +789,7 @@ def task(self) -> ComputedSet:
 
         logger.info('Created %s', computed_set)
 
-        return computed_set
+        return computed_set, self.messages
 
 
 def blank_mol_vals(sdf_file) -> Tuple[str, str, str]:
diff --git a/viewer/sdf_check.py b/viewer/sdf_check.py
index aa362239..f8141d37 100755
--- a/viewer/sdf_check.py
+++ b/viewer/sdf_check.py
@@ -237,7 +237,7 @@ def check_blank_prop(blank_mol, validate_dict):
     # Properties to ignore
     prop_ignore_list = ['ref_mols', 'ref_pdb']
 
-    for key, value in zip(list(property_dict.keys()), list(property_dict.values())):
+    for key, value in property_dict.items():
         if value == '' and key not in prop_ignore_list:
             validate_dict = add_warning(
                 molecule_name=blank_mol.GetProp('_Name'),
@@ -273,7 +273,7 @@ def check_field_populated(mol, validate_dict):
     compulsory_fields = ['ref_pdb', 'ref_mols', 'original SMILES']
 
     property_dict = mol.GetPropsAsDict()
-    for key, value in zip(list(property_dict.keys()), list(property_dict.values())):
+    for key, value in property_dict.items():
         if value == '' and key in compulsory_fields:
             validate_dict = add_warning(
                 molecule_name=mol.GetProp('_Name'),
diff --git a/viewer/tasks.py b/viewer/tasks.py
index ef2471ec..31f0bcc3 100644
--- a/viewer/tasks.py
+++ b/viewer/tasks.py
@@ -103,12 +103,12 @@ def process_compound_set(validate_output):
         zfile_hashvals=zfile_hashvals,
         computed_set_name=computed_set_name,
     )
-    compound_set = save_mols.task()
+    compound_set, process_messages = save_mols.task()
 
     logger.info(
         'process_compound_set() EXIT (CompoundSet.name="%s")', compound_set.name
     )
-    return 'process', compound_set.name
+    return 'process', compound_set.name, process_messages
 
 
 @shared_task
diff --git a/viewer/templates/viewer/upload-cset.html b/viewer/templates/viewer/upload-cset.html
index 41fa626b..9f51de20 100644
--- a/viewer/templates/viewer/upload-cset.html
+++ b/viewer/templates/viewer/upload-cset.html
@@ -74,6 +74,7 @@ <h1 class="title is-size-1 has-text-centered">Compound Set Upload</h1>
         <p class="has-text-centered" id="links"></p>
         <p class="has-text-centered" id="validate-id"></p>
         <p class="has-text-centered" id="upload-id"></p>
+        <p class="has-text-centered" id="upload_warnings"></p>
     </body>
 
     {% if error_message %}
@@ -161,8 +162,8 @@ <h1 class="title is-size-1 has-text-centered">Compound Set Upload</h1>
 
                       }
                       if (validatedStatus === 'Validated') {
-                          clearTimer('Your files were uploaded! The download links are:');
 
+                          clearTimer('Your files were uploaded! The download links are:');
                           var url_a = response.data.results.cset_download_url;
                           var content = document.getElementById('links');
                           var a = document.createElement("a");
@@ -182,6 +183,10 @@ <h1 class="title is-size-1 has-text-centered">Compound Set Upload</h1>
                           b.title = 'protein set';
                           b.href = url_b;
                           content.appendChild(b);
+                          if (response.data.html) {
+                              var content = document.getElementById('upload_warnings');
+                              content.innerHTML = response.data.html;
+                          }
 
                       }
                       var moleculesProcessed = response.data.processed;
@@ -220,6 +225,6 @@ <h1 class="title is-size-1 has-text-centered">Compound Set Upload</h1>
             progressTitle.innerHTML = message;
           }
          </script>
-    {% endif %}
+     {% endif %}
 
 {% endblock %}
diff --git a/viewer/views.py b/viewer/views.py
index a5b1cb7b..54fd883e 100644
--- a/viewer/views.py
+++ b/viewer/views.py
@@ -717,7 +717,7 @@ def get(self, request, validate_task_id):
                 ):
                     return Response(
                         {
-                            'error': "You are not a member of the proposal {project_name}"
+                            'error': f"You are not a member of the proposal {project_name}"
                         },
                         status=status.HTTP_403_FORBIDDEN,
                     )
@@ -840,14 +840,63 @@ def get(self, request, upload_task_id):
                     response_data['html'] = html_table
 
                     return JsonResponse(response_data)
+
+                elif results[0] == 'process':
+                    # section added to pass warnings to user after successful processing
+                    logger.debug('processed warnings: %s', results[2])
+                    # Upload/Update output tasks send back a tuple
+                    response_data['results'] = {}
+                    response_data['validated'] = 'Validated'
+                    cset_name = results[1]
+                    cset = models.ComputedSet.objects.get(name=cset_name)
+
+                    if (
+                        settings.AUTHENTICATE_UPLOAD
+                        and not _ISPYB_SAFE_QUERY_SET.user_is_member_of_target(
+                            request.user, cset.target
+                        )
+                    ):
+                        # TODO: this will break if wrong user accesses
+                        # it, but with the given flow, this is very
+                        # unlikely to happen
+                        logger.debug('User not recognised')
+                        return JsonResponse(
+                            {
+                                'error_message': "You are not a member of the Target's proposal"
+                            },
+                        )
+
+                    name = cset.name
+
+                    response_data['results']['cset_download_url'] = (
+                        '/viewer/compound_set/%s' % name
+                    )
+                    response_data['results']['pset_download_url'] = (
+                        '/viewer/protein_set/%s' % name
+                    )
+
+                    process_messages = results[2]
+                    # set pandas options to display all column data
+                    if len(next(iter(process_messages.values()))) > 0:
+                        pd.set_option('display.max_colwidth', -1)
+                        table = pd.DataFrame.from_dict(process_messages)
+                        html_table = '''<p> Your data was uploaded with warnings</p>'''
+                        html_table += table.to_html()
+                        response_data['html'] = html_table
+
+                    return JsonResponse(response_data)
+
                 else:
                     # Upload/Update output tasks send back a tuple
                     response_data['validated'] = 'Validated'
                     cset_name = results[1]
                     cset = models.ComputedSet.objects.get(name=cset_name)
 
-                    if not _ISPYB_SAFE_QUERY_SET.user_is_member_of_target(
-                        request.user, cset.target
+                    if (
+                        settings.AUTHENTICATE_UPLOAD
+                        and not _ISPYB_SAFE_QUERY_SET.user_is_member_of_target(
+                            request.user, cset.target
+                        )
                     ):
                         return Response(
                             {'error': "You are not a member of the Target's proposal"},

From 4719d5250627f73d0d7c3ee51e8e8730472690b1 Mon Sep 17 00:00:00 2001
From: Kalev Takkis <ktakkis@informaticsmatters.com>
Date: Thu, 28 Nov 2024 16:02:26 +0000
Subject: [PATCH 2/9] feat: warnings on missing attributes (issues 1589, 1527)

---
 viewer/cset_upload.py | 85 +++++++++++++++++++++++++++++--------------
 viewer/sdf_check.py   |  5 ++-
 viewer/tasks.py       |  5 +--
 3 files changed, 62 insertions(+), 33 deletions(-)

diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py
index 56703ee0..87b124ff 100644
--- a/viewer/cset_upload.py
+++ b/viewer/cset_upload.py
@@ -41,6 +41,8 @@
 )
 from viewer.utils import add_props_to_sdf_molecule, alphanumerator, is_url, word_count
 
+from .sdf_check import add_warning
+
 logger = logging.getLogger(__name__)
 
 
@@ -336,19 +338,13 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]:
             # occur. However there's nothing in the db to prevent
             # this, so adding a catch clause and writing a meaningful
             # message
-            logger.error(
-                'Duplicate compounds for target %s with inchi key %s.',
-                target.title,
-                inchi_key,
-            )
-            raise MultipleObjectsReturned from exc
+            msg = f'Duplicate compounds for target {target.title} with inchi key {inchi_key}.'
+            logger.error(msg)
+            raise IntegrityError(msg) from exc
 
         return cpd, cpd_number
 
     def set_props(self, cpd, props, score_descriptions) -> List[ScoreDescription]:
-        if 'ref_mols' and 'ref_pdb' not in list(props.keys()):
-            raise Exception('ref_mols and ref_pdb not set!')
-
         for sd, val in score_descriptions.items():
             logger.debug('sd: %s', sd)
             logger.debug('sd.name, val: %s: %s', sd.name, val)
@@ -405,7 +401,7 @@ def set_mol(
                     experiment__experiment_upload__target=compound_set.target,
                 )
                 if not qs.exists():
-                    raise Exception(  # pylint: disable=raise-missing-from
+                    raise IntegrityError(  # pylint: disable=raise-missing-from
                         'No matching molecules found for inspiration frag ' + i
                     )
 
@@ -439,7 +435,7 @@ def set_mol(
 
         # I think, realistically, I only need to check compound
         # update: I used to annotate name components, with the new
-        # format, this is not necessary. or possible fmt: off
+        # format, this is not necessary. or possible
         qs = ComputedMolecule.objects.filter(
             compound=compound,
         ).order_by('name')
@@ -452,7 +448,9 @@ def set_mol(
             groups = re.search(r'()(\d+)(\D+)', qs.last().name)
             if groups is None or len(groups.groups()) != 3:
                 # just a quick sanity check
-                raise ValueError(f'Non-standard ComputedMolecule.name: {latest.name}')
+                raise IntegrityError(
+                    f'Non-standard ComputedMolecule.name: {latest.name}'
+                )
             number = groups.groups()[1]  # type: ignore [index]
             suffix = next(alphanumerator(start_from=groups.groups()[2]))  # type: ignore [index]
         else:
@@ -483,7 +481,7 @@ def set_mol(
                         + f'and {mol.GetProp("original ID")}'
                     )
                     logger.error(msg)
-                    raise RuntimeError(msg) from exc
+                    raise IntegrityError(msg) from exc
 
                 molconf = mol.GetConformer()
                 kmolconf = kmol.GetConformer()
@@ -573,8 +571,9 @@ def get_submission_info(self, description_mol) -> ComputedSetSubmitter:
         try:
             date = parse(datestring, dayfirst=True)
         except ValueError as exc:
-            logger.error('"%s" is not a valid date', datestring)
-            raise ValueError from exc
+            msg = f'"{datestring}" is not a valid date'
+            logger.error(msg)
+            raise IntegrityError(msg) from exc
 
         return ComputedSetSubmitter.objects.get_or_create(
             name=description_mol.GetProp('submitter_name'),
@@ -593,10 +592,35 @@ def process_mol(
         score_descriptions,
         zfile=None,
         zfile_hashvals=None,
-    ) -> List[ScoreDescription]:
-        cpd = self.set_mol(mol, target, compound_set, filename, zfile, zfile_hashvals)
+    ) -> None:
+        molecule_name = mol.GetProp('_Name')
+        logger.debug('+ process_mol %s', molecule_name)
+
         other_props = mol.GetPropsAsDict()
-        return self.set_props(cpd, other_props, score_descriptions)
+        skip_mol = False
+        for prop in ['ref_mols', 'ref_pdb'] + list(HEADER_MOL_FIELDS):
+            if prop not in other_props.keys():
+                self.messages = add_warning(
+                    molecule_name=molecule_name,
+                    field=prop,
+                    warning_string=f'Property {prop} missing',
+                    validate_dict=self.messages,
+                )
+                skip_mol = True
+            elif other_props[prop] in EMPTY_VALUES:
+                self.messages = add_warning(
+                    molecule_name=molecule_name,
+                    field=prop,
+                    warning_string=f'Property {prop} undefined',
+                    validate_dict=self.messages,
+                )
+                skip_mol = True
+
+        if not skip_mol:
+            cpd = self.set_mol(
+                mol, target, compound_set, filename, zfile, zfile_hashvals
+            )
+            self.set_props(cpd, other_props, score_descriptions)
 
     def set_descriptions(
         self, filename, computed_set: ComputedSet
@@ -648,17 +672,16 @@ def set_descriptions(
                     if key == 'submitter_email':
                         try:
                             validate_email(value)
-                        except ValidationError as exc:
+                        except ValidationError:
                             msg = f'"{value}" is not a valid email'
                             logger.error(msg)
                             errors.append(msg)
-                            raise ValidationError(msg) from exc
 
                 score_descriptions[description] = value
 
         logger.debug('index mol values: %s', score_descriptions.values())
         if errors:
-            raise ValueError(errors)
+            raise IntegrityError(errors)
 
         return mols, score_descriptions
 
@@ -701,8 +724,9 @@ def task(self) -> tuple[ComputedSet, dict]:
                     except Target.DoesNotExist as exc:
                         # target's existance should be validated in the view,
                         # this could hardly happen
-                        logger.error('Target %s does not exist', self.target_id)
-                        raise Target.DoesNotExist from exc
+                        msg = f'Target {self.target_id} does not exist'
+                        logger.error(msg)
+                        raise IntegrityError(msg) from exc
 
                     cs_name: str = (
                         f'{truncated_submitter_method}-{str(today)}-'
@@ -722,8 +746,9 @@ def task(self) -> tuple[ComputedSet, dict]:
                         try:
                             computed_set.owner_user = User.objects.get(id=self.user_id)
                         except User.DoesNotExist as exc:
-                            logger.error('User %s does not exist', self.user_id)
-                            raise User.DoesNotExist from exc
+                            msg = f'User {self.user_id} does not exist'
+                            logger.error(msg)
+                            raise IntegrityError(msg) from exc
 
                     else:
                         # The User ID may only be None if AUTHENTICATE_UPLOAD is False.
@@ -746,7 +771,7 @@ def task(self) -> tuple[ComputedSet, dict]:
                     logger.debug(
                         'processing mol %s: %s', i, mols_to_process[i].GetProp('_Name')
                     )
-                    _ = self.process_mol(
+                    self.process_mol(
                         mols_to_process[i],
                         self.target_id,
                         computed_set,
@@ -759,8 +784,12 @@ def task(self) -> tuple[ComputedSet, dict]:
             # clean up previously written files. this is not ideal,
             # they should be written to a tempdir or something, like
             # in target loader. TODO for later
-            for p in self.zfile.values():
-                Path(p).unlink()
+            try:
+                for p in self.zfile.values():
+                    Path(p).unlink()
+            except AttributeError:
+                # zfile is None, nothing to do
+                pass
 
             raise ValueError(exc.args[0]) from exc
 
diff --git a/viewer/sdf_check.py b/viewer/sdf_check.py
index f8141d37..6b2d96c0 100755
--- a/viewer/sdf_check.py
+++ b/viewer/sdf_check.py
@@ -269,8 +269,9 @@ def check_field_populated(mol, validate_dict):
     :return: Updates validate dictionary with pass/fail message
     """
 
-    # Compuslory fields
-    compulsory_fields = ['ref_pdb', 'ref_mols', 'original SMILES']
+    # Compuslory fields (after 1589)
+    # compulsory_fields = ['ref_pdb', 'ref_mols', 'original SMILES']
+    compulsory_fields = ['original SMILES']
 
     property_dict = mol.GetPropsAsDict()
     for key, value in property_dict.items():
diff --git a/viewer/tasks.py b/viewer/tasks.py
index 31f0bcc3..049358cb 100644
--- a/viewer/tasks.py
+++ b/viewer/tasks.py
@@ -25,7 +25,7 @@
 
 from .cset_upload import MolOps, PdbOps, blank_mol_vals
 from .models import ComputedSet, JobFileTransfer, JobRequest, SiteObservation
-from .sdf_check import (
+from .sdf_check import (  # check_refmol,
     add_warning,
     check_blank_mol_props,
     check_blank_prop,
@@ -33,7 +33,6 @@
     check_field_populated,
     check_mol_props,
     check_name_characters,
-    check_refmol,
     check_SMILES,
     check_ver_name,
 )
@@ -310,7 +309,7 @@ def validate_compound_set(task_params):
                 molecule_name = m.GetProp('_Name')
             validate_dict = check_name_characters(molecule_name, validate_dict)
             # validate_dict = check_pdb(m, validate_dict, target, zfile)
-            validate_dict = check_refmol(m, validate_dict, target)
+            # validate_dict = check_refmol(m, validate_dict, target)
             validate_dict = check_field_populated(m, validate_dict)
             validate_dict = check_SMILES(m, validate_dict)
 

From 804ad811b5819e4b7efa2fb5dbd764a72a438d3c Mon Sep 17 00:00:00 2001
From: Kalev Takkis <ktakkis@informaticsmatters.com>
Date: Wed, 4 Dec 2024 16:28:37 +0000
Subject: [PATCH 3/9] feat: customizable target-wide identifier order

---
 viewer/models.py        |  1 +
 viewer/serializers.py   |  2 ++
 viewer/target_loader.py | 11 +++++++++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/viewer/models.py b/viewer/models.py
index ac01da39..f01488ab 100644
--- a/viewer/models.py
+++ b/viewer/models.py
@@ -121,6 +121,7 @@ class Target(models.Model):
     upload_datetime = models.DateTimeField(
         null=True, help_text='The datetime the upload was completed'
     )
+    alias_order = ArrayField(models.TextField(), null=True)
 
     def __str__(self) -> str:
         return f"{self.title}"
diff --git a/viewer/serializers.py b/viewer/serializers.py
index e002840c..358817d4 100644
--- a/viewer/serializers.py
+++ b/viewer/serializers.py
@@ -218,6 +218,7 @@ class Meta:
             "metadata",
             "zip_archive",
             "upload_status",
+            "alias_order",
         )
         extra_kwargs = {
             "id": {"read_only": True},
@@ -228,6 +229,7 @@ class Meta:
             "metadata": {"read_only": True},
             "zip_archive": {"read_only": True},
             "upload_status": {"read_only": True},
+            "alias_order": {"read_only": False},
         }
 
 
diff --git a/viewer/target_loader.py b/viewer/target_loader.py
index 167ed825..e456b41f 100644
--- a/viewer/target_loader.py
+++ b/viewer/target_loader.py
@@ -2079,13 +2079,20 @@ def import_compound_identifiers(self, alias_file_path):
         extended_key_cols = key_cols + ["compound_code"]
         non_idf_cols = extended_key_cols + ["compound_code_update"]
 
-        identifiers_from_file = set([k for k in df.columns if k not in non_idf_cols])
+        identifiers_from_file = [k for k in df.columns if k not in non_idf_cols]
+
+        # mypy is doing it's thing again
+        if not self.target.alias_order:  # type: ignore[attr-defined]
+            self.target.alias_order = identifiers_from_file  # type: ignore[attr-defined]
+            self.target.save()  # type: ignore[attr-defined]
+
+        identifiers_from_file = set(identifiers_from_file)  # type: ignore[assignment]
 
         # I think this is a bad idea, but it was explicitly in the spec
         identifier_types = set(
             CompoundIdentifierType.objects.values_list("name", flat=True)
         )
-        new_identifiers = identifiers_from_file.difference(identifier_types)
+        new_identifiers = identifiers_from_file.difference(identifier_types)  # type: ignore[attr-defined]
         for identifier in new_identifiers:
             CompoundIdentifierType(name=identifier).save()
 

From 552f76b84f8cebdd9e9776b5c25c1c03a4f86fa1 Mon Sep 17 00:00:00 2001
From: Kalev Takkis <ktakkis@informaticsmatters.com>
Date: Thu, 5 Dec 2024 10:21:38 +0000
Subject: [PATCH 4/9] fix: removed reference to old observation attribute

site_observation.ligand_mol_file, was causing errors in api/vectors call
---
 viewer/models.py      | 11 +++++++++++
 viewer/serializers.py | 23 ++++++-----------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/viewer/models.py b/viewer/models.py
index f01488ab..7438be24 100644
--- a/viewer/models.py
+++ b/viewer/models.py
@@ -1,3 +1,4 @@
+import contextlib
 import logging
 import uuid
 from dataclasses import dataclass
@@ -575,6 +576,16 @@ def __repr__(self) -> str:
             self.cmpd,
         )
 
+    def get_ligand_mol_file(self):
+        contents = ''
+        if self.ligand_mol:
+            path = Path(settings.MEDIA_ROOT).joinpath(self.ligand_mol.name)
+            with contextlib.suppress(TypeError, FileNotFoundError):
+                with open(path, "r", encoding="utf-8") as f:
+                    contents = f.read()
+
+        return contents
+
 
 class CompoundIdentifierType(models.Model):
     name = models.TextField(primary_key=True)
diff --git a/viewer/serializers.py b/viewer/serializers.py
index 358817d4..3242573e 100644
--- a/viewer/serializers.py
+++ b/viewer/serializers.py
@@ -1,4 +1,3 @@
-import contextlib
 import logging
 from pathlib import Path
 from urllib.parse import urljoin
@@ -481,18 +480,15 @@ class VectorsSerializer(serializers.ModelSerializer):
     vectors = serializers.SerializerMethodField()
 
     def get_vectors(self, obj):
+        ligand_mol_file = obj.get_ligand_mol_file()
         out_data = {}
-        if obj.ligand_mol_file:
+        if ligand_mol_file:
             try:
-                out_data["3d"] = get_3d_vects_for_mol(
-                    obj.ligand_mol_file, iso_labels=False
-                )
+                out_data["3d"] = get_3d_vects_for_mol(ligand_mol_file, iso_labels=False)
             # temporary patch
             except IndexError:
-                out_data["3d"] = get_3d_vects_for_mol(
-                    obj.ligand_mol_file, iso_labels=True
-                )
-            out_data["indices"] = get_vect_indices_for_mol(obj.ligand_mol_file)
+                out_data["3d"] = get_3d_vects_for_mol(ligand_mol_file, iso_labels=True)
+            out_data["indices"] = get_vect_indices_for_mol(ligand_mol_file)
         return out_data
 
     class Meta:
@@ -955,14 +951,7 @@ class SiteObservationReadSerializer(serializers.ModelSerializer):
     ligand_mol_file = serializers.SerializerMethodField()
 
     def get_ligand_mol_file(self, obj):
-        contents = ''
-        if obj.ligand_mol:
-            path = Path(settings.MEDIA_ROOT).joinpath(obj.ligand_mol.name)
-            with contextlib.suppress(TypeError, FileNotFoundError):
-                with open(path, "r", encoding="utf-8") as f:
-                    contents = f.read()
-
-        return contents
+        return obj.get_ligand_mol_file()
 
     class Meta:
         model = models.SiteObservation

From 990a773d65c3a83615c53d0b1aea699413f8a1a6 Mon Sep 17 00:00:00 2001
From: Kalev Takkis <ktakkis@informaticsmatters.com>
Date: Thu, 5 Dec 2024 11:09:58 +0000
Subject: [PATCH 5/9] fix: migration file added

---
 viewer/migrations/0085_target_alias_order.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 viewer/migrations/0085_target_alias_order.py

diff --git a/viewer/migrations/0085_target_alias_order.py b/viewer/migrations/0085_target_alias_order.py
new file mode 100644
index 00000000..fe7f0d96
--- /dev/null
+++ b/viewer/migrations/0085_target_alias_order.py
@@ -0,0 +1,19 @@
+# Generated by Django 3.2.25 on 2024-12-04 16:03
+
+import django.contrib.postgres.fields
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('viewer', '0084_auto_20241108_1119'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='target',
+            name='alias_order',
+            field=django.contrib.postgres.fields.ArrayField(base_field=models.TextField(), null=True, size=None),
+        ),
+    ]

From 0fd27e1605e8bb237b6925f4a5b7bf8b0e2cae31 Mon Sep 17 00:00:00 2001
From: Max Winokan <mwinokan@me.com>
Date: Fri, 6 Dec 2024 09:19:32 +0000
Subject: [PATCH 6/9] Max's attempt to fix behaviour for 1589

---
 viewer/cset_upload.py | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py
index 87b124ff..6531a86e 100644
--- a/viewer/cset_upload.py
+++ b/viewer/cset_upload.py
@@ -598,12 +598,14 @@ def process_mol(
 
         other_props = mol.GetPropsAsDict()
         skip_mol = False
-        for prop in ['ref_mols', 'ref_pdb'] + list(HEADER_MOL_FIELDS):
+
+        # if ref_mols or ref_pdb is missing skip the molecule
+        for prop in ['ref_mols', 'ref_pdb']:
             if prop not in other_props.keys():
                 self.messages = add_warning(
                     molecule_name=molecule_name,
                     field=prop,
-                    warning_string=f'Property {prop} missing',
+                    warning_string=f'Property {prop} missing. Skipping molecule!',
                     validate_dict=self.messages,
                 )
                 skip_mol = True
@@ -611,11 +613,30 @@ def process_mol(
                 self.messages = add_warning(
                     molecule_name=molecule_name,
                     field=prop,
-                    warning_string=f'Property {prop} undefined',
+                    warning_string=f'Property {prop} undefined. Skipping molecule!',
                     validate_dict=self.messages,
                 )
                 skip_mol = True
 
+        # if any header mol fields are defined on non-header molecules those values are ignored and a warning shown
+        for prop in HEADER_MOL_FIELDS:
+
+            if prop not in other_props.keys():
+                # non-header molecules don't need header fields
+                continue
+            
+            if other_props[prop] not in EMPTY_VALUES:
+                # header fields in non-header molecules have values ignored
+                self.messages = add_warning(
+                    molecule_name=molecule_name,
+                    field=prop,
+                    warning_string=f'Property {prop} value {other_props[prop]} ignored.',
+                    validate_dict=self.messages,
+                )
+            
+            # get rid of the header field property on the non-header molecule
+            del other_props[prop]
+
         if not skip_mol:
             cpd = self.set_mol(
                 mol, target, compound_set, filename, zfile, zfile_hashvals

From 7cb7881b0dc8aaf8a1c64b4d3f1f836e569d7551 Mon Sep 17 00:00:00 2001
From: Max Winokan <mwinokan@me.com>
Date: Fri, 6 Dec 2024 10:15:01 +0000
Subject: [PATCH 7/9] black lint

---
 viewer/cset_upload.py | 234 +++++++++++++++++++++---------------------
 1 file changed, 117 insertions(+), 117 deletions(-)

diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py
index 6531a86e..5d3100b7 100644
--- a/viewer/cset_upload.py
+++ b/viewer/cset_upload.py
@@ -50,62 +50,62 @@
 _DIST_LIMIT = 0.5
 
 EMPTY_VALUES = (
-    'nan',
-    '',
+    "nan",
+    "",
     None,
     np.nan,
 )
 
 
 HEADER_MOL_FIELDS = (
-    'ref_url',
-    'method',
-    'submitter_name',
-    'submitter_institution',
-    'submitter_email',
-    'generation_date',
+    "ref_url",
+    "method",
+    "submitter_name",
+    "submitter_institution",
+    "submitter_email",
+    "generation_date",
 )
 
 
 def dataType(a_str: str) -> str:
     lean_str = a_str.strip()
     if not lean_str:
-        return 'BLANK'
+        return "BLANK"
 
     try:
         t = ast.literal_eval(lean_str)
     except (ValueError, SyntaxError):
-        return 'TEXT'
+        return "TEXT"
     else:
         if type(t) in [int, int, float, bool]:
             if t in [
                 True,
                 False,
-                'TRUE',
-                'FALSE',
-                'true',
-                'false',
-                'yes',
-                'no',
-                'YES',
-                'NO',
-                'Yes',
-                'No',
+                "TRUE",
+                "FALSE",
+                "true",
+                "false",
+                "yes",
+                "no",
+                "YES",
+                "NO",
+                "Yes",
+                "No",
                 "Y",
                 "N",
                 "y",
                 "n",
             ]:
-                return 'BIT'
+                return "BIT"
             if type(t) is int or type(t) is int:
-                return 'INT'
+                return "INT"
             if type(t) is float:
-                return 'FLOAT'
+                return "FLOAT"
 
             # Can't get here?
             assert False
         else:
-            return 'TEXT'
+            return "TEXT"
 
 
 class PdbOps:
@@ -121,10 +121,10 @@ def save_pdb_zip(
             zfile_hashvals = {}
             for filename in zip_lst:
                 # only handle pdb files
-                if filename.split('.')[-1] == 'pdb':
+                if filename.split(".")[-1] == "pdb":
                     # Test if Protein object already exists
-                    code = filename.split('/')[-1].replace('.pdb', '')
-                    test_pdb_code = filename.split('/')[-1].replace('.pdb', '')
+                    code = filename.split("/")[-1].replace(".pdb", "")
+                    test_pdb_code = filename.split("/")[-1].replace(".pdb", "")
                     test_site_obvs_objs = SiteObservation.objects.filter(
                         code=test_pdb_code
                     )
@@ -132,12 +132,12 @@ def save_pdb_zip(
                     if len(test_site_obvs_objs) != 0:
                         # make a unique pdb code as not to overwrite existing object
                         rand_str = uuid.uuid4().hex
-                        test_pdb_code = f'{code}#{rand_str}'
+                        test_pdb_code = f"{code}#{rand_str}"
                         zfile_hashvals[code] = rand_str
 
-                    fn = f'{test_pdb_code}.pdb'
+                    fn = f"{test_pdb_code}.pdb"
                     pdb_path = default_storage.save(
-                        f'tmp/{fn}', ContentFile(zf.read(filename))
+                        f"tmp/{fn}", ContentFile(zf.read(filename))
                     )
                     zfile[test_pdb_code] = pdb_path
 
@@ -148,7 +148,7 @@ def save_pdb_zip(
         return zfile, zfile_hashvals
 
     def run(self, params) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, str]]]:
-        return self.save_pdb_zip(params['pdb_zip'])
+        return self.save_pdb_zip(params["pdb_zip"])
 
 
 class MolOps:
@@ -176,15 +176,15 @@ def __init__(
 
         # using the same mechanism to pass messages as validation
         self.messages: dict[str, Any] = {
-            'molecule_name': [],
-            'field': [],
-            'warning_string': [],
+            "molecule_name": [],
+            "field": [],
+            "warning_string": [],
         }
 
     def process_pdb(self, pdb_code, zfile, zfile_hashvals) -> str | None:
         for key in zfile_hashvals.keys():
             if key == pdb_code:
-                pdb_code = f'{pdb_code}#{zfile_hashvals[pdb_code]}'
+                pdb_code = f"{pdb_code}#{zfile_hashvals[pdb_code]}"
 
         try:
             pdb_fp = zfile[pdb_code]
@@ -192,7 +192,7 @@ def process_pdb(self, pdb_code, zfile, zfile_hashvals) -> str | None:
             return None
 
         # ensure filename uniqueness
-        pdb_fn = '_'.join([zfile[pdb_code].split('/')[-1], uuid.uuid4().hex])
+        pdb_fn = "_".join([zfile[pdb_code].split("/")[-1], uuid.uuid4().hex])
         pdb_field = Path(settings.COMPUTED_SET_MEDIA_DIRECTORY).joinpath(pdb_fn)
 
         new_filename = Path(settings.MEDIA_ROOT).joinpath(pdb_field)
@@ -224,11 +224,11 @@ def get_site_observation(
             )
             return None
 
-        pdb_fn = mol.GetProp(property_name).split('/')[-1]
+        pdb_fn = mol.GetProp(property_name).split("/")[-1]
 
         if zfile:
             # pdb archive uploaded. referenced pdb file may or may not be included
-            pdb_code = pdb_fn.replace('.pdb', '')
+            pdb_code = pdb_fn.replace(".pdb", "")
             pdb_file = self.process_pdb(
                 pdb_code=pdb_code,
                 zfile=zfile,
@@ -238,7 +238,7 @@ def get_site_observation(
                 return pdb_file
             else:
                 logger.info(
-                    'No protein pdb (%s) found in zipfile',
+                    "No protein pdb (%s) found in zipfile",
                     pdb_fn,
                 )
 
@@ -253,7 +253,7 @@ def get_site_observation(
         except SiteObservation.DoesNotExist:
             # Initial SiteObservation lookup failed.
             logger.warning(
-                'Failed to get SiteObservation object (target=%s name=%s)',
+                "Failed to get SiteObservation object (target=%s name=%s)",
                 compound_set.target.title,
                 name,
             )
@@ -265,25 +265,25 @@ def get_site_observation(
             )
             if qs.exists():
                 logger.info(
-                    'Found SiteObservation containing name=%s qs=%s',
+                    "Found SiteObservation containing name=%s qs=%s",
                     name,
                     qs,
                 )
             else:
-                alt_name = name.split(':')[0].split('_')[0]
+                alt_name = name.split(":")[0].split("_")[0]
                 qs = SiteObservation.objects.filter(
                     code__contains=alt_name,
                     experiment__experiment_upload__target__pk=target,
                 )
                 if qs.exists():
                     logger.info(
-                        'Found SiteObservation containing alternative name=%s qs=%s',
+                        "Found SiteObservation containing alternative name=%s qs=%s",
                         alt_name,
                         qs,
                     )
             if qs.count() > 0:
                 logger.debug(
-                    'Found alternative (target=%s name=%s)',
+                    "Found alternative (target=%s name=%s)",
                     compound_set.target.title,
                     name,
                 )
@@ -291,7 +291,7 @@ def get_site_observation(
 
         if not site_obvs:
             logger.warning(
-                'No SiteObservation found (target=%s pdb_fn=%s)',
+                "No SiteObservation found (target=%s pdb_fn=%s)",
                 compound_set.target.title,
                 pdb_fn,
             )
@@ -309,7 +309,7 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]:
         qs = Compound.objects.filter(
             computedmolecule__computed_set__target=target,
         )
-        cpd_number = '1'
+        cpd_number = "1"
         try:
             # NB! Max said there could be thousands of compounds per
             # target so this distinct() here may become a problem
@@ -338,7 +338,7 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]:
             # occur. However there's nothing in the db to prevent
             # this, so adding a catch clause and writing a meaningful
             # message
-            msg = f'Duplicate compounds for target {target.title} with inchi key {inchi_key}.'
+            msg = f"Duplicate compounds for target {target.title} with inchi key {inchi_key}."
             logger.error(msg)
             raise IntegrityError(msg) from exc
 
@@ -346,9 +346,9 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]:
 
     def set_props(self, cpd, props, score_descriptions) -> List[ScoreDescription]:
         for sd, val in score_descriptions.items():
-            logger.debug('sd: %s', sd)
-            logger.debug('sd.name, val: %s: %s', sd.name, val)
-            if dataType(str(props[sd.name])) == 'TEXT':
+            logger.debug("sd: %s", sd)
+            logger.debug("sd.name, val: %s: %s", sd.name, val)
+            if dataType(str(props[sd.name])) == "TEXT":
                 score_value = TextScoreValues()
             else:
                 score_value = NumericalScoreValues()
@@ -373,7 +373,7 @@ def set_mol(
 
         smiles = Chem.MolToSmiles(mol)
         inchi = Chem.inchi.MolToInchi(mol)
-        molecule_name = mol.GetProp('_Name')
+        molecule_name = mol.GetProp("_Name")
 
         flattened_copy = copy.deepcopy(mol)
         Chem.RemoveStereochemistry(mol)
@@ -383,8 +383,8 @@ def set_mol(
             inchi, compound_set.target, name=molecule_name
         )
 
-        insp = mol.GetProp('ref_mols')
-        insp = insp.split(',')
+        insp = mol.GetProp("ref_mols")
+        insp = insp.split(",")
         insp = [i.strip() for i in insp]
         insp_frags = []
         for i in insp:
@@ -397,19 +397,19 @@ def set_mol(
                 ref = site_obvs
             except SiteObservation.DoesNotExist:
                 qs = SiteObservation.objects.filter(
-                    code=str(i.split(':')[0].split('_')[0]),
+                    code=str(i.split(":")[0].split("_")[0]),
                     experiment__experiment_upload__target=compound_set.target,
                 )
                 if not qs.exists():
                     raise IntegrityError(  # pylint: disable=raise-missing-from
-                        'No matching molecules found for inspiration frag ' + i
+                        "No matching molecules found for inspiration frag " + i
                     )
 
-                ref = qs.order_by('-cmpd_id').first()
+                ref = qs.order_by("-cmpd_id").first()
 
             insp_frags.append(ref)
 
-        ref_property = 'ref_pdb'
+        ref_property = "ref_pdb"
         # data in ref ref_pdb field may be one of 2 things:
         # - siteobservation's short code (code field)
         # - pdb file in uploaded zipfile
@@ -423,7 +423,7 @@ def set_mol(
         )
         if not ref_so:
             logger.warning(
-                'Failed to get a Reference SiteObservation (%s) for %s, %s, %s',
+                "Failed to get a Reference SiteObservation (%s) for %s, %s, %s",
                 ref_property,
                 mol,
                 target,
@@ -438,26 +438,26 @@ def set_mol(
         # format, this is not necessary. or possible
         qs = ComputedMolecule.objects.filter(
             compound=compound,
-        ).order_by('name')
+        ).order_by("name")
 
         if qs.exists():
             # not actually latest, just last according to sorting above
             latest = qs.last()
             # regex pattern - split name like 'v1a'
             # ('(letters)(digits)(letters)' to components
-            groups = re.search(r'()(\d+)(\D+)', qs.last().name)
+            groups = re.search(r"()(\d+)(\D+)", qs.last().name)
             if groups is None or len(groups.groups()) != 3:
                 # just a quick sanity check
                 raise IntegrityError(
-                    f'Non-standard ComputedMolecule.name: {latest.name}'
+                    f"Non-standard ComputedMolecule.name: {latest.name}"
                 )
             number = groups.groups()[1]  # type: ignore [index]
             suffix = next(alphanumerator(start_from=groups.groups()[2]))  # type: ignore [index]
         else:
-            suffix = 'a'
+            suffix = "a"
             # number = 1
 
-        name = f'v{number}{suffix}'
+        name = f"v{number}{suffix}"
 
         existing_computed_molecules = []
         for k in qs:
@@ -477,7 +477,7 @@ def set_mol(
                     )
                 except RuntimeError as exc:
                     msg = (
-                        f'Failed to find alignment between {k.molecule_name} '
+                        f"Failed to find alignment between {k.molecule_name} "
                         + f'and {mol.GetProp("original ID")}'
                     )
                     logger.error(msg)
@@ -498,18 +498,18 @@ def set_mol(
 
         if len(existing_computed_molecules) == 1:
             logger.warning(
-                'Using existing ComputedMolecule %s and overwriting its metadata',
+                "Using existing ComputedMolecule %s and overwriting its metadata",
                 existing_computed_molecules[0],
             )
             computed_molecule = existing_computed_molecules[0]
         elif len(existing_computed_molecules) > 1:
-            logger.warning('Deleting existing ComputedMolecules (more than 1 found')
+            logger.warning("Deleting existing ComputedMolecules (more than 1 found")
             for exist in existing_computed_molecules:
-                logger.info('Deleting ComputedMolecule %s', exist)
+                logger.info("Deleting ComputedMolecule %s", exist)
                 exist.delete()
             computed_molecule = ComputedMolecule(name=name)
         else:
-            logger.info('Creating new ComputedMolecule')
+            logger.info("Creating new ComputedMolecule")
             computed_molecule = ComputedMolecule(name=name)
 
         if isinstance(ref_so, SiteObservation):
@@ -537,11 +537,11 @@ def set_mol(
         # Extract possible reference URL and Rationale
         # URLs have to be valid URLs and rationals must contain more than one word
         ref_url: Optional[str] = (
-            mol.GetProp('ref_url') if mol.HasProp('ref_url') else None
+            mol.GetProp("ref_url") if mol.HasProp("ref_url") else None
         )
         computed_molecule.ref_url = ref_url if is_url(ref_url) else None
         rationale: Optional[str] = (
-            mol.GetProp('rationale') if mol.HasProp('rationale') else None
+            mol.GetProp("rationale") if mol.HasProp("rationale") else None
         )
         computed_molecule.rationale = rationale if word_count(rationale) > 1 else None
         # To avoid the error...
@@ -567,7 +567,7 @@ def set_mol(
         return computed_molecule
 
     def get_submission_info(self, description_mol) -> ComputedSetSubmitter:
-        datestring = description_mol.GetProp('generation_date')
+        datestring = description_mol.GetProp("generation_date")
         try:
             date = parse(datestring, dayfirst=True)
         except ValueError as exc:
@@ -576,10 +576,10 @@ def get_submission_info(self, description_mol) -> ComputedSetSubmitter:
             raise IntegrityError(msg) from exc
 
         return ComputedSetSubmitter.objects.get_or_create(
-            name=description_mol.GetProp('submitter_name'),
-            method=description_mol.GetProp('method'),
-            email=description_mol.GetProp('submitter_email'),
-            institution=description_mol.GetProp('submitter_institution'),
+            name=description_mol.GetProp("submitter_name"),
+            method=description_mol.GetProp("method"),
+            email=description_mol.GetProp("submitter_email"),
+            institution=description_mol.GetProp("submitter_institution"),
             generation_date=date,
         )[0]
 
@@ -593,19 +593,19 @@ def process_mol(
         zfile=None,
         zfile_hashvals=None,
     ) -> None:
-        molecule_name = mol.GetProp('_Name')
-        logger.debug('+ process_mol %s', molecule_name)
+        molecule_name = mol.GetProp("_Name")
+        logger.debug("+ process_mol %s", molecule_name)
 
         other_props = mol.GetPropsAsDict()
         skip_mol = False
 
         # if ref_mols or ref_pdb is missing skip the molecule
-        for prop in ['ref_mols', 'ref_pdb']:
+        for prop in ["ref_mols", "ref_pdb"]:
             if prop not in other_props.keys():
                 self.messages = add_warning(
                     molecule_name=molecule_name,
                     field=prop,
-                    warning_string=f'Property {prop} missing. Skipping molecule!',
+                    warning_string=f"Property {prop} missing. Skipping molecule!",
                     validate_dict=self.messages,
                 )
                 skip_mol = True
@@ -613,7 +613,7 @@ def process_mol(
                 self.messages = add_warning(
                     molecule_name=molecule_name,
                     field=prop,
-                    warning_string=f'Property {prop} undefined. Skipping molecule!',
+                    warning_string=f"Property {prop} undefined. Skipping molecule!",
                     validate_dict=self.messages,
                 )
                 skip_mol = True
@@ -624,16 +624,16 @@ def process_mol(
             if prop not in other_props.keys():
                 # non-header molecules don't need header fields
                 continue
-            
+
             if other_props[prop] not in EMPTY_VALUES:
                 # header fields in non-header molecules have values ignored
                 self.messages = add_warning(
                     molecule_name=molecule_name,
                     field=prop,
-                    warning_string=f'Property {prop} value {other_props[prop]} ignored.',
+                    warning_string=f"Property {prop} value {other_props[prop]} ignored.",
                     validate_dict=self.messages,
                 )
-            
+
             # get rid of the header field property on the non-header molecule
             del other_props[prop]
 
@@ -659,23 +659,23 @@ def set_descriptions(
         )
 
         computed_set.submitter = self.get_submission_info(description_mol)
-        if description_mol.HasProp('ref_url'):
-            computed_set.method_url = description_mol.GetProp('ref_url')
+        if description_mol.HasProp("ref_url"):
+            computed_set.method_url = description_mol.GetProp("ref_url")
         computed_set.save()
 
         description_dict = description_mol.GetPropsAsDict()
-        logger.debug('index mol original values: %s', description_dict)
+        logger.debug("index mol original values: %s", description_dict)
         # score descriptions for this upload, doesn't matter if
         # created or existing
         score_descriptions = {}
         errors = []
         for key in description_dict.keys():
             if key in descriptions_needed and key not in [
-                'ref_mols',
-                'ref_pdb',
-                'index',
-                'Name',
-                'original SMILES',
+                "ref_mols",
+                "ref_pdb",
+                "index",
+                "Name",
+                "original SMILES",
             ]:
                 description, _ = ScoreDescription.objects.get_or_create(
                     computed_set=computed_set,
@@ -687,10 +687,10 @@ def set_descriptions(
 
                 if key in HEADER_MOL_FIELDS:
                     if value in EMPTY_VALUES:
-                        msg = f'Empty value for {key} in header molecule'
+                        msg = f"Empty value for {key} in header molecule"
                         errors.append(msg)
                         logger.error(msg)
-                    if key == 'submitter_email':
+                    if key == "submitter_email":
                         try:
                             validate_email(value)
                         except ValidationError:
@@ -700,7 +700,7 @@ def set_descriptions(
 
                 score_descriptions[description] = value
 
-        logger.debug('index mol values: %s', score_descriptions.values())
+        logger.debug("index mol values: %s", score_descriptions.values())
         if errors:
             raise IntegrityError(errors)
 
@@ -708,7 +708,7 @@ def set_descriptions(
 
     def task(self) -> tuple[ComputedSet, dict]:
         # Truncate submitted method (lower-case)?
-        truncated_submitter_method: str = 'unspecified'
+        truncated_submitter_method: str = "unspecified"
         try:
             with transaction.atomic():
                 if self.submitter_method:
@@ -731,7 +731,7 @@ def task(self) -> tuple[ComputedSet, dict]:
                 try:
                     computed_set = ComputedSet.objects.get(name=self.computed_set_name)
                     # refresh some attributes
-                    computed_set.md_ordinal = F('md_ordinal') + 1
+                    computed_set.md_ordinal = F("md_ordinal") + 1
                     computed_set.upload_date = datetime.date.today()
                     computed_set.save()
                 except ComputedSet.DoesNotExist:
@@ -745,13 +745,13 @@ def task(self) -> tuple[ComputedSet, dict]:
                     except Target.DoesNotExist as exc:
                         # target's existance should be validated in the view,
                         # this could hardly happen
-                        msg = f'Target {self.target_id} does not exist'
+                        msg = f"Target {self.target_id} does not exist"
                         logger.error(msg)
                         raise IntegrityError(msg) from exc
 
                     cs_name: str = (
-                        f'{truncated_submitter_method}-{str(today)}-'
-                        + f'{get_column_letter(new_ordinal)}'
+                        f"{truncated_submitter_method}-{str(today)}-"
+                        + f"{get_column_letter(new_ordinal)}"
                     )
                     logger.info('Creating new ComputedSet "%s"', cs_name)
 
@@ -761,13 +761,13 @@ def task(self) -> tuple[ComputedSet, dict]:
                         upload_date=today,
                         method=self.submitter_method[: ComputedSet.LENGTH_METHOD],
                         target=target,
-                        spec_version=float(self.version.strip('ver_')),
+                        spec_version=float(self.version.strip("ver_")),
                     )
                     if self.user_id:
                         try:
                             computed_set.owner_user = User.objects.get(id=self.user_id)
                         except User.DoesNotExist as exc:
-                            msg = f'User {self.user_id} does not exist'
+                            msg = f"User {self.user_id} does not exist"
                             logger.error(msg)
                             raise IntegrityError(msg) from exc
 
@@ -787,10 +787,10 @@ def task(self) -> tuple[ComputedSet, dict]:
                 )
 
                 # Process the molecules
-                logger.info('%s mols_to_process=%s', computed_set, len(mols_to_process))
+                logger.info("%s mols_to_process=%s", computed_set, len(mols_to_process))
                 for i in range(len(mols_to_process)):
                     logger.debug(
-                        'processing mol %s: %s', i, mols_to_process[i].GetProp('_Name')
+                        "processing mol %s: %s", i, mols_to_process[i].GetProp("_Name")
                     )
                     self.process_mol(
                         mols_to_process[i],
@@ -821,7 +821,7 @@ def task(self) -> tuple[ComputedSet, dict]:
             settings.MEDIA_ROOT, settings.COMPUTED_SET_MEDIA_DIRECTORY
         )
         if not os.path.isdir(cmp_set_folder):
-            logger.info('Making ComputedSet folder (%s)', cmp_set_folder)
+            logger.info("Making ComputedSet folder (%s)", cmp_set_folder)
             os.mkdir(cmp_set_folder)
 
         # move and save the compound set
@@ -829,7 +829,7 @@ def task(self) -> tuple[ComputedSet, dict]:
             Path(settings.MEDIA_ROOT)
             .joinpath(settings.COMPUTED_SET_MEDIA_DIRECTORY)
             .joinpath(
-                f'{computed_set.name}_upload_{computed_set.md_ordinal}_{Path(sdf_filename).name}'
+                f"{computed_set.name}_upload_{computed_set.md_ordinal}_{Path(sdf_filename).name}"
             )
         )
         os.rename(sdf_filename, new_filename)
@@ -837,7 +837,7 @@ def task(self) -> tuple[ComputedSet, dict]:
         computed_set.written_sdf_filename = new_filename
         computed_set.save()
 
-        logger.info('Created %s', computed_set)
+        logger.info("Created %s", computed_set)
 
         return computed_set, self.messages
 
@@ -848,23 +848,23 @@ def blank_mol_vals(sdf_file) -> Tuple[str, str, str]:
     """
     suppl = Chem.SDMolSupplier(sdf_file)
     if not suppl:
-        return '', '', ''
+        return "", "", ""
     # print('%d mols detected (including blank mol)' % (len(suppl),))
     blank_mol = suppl[0]
     if not blank_mol:
-        return '', '', ''
+        return "", "", ""
 
     # Get submitter name/info for passing into upload to get unique name
-    submitter_name = ''
-    if blank_mol.HasProp('submitter_name'):
-        submitter_name = blank_mol.GetProp('submitter_name')
+    submitter_name = ""
+    if blank_mol.HasProp("submitter_name"):
+        submitter_name = blank_mol.GetProp("submitter_name")
 
-    submitter_method = ''
-    if blank_mol.HasProp('method'):
-        submitter_method = blank_mol.GetProp('method')
+    submitter_method = ""
+    if blank_mol.HasProp("method"):
+        submitter_method = blank_mol.GetProp("method")
 
-    version = ''
-    if blank_mol.HasProp('_Name'):
-        version = blank_mol.GetProp('_Name')
+    version = ""
+    if blank_mol.HasProp("_Name"):
+        version = blank_mol.GetProp("_Name")
 
     return submitter_name, submitter_method, version

From b2f5738df1bbdbe1b425a9e87e18250578dee5e2 Mon Sep 17 00:00:00 2001
From: Max Winokan <mwinokan@me.com>
Date: Fri, 6 Dec 2024 10:15:01 +0000
Subject: [PATCH 8/9] black lint

---
 viewer/cset_upload.py | 234 +++++++++++++++++++++---------------------
 1 file changed, 117 insertions(+), 117 deletions(-)

diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py
index 6531a86e..5d3100b7 100644
--- a/viewer/cset_upload.py
+++ b/viewer/cset_upload.py
@@ -50,62 +50,62 @@
 _DIST_LIMIT = 0.5
 
 EMPTY_VALUES = (
-    'nan',
-    '',
+    "nan",
+    "",
     None,
     np.nan,
 )
 
 
 HEADER_MOL_FIELDS = (
-    'ref_url',
-    'method',
-    'submitter_name',
-    'submitter_institution',
-    'submitter_email',
-    'generation_date',
+    "ref_url",
+    "method",
+    "submitter_name",
+    "submitter_institution",
+    "submitter_email",
+    "generation_date",
 )
 
 
 def dataType(a_str: str) -> str:
     lean_str = a_str.strip()
     if not lean_str:
-        return 'BLANK'
+        return "BLANK"
 
     try:
         t = ast.literal_eval(lean_str)
     except (ValueError, SyntaxError):
-        return 'TEXT'
+        return "TEXT"
     else:
         if type(t) in [int, int, float, bool]:
             if t in [
                 True,
                 False,
-                'TRUE',
-                'FALSE',
-                'true',
-                'false',
-                'yes',
-                'no',
-                'YES',
-                'NO',
-                'Yes',
-                'No',
+                "TRUE",
+                "FALSE",
+                "true",
+                "false",
+                "yes",
+                "no",
+                "YES",
+                "NO",
+                "Yes",
+                "No",
                 "Y",
                 "N",
                 "y",
                 "n",
             ]:
-                return 'BIT'
+                return "BIT"
             if type(t) is int or type(t) is int:
-                return 'INT'
+                return "INT"
             if type(t) is float:
-                return 'FLOAT'
+                return "FLOAT"
 
             # Can't get here?
             assert False
         else:
-            return 'TEXT'
+            return "TEXT"
 
 
 class PdbOps:
@@ -121,10 +121,10 @@ def save_pdb_zip(
             zfile_hashvals = {}
             for filename in zip_lst:
                 # only handle pdb files
-                if filename.split('.')[-1] == 'pdb':
+                if filename.split(".")[-1] == "pdb":
                     # Test if Protein object already exists
-                    code = filename.split('/')[-1].replace('.pdb', '')
-                    test_pdb_code = filename.split('/')[-1].replace('.pdb', '')
+                    code = filename.split("/")[-1].replace(".pdb", "")
+                    test_pdb_code = filename.split("/")[-1].replace(".pdb", "")
                     test_site_obvs_objs = SiteObservation.objects.filter(
                         code=test_pdb_code
                     )
@@ -132,12 +132,12 @@ def save_pdb_zip(
                     if len(test_site_obvs_objs) != 0:
                         # make a unique pdb code as not to overwrite existing object
                         rand_str = uuid.uuid4().hex
-                        test_pdb_code = f'{code}#{rand_str}'
+                        test_pdb_code = f"{code}#{rand_str}"
                         zfile_hashvals[code] = rand_str
 
-                    fn = f'{test_pdb_code}.pdb'
+                    fn = f"{test_pdb_code}.pdb"
                     pdb_path = default_storage.save(
-                        f'tmp/{fn}', ContentFile(zf.read(filename))
+                        f"tmp/{fn}", ContentFile(zf.read(filename))
                     )
                     zfile[test_pdb_code] = pdb_path
 
@@ -148,7 +148,7 @@ def save_pdb_zip(
         return zfile, zfile_hashvals
 
     def run(self, params) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, str]]]:
-        return self.save_pdb_zip(params['pdb_zip'])
+        return self.save_pdb_zip(params["pdb_zip"])
 
 
 class MolOps:
@@ -176,15 +176,15 @@ def __init__(
 
         # using the same mechanism to pass messages as validation
         self.messages: dict[str, Any] = {
-            'molecule_name': [],
-            'field': [],
-            'warning_string': [],
+            "molecule_name": [],
+            "field": [],
+            "warning_string": [],
         }
 
     def process_pdb(self, pdb_code, zfile, zfile_hashvals) -> str | None:
         for key in zfile_hashvals.keys():
             if key == pdb_code:
-                pdb_code = f'{pdb_code}#{zfile_hashvals[pdb_code]}'
+                pdb_code = f"{pdb_code}#{zfile_hashvals[pdb_code]}"
 
         try:
             pdb_fp = zfile[pdb_code]
@@ -192,7 +192,7 @@ def process_pdb(self, pdb_code, zfile, zfile_hashvals) -> str | None:
             return None
 
         # ensure filename uniqueness
-        pdb_fn = '_'.join([zfile[pdb_code].split('/')[-1], uuid.uuid4().hex])
+        pdb_fn = "_".join([zfile[pdb_code].split("/")[-1], uuid.uuid4().hex])
         pdb_field = Path(settings.COMPUTED_SET_MEDIA_DIRECTORY).joinpath(pdb_fn)
 
         new_filename = Path(settings.MEDIA_ROOT).joinpath(pdb_field)
@@ -224,11 +224,11 @@ def get_site_observation(
             )
             return None
 
-        pdb_fn = mol.GetProp(property_name).split('/')[-1]
+        pdb_fn = mol.GetProp(property_name).split("/")[-1]
 
         if zfile:
             # pdb archive uploaded. referenced pdb file may or may not be included
-            pdb_code = pdb_fn.replace('.pdb', '')
+            pdb_code = pdb_fn.replace(".pdb", "")
             pdb_file = self.process_pdb(
                 pdb_code=pdb_code,
                 zfile=zfile,
@@ -238,7 +238,7 @@ def get_site_observation(
                 return pdb_file
             else:
                 logger.info(
-                    'No protein pdb (%s) found in zipfile',
+                    "No protein pdb (%s) found in zipfile",
                     pdb_fn,
                 )
 
@@ -253,7 +253,7 @@ def get_site_observation(
         except SiteObservation.DoesNotExist:
             # Initial SiteObservation lookup failed.
             logger.warning(
-                'Failed to get SiteObservation object (target=%s name=%s)',
+                "Failed to get SiteObservation object (target=%s name=%s)",
                 compound_set.target.title,
                 name,
             )
@@ -265,25 +265,25 @@ def get_site_observation(
             )
             if qs.exists():
                 logger.info(
-                    'Found SiteObservation containing name=%s qs=%s',
+                    "Found SiteObservation containing name=%s qs=%s",
                     name,
                     qs,
                 )
             else:
-                alt_name = name.split(':')[0].split('_')[0]
+                alt_name = name.split(":")[0].split("_")[0]
                 qs = SiteObservation.objects.filter(
                     code__contains=alt_name,
                     experiment__experiment_upload__target__pk=target,
                 )
                 if qs.exists():
                     logger.info(
-                        'Found SiteObservation containing alternative name=%s qs=%s',
+                        "Found SiteObservation containing alternative name=%s qs=%s",
                         alt_name,
                         qs,
                     )
             if qs.count() > 0:
                 logger.debug(
-                    'Found alternative (target=%s name=%s)',
+                    "Found alternative (target=%s name=%s)",
                     compound_set.target.title,
                     name,
                 )
@@ -291,7 +291,7 @@ def get_site_observation(
 
         if not site_obvs:
             logger.warning(
-                'No SiteObservation found (target=%s pdb_fn=%s)',
+                "No SiteObservation found (target=%s pdb_fn=%s)",
                 compound_set.target.title,
                 pdb_fn,
             )
@@ -309,7 +309,7 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]:
         qs = Compound.objects.filter(
             computedmolecule__computed_set__target=target,
         )
-        cpd_number = '1'
+        cpd_number = "1"
         try:
             # NB! Max said there could be thousands of compounds per
             # target so this distinct() here may become a problem
@@ -338,7 +338,7 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]:
             # occur. However there's nothing in the db to prevent
             # this, so adding a catch clause and writing a meaningful
             # message
-            msg = f'Duplicate compounds for target {target.title} with inchi key {inchi_key}.'
+            msg = f"Duplicate compounds for target {target.title} with inchi key {inchi_key}."
             logger.error(msg)
             raise IntegrityError(msg) from exc
 
@@ -346,9 +346,9 @@ def create_mol(self, inchi, target, name=None) -> tuple[Compound, str]:
 
     def set_props(self, cpd, props, score_descriptions) -> List[ScoreDescription]:
         for sd, val in score_descriptions.items():
-            logger.debug('sd: %s', sd)
-            logger.debug('sd.name, val: %s: %s', sd.name, val)
-            if dataType(str(props[sd.name])) == 'TEXT':
+            logger.debug("sd: %s", sd)
+            logger.debug("sd.name, val: %s: %s", sd.name, val)
+            if dataType(str(props[sd.name])) == "TEXT":
                 score_value = TextScoreValues()
             else:
                 score_value = NumericalScoreValues()
@@ -373,7 +373,7 @@ def set_mol(
 
         smiles = Chem.MolToSmiles(mol)
         inchi = Chem.inchi.MolToInchi(mol)
-        molecule_name = mol.GetProp('_Name')
+        molecule_name = mol.GetProp("_Name")
 
         flattened_copy = copy.deepcopy(mol)
         Chem.RemoveStereochemistry(mol)
@@ -383,8 +383,8 @@ def set_mol(
             inchi, compound_set.target, name=molecule_name
         )
 
-        insp = mol.GetProp('ref_mols')
-        insp = insp.split(',')
+        insp = mol.GetProp("ref_mols")
+        insp = insp.split(",")
         insp = [i.strip() for i in insp]
         insp_frags = []
         for i in insp:
@@ -397,19 +397,19 @@ def set_mol(
                 ref = site_obvs
             except SiteObservation.DoesNotExist:
                 qs = SiteObservation.objects.filter(
-                    code=str(i.split(':')[0].split('_')[0]),
+                    code=str(i.split(":")[0].split("_")[0]),
                     experiment__experiment_upload__target=compound_set.target,
                 )
                 if not qs.exists():
                     raise IntegrityError(  # pylint: disable=raise-missing-from
-                        'No matching molecules found for inspiration frag ' + i
+                        "No matching molecules found for inspiration frag " + i
                     )
 
-                ref = qs.order_by('-cmpd_id').first()
+                ref = qs.order_by("-cmpd_id").first()
 
             insp_frags.append(ref)
 
-        ref_property = 'ref_pdb'
+        ref_property = "ref_pdb"
         # data in ref ref_pdb field may be one of 2 things:
         # - siteobservation's short code (code field)
         # - pdb file in uploaded zipfile
@@ -423,7 +423,7 @@ def set_mol(
         )
         if not ref_so:
             logger.warning(
-                'Failed to get a Reference SiteObservation (%s) for %s, %s, %s',
+                "Failed to get a Reference SiteObservation (%s) for %s, %s, %s",
                 ref_property,
                 mol,
                 target,
@@ -438,26 +438,26 @@ def set_mol(
         # format, this is not necessary. or possible
         qs = ComputedMolecule.objects.filter(
             compound=compound,
-        ).order_by('name')
+        ).order_by("name")
 
         if qs.exists():
             # not actually latest, just last according to sorting above
             latest = qs.last()
             # regex pattern - split name like 'v1a'
             # ('(letters)(digits)(letters)' to components
-            groups = re.search(r'()(\d+)(\D+)', qs.last().name)
+            groups = re.search(r"()(\d+)(\D+)", qs.last().name)
             if groups is None or len(groups.groups()) != 3:
                 # just a quick sanity check
                 raise IntegrityError(
-                    f'Non-standard ComputedMolecule.name: {latest.name}'
+                    f"Non-standard ComputedMolecule.name: {latest.name}"
                 )
             number = groups.groups()[1]  # type: ignore [index]
             suffix = next(alphanumerator(start_from=groups.groups()[2]))  # type: ignore [index]
         else:
-            suffix = 'a'
+            suffix = "a"
             # number = 1
 
-        name = f'v{number}{suffix}'
+        name = f"v{number}{suffix}"
 
         existing_computed_molecules = []
         for k in qs:
@@ -477,7 +477,7 @@ def set_mol(
                     )
                 except RuntimeError as exc:
                     msg = (
-                        f'Failed to find alignment between {k.molecule_name} '
+                        f"Failed to find alignment between {k.molecule_name} "
                         + f'and {mol.GetProp("original ID")}'
                     )
                     logger.error(msg)
@@ -498,18 +498,18 @@ def set_mol(
 
         if len(existing_computed_molecules) == 1:
             logger.warning(
-                'Using existing ComputedMolecule %s and overwriting its metadata',
+                "Using existing ComputedMolecule %s and overwriting its metadata",
                 existing_computed_molecules[0],
             )
             computed_molecule = existing_computed_molecules[0]
         elif len(existing_computed_molecules) > 1:
-            logger.warning('Deleting existing ComputedMolecules (more than 1 found')
+            logger.warning("Deleting existing ComputedMolecules (more than 1 found")
             for exist in existing_computed_molecules:
-                logger.info('Deleting ComputedMolecule %s', exist)
+                logger.info("Deleting ComputedMolecule %s", exist)
                 exist.delete()
             computed_molecule = ComputedMolecule(name=name)
         else:
-            logger.info('Creating new ComputedMolecule')
+            logger.info("Creating new ComputedMolecule")
             computed_molecule = ComputedMolecule(name=name)
 
         if isinstance(ref_so, SiteObservation):
@@ -537,11 +537,11 @@ def set_mol(
         # Extract possible reference URL and Rationale
         # URLs have to be valid URLs and rationals must contain more than one word
         ref_url: Optional[str] = (
-            mol.GetProp('ref_url') if mol.HasProp('ref_url') else None
+            mol.GetProp("ref_url") if mol.HasProp("ref_url") else None
         )
         computed_molecule.ref_url = ref_url if is_url(ref_url) else None
         rationale: Optional[str] = (
-            mol.GetProp('rationale') if mol.HasProp('rationale') else None
+            mol.GetProp("rationale") if mol.HasProp("rationale") else None
         )
         computed_molecule.rationale = rationale if word_count(rationale) > 1 else None
         # To avoid the error...
@@ -567,7 +567,7 @@ def set_mol(
         return computed_molecule
 
     def get_submission_info(self, description_mol) -> ComputedSetSubmitter:
-        datestring = description_mol.GetProp('generation_date')
+        datestring = description_mol.GetProp("generation_date")
         try:
             date = parse(datestring, dayfirst=True)
         except ValueError as exc:
@@ -576,10 +576,10 @@ def get_submission_info(self, description_mol) -> ComputedSetSubmitter:
             raise IntegrityError(msg) from exc
 
         return ComputedSetSubmitter.objects.get_or_create(
-            name=description_mol.GetProp('submitter_name'),
-            method=description_mol.GetProp('method'),
-            email=description_mol.GetProp('submitter_email'),
-            institution=description_mol.GetProp('submitter_institution'),
+            name=description_mol.GetProp("submitter_name"),
+            method=description_mol.GetProp("method"),
+            email=description_mol.GetProp("submitter_email"),
+            institution=description_mol.GetProp("submitter_institution"),
             generation_date=date,
         )[0]
 
@@ -593,19 +593,19 @@ def process_mol(
         zfile=None,
         zfile_hashvals=None,
     ) -> None:
-        molecule_name = mol.GetProp('_Name')
-        logger.debug('+ process_mol %s', molecule_name)
+        molecule_name = mol.GetProp("_Name")
+        logger.debug("+ process_mol %s", molecule_name)
 
         other_props = mol.GetPropsAsDict()
         skip_mol = False
 
         # if ref_mols or ref_pdb is missing skip the molecule
-        for prop in ['ref_mols', 'ref_pdb']:
+        for prop in ["ref_mols", "ref_pdb"]:
             if prop not in other_props.keys():
                 self.messages = add_warning(
                     molecule_name=molecule_name,
                     field=prop,
-                    warning_string=f'Property {prop} missing. Skipping molecule!',
+                    warning_string=f"Property {prop} missing. Skipping molecule!",
                     validate_dict=self.messages,
                 )
                 skip_mol = True
@@ -613,7 +613,7 @@ def process_mol(
                 self.messages = add_warning(
                     molecule_name=molecule_name,
                     field=prop,
-                    warning_string=f'Property {prop} undefined. Skipping molecule!',
+                    warning_string=f"Property {prop} undefined. Skipping molecule!",
                     validate_dict=self.messages,
                 )
                 skip_mol = True
@@ -624,16 +624,16 @@ def process_mol(
             if prop not in other_props.keys():
                 # non-header molecules don't need header fields
                 continue
-            
+
             if other_props[prop] not in EMPTY_VALUES:
                 # header fields in non-header molecules have values ignored
                 self.messages = add_warning(
                     molecule_name=molecule_name,
                     field=prop,
-                    warning_string=f'Property {prop} value {other_props[prop]} ignored.',
+                    warning_string=f"Property {prop} value {other_props[prop]} ignored.",
                     validate_dict=self.messages,
                 )
-            
+
             # get rid of the header field property on the non-header molecule
             del other_props[prop]
 
@@ -659,23 +659,23 @@ def set_descriptions(
         )
 
         computed_set.submitter = self.get_submission_info(description_mol)
-        if description_mol.HasProp('ref_url'):
-            computed_set.method_url = description_mol.GetProp('ref_url')
+        if description_mol.HasProp("ref_url"):
+            computed_set.method_url = description_mol.GetProp("ref_url")
         computed_set.save()
 
         description_dict = description_mol.GetPropsAsDict()
-        logger.debug('index mol original values: %s', description_dict)
+        logger.debug("index mol original values: %s", description_dict)
         # score descriptions for this upload, doesn't matter if
         # created or existing
         score_descriptions = {}
         errors = []
         for key in description_dict.keys():
             if key in descriptions_needed and key not in [
-                'ref_mols',
-                'ref_pdb',
-                'index',
-                'Name',
-                'original SMILES',
+                "ref_mols",
+                "ref_pdb",
+                "index",
+                "Name",
+                "original SMILES",
             ]:
                 description, _ = ScoreDescription.objects.get_or_create(
                     computed_set=computed_set,
@@ -687,10 +687,10 @@ def set_descriptions(
 
                 if key in HEADER_MOL_FIELDS:
                     if value in EMPTY_VALUES:
-                        msg = f'Empty value for {key} in header molecule'
+                        msg = f"Empty value for {key} in header molecule"
                         errors.append(msg)
                         logger.error(msg)
-                    if key == 'submitter_email':
+                    if key == "submitter_email":
                         try:
                             validate_email(value)
                         except ValidationError:
@@ -700,7 +700,7 @@ def set_descriptions(
 
                 score_descriptions[description] = value
 
-        logger.debug('index mol values: %s', score_descriptions.values())
+        logger.debug("index mol values: %s", score_descriptions.values())
         if errors:
             raise IntegrityError(errors)
 
@@ -708,7 +708,7 @@ def set_descriptions(
 
     def task(self) -> tuple[ComputedSet, dict]:
         # Truncate submitted method (lower-case)?
-        truncated_submitter_method: str = 'unspecified'
+        truncated_submitter_method: str = "unspecified"
         try:
             with transaction.atomic():
                 if self.submitter_method:
@@ -731,7 +731,7 @@ def task(self) -> tuple[ComputedSet, dict]:
                 try:
                     computed_set = ComputedSet.objects.get(name=self.computed_set_name)
                     # refresh some attributes
-                    computed_set.md_ordinal = F('md_ordinal') + 1
+                    computed_set.md_ordinal = F("md_ordinal") + 1
                     computed_set.upload_date = datetime.date.today()
                     computed_set.save()
                 except ComputedSet.DoesNotExist:
@@ -745,13 +745,13 @@ def task(self) -> tuple[ComputedSet, dict]:
                     except Target.DoesNotExist as exc:
                         # target's existance should be validated in the view,
                         # this could hardly happen
-                        msg = f'Target {self.target_id} does not exist'
+                        msg = f"Target {self.target_id} does not exist"
                         logger.error(msg)
                         raise IntegrityError(msg) from exc
 
                     cs_name: str = (
-                        f'{truncated_submitter_method}-{str(today)}-'
-                        + f'{get_column_letter(new_ordinal)}'
+                        f"{truncated_submitter_method}-{str(today)}-"
+                        + f"{get_column_letter(new_ordinal)}"
                     )
                     logger.info('Creating new ComputedSet "%s"', cs_name)
 
@@ -761,13 +761,13 @@ def task(self) -> tuple[ComputedSet, dict]:
                         upload_date=today,
                         method=self.submitter_method[: ComputedSet.LENGTH_METHOD],
                         target=target,
-                        spec_version=float(self.version.strip('ver_')),
+                        spec_version=float(self.version.strip("ver_")),
                     )
                     if self.user_id:
                         try:
                             computed_set.owner_user = User.objects.get(id=self.user_id)
                         except User.DoesNotExist as exc:
-                            msg = f'User {self.user_id} does not exist'
+                            msg = f"User {self.user_id} does not exist"
                             logger.error(msg)
                             raise IntegrityError(msg) from exc
 
@@ -787,10 +787,10 @@ def task(self) -> tuple[ComputedSet, dict]:
                 )
 
                 # Process the molecules
-                logger.info('%s mols_to_process=%s', computed_set, len(mols_to_process))
+                logger.info("%s mols_to_process=%s", computed_set, len(mols_to_process))
                 for i in range(len(mols_to_process)):
                     logger.debug(
-                        'processing mol %s: %s', i, mols_to_process[i].GetProp('_Name')
+                        "processing mol %s: %s", i, mols_to_process[i].GetProp("_Name")
                     )
                     self.process_mol(
                         mols_to_process[i],
@@ -821,7 +821,7 @@ def task(self) -> tuple[ComputedSet, dict]:
             settings.MEDIA_ROOT, settings.COMPUTED_SET_MEDIA_DIRECTORY
         )
         if not os.path.isdir(cmp_set_folder):
-            logger.info('Making ComputedSet folder (%s)', cmp_set_folder)
+            logger.info("Making ComputedSet folder (%s)", cmp_set_folder)
             os.mkdir(cmp_set_folder)
 
         # move and save the compound set
@@ -829,7 +829,7 @@ def task(self) -> tuple[ComputedSet, dict]:
             Path(settings.MEDIA_ROOT)
             .joinpath(settings.COMPUTED_SET_MEDIA_DIRECTORY)
             .joinpath(
-                f'{computed_set.name}_upload_{computed_set.md_ordinal}_{Path(sdf_filename).name}'
+                f"{computed_set.name}_upload_{computed_set.md_ordinal}_{Path(sdf_filename).name}"
             )
         )
         os.rename(sdf_filename, new_filename)
@@ -837,7 +837,7 @@ def task(self) -> tuple[ComputedSet, dict]:
         computed_set.written_sdf_filename = new_filename
         computed_set.save()
 
-        logger.info('Created %s', computed_set)
+        logger.info("Created %s", computed_set)
 
         return computed_set, self.messages
 
@@ -848,23 +848,23 @@ def blank_mol_vals(sdf_file) -> Tuple[str, str, str]:
     """
     suppl = Chem.SDMolSupplier(sdf_file)
     if not suppl:
-        return '', '', ''
+        return "", "", ""
     # print('%d mols detected (including blank mol)' % (len(suppl),))
     blank_mol = suppl[0]
     if not blank_mol:
-        return '', '', ''
+        return "", "", ""
 
     # Get submitter name/info for passing into upload to get unique name
-    submitter_name = ''
-    if blank_mol.HasProp('submitter_name'):
-        submitter_name = blank_mol.GetProp('submitter_name')
+    submitter_name = ""
+    if blank_mol.HasProp("submitter_name"):
+        submitter_name = blank_mol.GetProp("submitter_name")
 
-    submitter_method = ''
-    if blank_mol.HasProp('method'):
-        submitter_method = blank_mol.GetProp('method')
+    submitter_method = ""
+    if blank_mol.HasProp("method"):
+        submitter_method = blank_mol.GetProp("method")
 
-    version = ''
-    if blank_mol.HasProp('_Name'):
-        version = blank_mol.GetProp('_Name')
+    version = ""
+    if blank_mol.HasProp("_Name"):
+        version = blank_mol.GetProp("_Name")
 
     return submitter_name, submitter_method, version

From abd0277ef91ae4efabec19d4b8bef3de10f730d0 Mon Sep 17 00:00:00 2001
From: Kalev Takkis <ktakkis@informaticsmatters.com>
Date: Fri, 6 Dec 2024 10:31:12 +0000
Subject: [PATCH 9/9] fix: code linted

---
 viewer/cset_upload.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py
index 5d3100b7..8030495e 100644
--- a/viewer/cset_upload.py
+++ b/viewer/cset_upload.py
@@ -620,7 +620,6 @@ def process_mol(
 
         # if any header mol fields are defined on non-header molecules those values are ignored and a warning shown
         for prop in HEADER_MOL_FIELDS:
-
             if prop not in other_props.keys():
                 # non-header molecules don't need header fields
                 continue