diff --git a/CITATION.cff b/CITATION.cff index 7ac08d03..018949dd 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -23,35 +23,28 @@ authors: given-names: Tooba affiliation: Maastricht University orcid: https://orcid.org/0000-0002-4904-3269 - - family-names: Gadiya given-names: Yojana affiliation: Enveda Therapeutics (United States) orcid: https://orcid.org/0000-0002-7683-0452 - - family-names: Millán Acosta given-names: Javier orcid: https://orcid.org/0000-0002-4166-7093 - - family-names: Willighagen given-names: Egon affiliation: Maastricht University orcid: https://orcid.org/0000-0001-7542-0286 - - family-names: Mei given-names: Hailiang orcid: https://orcid.org/0000-0003-1781-5508 - - family-names: Sima given-names: Ana Claudia affiliation: Swiss Institute of Bioinformatics Geneva orcid: https://orcid.org/0000-0003-3213-4495 - - family-names: Martinát given-names: Dominik affiliation: Univerzita Palackého v Olomouci Přírodovědecká fakulta orcid: https://orcid.org/0000-0001-6611-7883 - - family-names: Adriaque Lozano given-names: Alejandro affiliation: Maastricht University @@ -62,8 +55,8 @@ identifiers: value: 10.5281/zenodo.18468943 description: Archived software release on Zenodo -repository-code: https://github.com//pyBioDataFuse -url: https://github.com//pyBioDataFuse +repository-code: https://github.com/BioDataFuse/pyBiodatafuse +url: https://github.com/BioDataFuse/pyBiodatafuse keywords: - biomedical data integration @@ -75,6 +68,32 @@ keywords: preferred-citation: type: article - title: "pyBiodatafuse: extending interoperability of data using modular queries across biomedical resources" + title: "pyBiodatafuse: extending interoperability of data using modular queries across biomedical resources" journal: Bioinformatics + year: 2026 doi: 10.1093/bioinformatics/btag064 + authors: + - family-names: Gadiya + given-names: Yojana + orcid: https://orcid.org/0000-0002-7683-0452 + - family-names: Millán Acosta + given-names: Javier + orcid: 
https://orcid.org/0000-0002-4166-7093 + - family-names: Abbassi-Daloii + given-names: Tooba + orcid: https://orcid.org/0000-0002-4904-3269 + - family-names: Willighagen + given-names: Egon + orcid: https://orcid.org/0000-0001-7542-0286 + - family-names: Mei + given-names: Hailiang + orcid: https://orcid.org/0000-0003-1781-5508 + - family-names: Sima + given-names: Ana Claudia + orcid: https://orcid.org/0000-0003-3213-4495 + - family-names: Martinát + given-names: Dominik + orcid: https://orcid.org/0000-0001-6611-7883 + - family-names: Adriaque Lozano + given-names: Alejandro + orcid: https://orcid.org/0009-0007-2725-2098 diff --git a/src/pyBiodatafuse/analyzer/explorer/patent.py b/src/pyBiodatafuse/analyzer/explorer/patent.py index 3b5fc82e..3ce9dee5 100644 --- a/src/pyBiodatafuse/analyzer/explorer/patent.py +++ b/src/pyBiodatafuse/analyzer/explorer/patent.py @@ -8,11 +8,12 @@ import time from typing import Literal, Union +import matplotlib.pyplot as plt import pandas as pd +import plotly.express as px import requests from tqdm import tqdm -import plotly.express as px -import matplotlib.pyplot as plt + from pyBiodatafuse.analyzer.utils import ( plot_hbarplot_chart, plot_pie_chart, diff --git a/src/pyBiodatafuse/annotators/intact.py b/src/pyBiodatafuse/annotators/intact.py index a326e449..d3668d8b 100644 --- a/src/pyBiodatafuse/annotators/intact.py +++ b/src/pyBiodatafuse/annotators/intact.py @@ -46,6 +46,39 @@ def check_version_intact() -> dict: return {"source_version": "unknown"} +def _normalize_intact_id(raw_id: str) -> str: + """Normalise a raw IntAct identifier field to ``namespace:accession`` form. + + The IntAct REST API returns identifiers in one of two formats: + + * ``"ID (namespace)"`` – e.g. ``"Q14118 (uniprotkb)"``, + ``"CHEBI:15361 (chebi)"``, ``"CPX-3573 (complex portal)"`` + * ``"namespace:ID"`` – legacy format, kept for safety. + + In both cases the returned string is normalised to ``namespace:ID``. + Multi-word namespace suffixes (e.g. 
``"complex portal"``) are preserved + by joining all tokens between the first ``(`` and the closing ``)``. + + :param raw_id: Raw identifier string from the API response. + :returns: Normalised ``namespace:accession`` string. + """ + raw_id = raw_id.strip() + + # Current API format: "ID (namespace)" or "CHEBI:15361 (chebi)" + if raw_id.endswith(")") and "(" in raw_id: + paren_open = raw_id.index("(") + accession = raw_id[:paren_open].strip() + namespace = raw_id[paren_open + 1 : -1].strip() # strip surrounding parens + # If the accession already carries a colon (e.g. CHEBI:15361) it is + # already in canonical form – return as-is. + if ":" in accession: + return accession + return f"{namespace}:{accession}" + + # Legacy / already-canonical format: "namespace:ID" + return raw_id + + def get_intact_interactions(gene_ids: List[str]) -> List[dict]: """Retrieve protein interactions for a list of genes from IntAct. @@ -59,19 +92,24 @@ def get_intact_interactions(gene_ids: List[str]) -> List[dict]: encoded_ids = urllib.parse.quote(joined_ids) url = f"{Cons.INTACT_ENDPOINT}/ws/interaction/findInteractions/{encoded_ids}?pageSize=200" + logger.debug("Querying IntAct interactions URL: %s", url) + try: response = requests.get(url, timeout=60) data = response.json() content = data.get("content", []) + logger.debug("IntAct returned %d interaction records for ids: %s", len(content), gene_ids) if not content: return [] - interation_info = { + # Mapping from our output field names to the IntAct REST API JSON keys. 
+ # API reference: https://www.ebi.ac.uk/intact/ws/interaction/findInteractions + interaction_field_map = { Cons.INTACT_INTERACTION_ID: "ac", - Cons.INTACT_INTERACTOR_ID_A: "acA", - Cons.INTACT_INTERACTOR_ID_B: "acB", - Cons.INTACT_SCORE: "intactMiscore", + Cons.INTACT_INTERACTOR_ID_A: "acA", # IntAct AC for interactor A + Cons.INTACT_INTERACTOR_ID_B: "acB", # IntAct AC for interactor B + Cons.INTACT_SCORE: "intactMiscore", # MI-score confidence value Cons.INTACT_BIOLOGICAL_ROLE_A: "biologicalRoleA", Cons.INTACT_BIOLOGICAL_ROLE_B: "biologicalRoleB", Cons.INTACT_TYPE: "type", @@ -83,39 +121,45 @@ def get_intact_interactions(gene_ids: List[str]) -> List[dict]: Cons.INTACT_INTERACTOR_B_SPECIES: "speciesB", Cons.INTACT_MOLECULE_A: "moleculeA", Cons.INTACT_MOLECULE_B: "moleculeB", + # idA / idB: primary identifier in "ID (namespace)" format, e.g. + # "Q14118 (uniprotkb)", "CHEBI:15361 (chebi)", "CPX-3573 (complex portal)" Cons.INTACT_ID_A: "idA", Cons.INTACT_ID_B: "idB", Cons.INTACT_PUBMED_PUBLICATION_ID: "publicationPubmedIdentifier", } interactions = [ - {key: item.get(value, np.nan) for key, value in interation_info.items()} + {key: item.get(api_key, np.nan) for key, api_key in interaction_field_map.items()} for item in content ] - # cleanup the alternative ids + # Normalise idA / idB from "ID (namespace)" → "namespace:ID" for interaction in interactions: - ids_a = interaction[Cons.INTACT_ID_A] - ids_b = interaction[Cons.INTACT_ID_B] + raw_a = interaction[Cons.INTACT_ID_A] + raw_b = interaction[Cons.INTACT_ID_B] - if ":" in ids_a: - interaction[Cons.INTACT_ID_A] = ids_a.split(" ")[0] # stays the same + if isinstance(raw_a, str): + interaction[Cons.INTACT_ID_A] = _normalize_intact_id(raw_a) else: - idx = ids_a.split(" ")[0] - namespace = ids_a.split(" ")[1].replace("(", "").replace(")", "") - interaction[Cons.INTACT_ID_A] = f"{namespace}:{idx}" - - if ":" in ids_b: - interaction[Cons.INTACT_ID_B] = ids_b.split(" ")[0] # stays the same + logger.debug( + "Unexpected 
non-string idA value: %r (interaction %s)", + raw_a, + interaction.get(Cons.INTACT_INTERACTION_ID), + ) + + if isinstance(raw_b, str): + interaction[Cons.INTACT_ID_B] = _normalize_intact_id(raw_b) else: - idx = ids_b.split(" ")[0] - namespace = ids_b.split(" ")[1].replace("(", "").replace(")", "") - interaction[Cons.INTACT_ID_B] = f"{namespace}:{idx}" + logger.debug( + "Unexpected non-string idB value: %r (interaction %s)", + raw_b, + interaction.get(Cons.INTACT_INTERACTION_ID), + ) return interactions except requests.RequestException as e: - logger.warning(f"Batch request failed for genes {gene_ids}: {e}") + logger.warning("Batch request failed for ids %s: %s", gene_ids, e) return [] @@ -126,12 +170,16 @@ def get_protein_intact_acs(id_of_interest: str) -> List[str]: :returns: Interactor information if possible, empty list if not. """ url = f"{Cons.INTACT_ENDPOINT}/ws/interactor/findInteractor/{id_of_interest}?pageSize=100" + logger.debug("Querying IntAct interactor lookup URL: %s", url) try: response = requests.get(url, timeout=120) response.raise_for_status() data = response.json() content = data.get("content", []) + logger.debug( + "IntAct interactor lookup returned %d records for %s", len(content), id_of_interest + ) protein_acs = [] for item in content: @@ -141,10 +189,13 @@ def get_protein_intact_acs(id_of_interest: str) -> List[str]: if interactor_type == "protein": protein_acs.append(interactor_ac) + logger.debug( + "Found %d protein AC(s) for %s: %s", len(protein_acs), id_of_interest, protein_acs + ) return protein_acs except requests.exceptions.RequestException as e: - logger.warning(f"Failed to get interactors for {id_of_interest}: {e}") + logger.warning("Failed to get interactors for %s: %s", id_of_interest, e) return [] @@ -170,6 +221,13 @@ def get_filtered_interactions( """ results: Dict[str, List[dict]] = {idx: [] for idx in batch_ids} interactions = get_intact_interactions(batch_ids) + logger.debug( + "Filtering %d raw interactions for batch %s 
(interaction_type=%s, is_compound=%s)", + len(interactions), + batch_ids, + interaction_type, + is_compound, + ) for interaction in interactions: if interaction_type in Cons.INTACT_GENE_INTERACTION_TYPES and not is_compound: @@ -234,6 +292,13 @@ def get_filtered_interactions( keep_interaction = True if not keep_interaction: + logger.debug( + "Skipping interaction %s (id_a=%s, id_b=%s) – does not match type '%s'", + interaction.get(Cons.INTACT_INTERACTION_ID), + id_a, + id_b, + interaction_type, + ) continue for idx in batch_ids: @@ -252,9 +317,12 @@ def get_filtered_interactions( for gene_id in batch_ids: if not results[gene_id]: + logger.debug("No interactions found for %s – inserting empty entry", gene_id) empty_entry = {key: np.nan for key in Cons.INTACT_OUTPUT_DICT} empty_entry["intact_link_to"] = np.nan results[gene_id] = [empty_entry] + else: + logger.debug("Kept %d interaction(s) for %s", len(results[gene_id]), gene_id) return results diff --git a/src/pyBiodatafuse/graph/rdf/graphdb.py b/src/pyBiodatafuse/graph/rdf/graphdb.py index 6d47ff5a..1f717e6e 100644 --- a/src/pyBiodatafuse/graph/rdf/graphdb.py +++ b/src/pyBiodatafuse/graph/rdf/graphdb.py @@ -83,9 +83,7 @@ def create_repository( graphdb:enable-literal-index "true" ; ] ]. 
- """.format( - repository_name=repository_name - ) + """.format(repository_name=repository_name) # Save the configuration content to a temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=".ttl") as temp_file: temp_file.write(config_content.encode("utf-8")) diff --git a/tests/annotators/test_intact.py b/tests/annotators/test_intact.py index 934d1a86..b21e58fd 100644 --- a/tests/annotators/test_intact.py +++ b/tests/annotators/test_intact.py @@ -39,7 +39,7 @@ def test_get_interactions(self): "interaction_id": "EBI-7882257", "interactor_id_A": "EBI-1755945", "interactor_id_B": "EBI-1755945", - "score": 0.0, + "score": 0.56, "biological_role_A": "unspecified role", "biological_role_B": "unspecified role", "type": "direct interaction", @@ -60,7 +60,7 @@ def test_get_interactions(self): "interaction_id": "EBI-7882311", "interactor_id_A": "EBI-1755945", "interactor_id_B": "EBI-1755945", - "score": 0.0, + "score": 0.56, "biological_role_A": "unspecified role", "biological_role_B": "unspecified role", "type": "direct interaction", @@ -81,7 +81,7 @@ def test_get_interactions(self): "interaction_id": "EBI-5327885", "interactor_id_A": "EBI-5327879", "interactor_id_B": "EBI-1755945", - "score": 0.0, + "score": 0.4, "biological_role_A": "unspecified role", "biological_role_B": "unspecified role", "type": "physical association", @@ -130,7 +130,7 @@ def test_get_compound_interactions(self): "interaction_id": "EBI-9301798", "interactor_id_A": "EBI-9096", "interactor_id_B": "EBI-6621808", - "score": 0.0, + "score": 0.44, "biological_role_A": "enzyme", "biological_role_B": "enzyme target", "type": "enzymatic reaction", @@ -151,7 +151,7 @@ def test_get_compound_interactions(self): "interaction_id": "EBI-6621805", "interactor_id_A": "EBI-372327", "interactor_id_B": "EBI-6621808", - "score": 0.0, + "score": 0.44, "biological_role_A": "enzyme", "biological_role_B": "enzyme target", "type": "enzymatic reaction",