From 6b704804a5e41e75b440929bb9238841cfd22906 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 3 Dec 2024 11:26:52 +0100 Subject: [PATCH 1/7] [ADD] ExtantSyntenyRels table defintion --- pyoma/browser/tablefmt.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pyoma/browser/tablefmt.py b/pyoma/browser/tablefmt.py index 2208d504..b82a7394 100644 --- a/pyoma/browser/tablefmt.py +++ b/pyoma/browser/tablefmt.py @@ -33,9 +33,7 @@ class OrthoXmlHogTable(tables.IsDescription): HogAugmentedBufferLength = tables.UInt32Col(pos=4) -class AncestralSyntenyRels(tables.IsDescription): - HogRow1 = tables.UInt32Col(pos=0) - HogRow2 = tables.UInt32Col(pos=1) +class AbstractSyntenyRels(tables.IsDescription): Weight = tables.Float16Col(pos=2) Evidence = tables.EnumCol( tables.Enum({"linearized": 1, "parsimonious": 2, "any": 4}), @@ -46,6 +44,16 @@ class AncestralSyntenyRels(tables.IsDescription): LCA_taxid = tables.Int32Col(pos=4, dflt=-1) +class AncestralSyntenyRels(AbstractSyntenyRels): + HogRow1 = tables.UInt32Col(pos=0) + HogRow2 = tables.UInt32Col(pos=1) + + +class ExtantSyntenyRels(AbstractSyntenyRels): + EntryNr1 = tables.UInt32Col(pos=0) + EntryNr2 = tables.UInt32Col(pos=1) + + class ProteinTable(tables.IsDescription): EntryNr = tables.UInt32Col(pos=1) SeqBufferOffset = tables.UInt64Col(pos=2) From 448d49665cc32c7acc1f360c08891b04beeb0684 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Wed, 24 Sep 2025 00:47:14 +0200 Subject: [PATCH 2/7] [RF] add is_orthologous_to/is_paralogous_to method to ProteinEntry model - based on OmaHOG id. - various unittests --- pyoma/browser/db.py | 30 +++++------ pyoma/browser/models.py | 48 +++++++++++++++++- tests/browser/test_models.py | 96 ++++++++++++++++++++++++++++++++++++ 3 files changed, 156 insertions(+), 18 deletions(-) diff --git a/pyoma/browser/db.py b/pyoma/browser/db.py index 7001b487..f23e1ae2 100644 --- a/pyoma/browser/db.py +++ b/pyoma/browser/db.py @@ -742,18 +742,16 @@ def get_hog_induced_pairwise_orthologs(self, entry): entry = self.ensure_entry(entry) genome_entry_range = self.id_mapper["OMA"].genome_range(entry["EntryNr"]) - def is_orthologous(a, b): + def is_orthologous(a: ProteinEntry, b: ProteinEntry): """genes are orthologs if their HOG id have a common prefix that is either the base id of the family or the prefix does not end with a subfamily number, ie. not a digit as common prefix. See LOFT paper for details on encoding.""" - if a["EntryNr"] == b["EntryNr"]: - return False - prefix = os.path.commonprefix((a["OmaHOG"], b["OmaHOG"])).decode() - if "." in prefix and prefix[-1].isdigit(): + prefix_or_false = a.is_orthologous_to(b) + if prefix_or_false is False: return False # count number of genes in query genome that are co-orthologs (== having the prefix) - cnts = numpy.char.startswith(hogids_of_genes_in_query_genome, prefix.encode("utf-8")).sum() + cnts = numpy.char.startswith(hogids_of_genes_in_query_genome, prefix_or_false.encode("utf-8")).sum() return cnts try: @@ -767,8 +765,9 @@ def is_orthologous(a, b): (hog_member["EntryNr"] >= genome_entry_range[0]) & (hog_member["EntryNr"] < genome_entry_range[1]) ) ]["OmaHOG"] + pe = ProteinEntry(self, entry) query_genome_genes_cnt = numpy.array( - [is_orthologous(entry, hog_member[i]) for i in range(len(hog_member))], + [is_orthologous(pe, ProteinEntry(self, hog_member[i])) for i in range(len(hog_member))], dtype="i4", ) mask = numpy.asarray(query_genome_genes_cnt, bool) @@ -811,18 +810,14 @@ def get_hog_induced_pairwise_paralogs(self, entry, start=0, stop=None): levels = levels[numpy.isin(levels["Level"], lineage)] hog_member = self._members_of_hog_id(self.format_hogid(fam)) - def is_paralogous(a, b): + def is_paralogous(a: ProteinEntry, b: ProteinEntry): """genes are orthologs if their HOG id have a common prefix that is either the base id of the family or the prefix does not end with a subfamily number, ie. not a digit as common prefix. See LOFT paper for details on encoding.""" - if a["EntryNr"] == b["EntryNr"]: - return False - prefix = os.path.commonprefix((a["OmaHOG"], b["OmaHOG"])).decode() - if "." in prefix and prefix[-1].isdigit(): - # gene is paralog. find MRCA in taxonomy of common HOGid prefix - k = prefix.rfind(".") - hog_id = prefix[:k].encode("ascii") + prefix_or_false = a.is_paralogous_to(b) + if prefix_or_false: + hog_id = prefix_or_false.encode("ascii") cand_levels = levels[numpy.where(levels["ID"] == hog_id)] sortidx = lineage.searchsorted(cand_levels["Level"], sorter=lineage_sorter) lin_idx = numpy.take(lineage_sorter, sortidx, mode="clip") @@ -830,11 +825,12 @@ def is_paralogous(a, b): # we take the first diverged lineage, meaning the duplication happened # on the branch to that level. return lineage[lin_idx[mask].min() - 1] - return None + return False def filter_candidates(entry, candidates): + pe = ProteinEntry(self, entry) for cand in candidates: - lev = is_paralogous(entry, cand) + lev = is_paralogous(pe, ProteinEntry(self, cand)) if lev: yield tuple(cand) + (lev,) diff --git a/pyoma/browser/models.py b/pyoma/browser/models.py index f13e7b8f..403b75ac 100644 --- a/pyoma/browser/models.py +++ b/pyoma/browser/models.py @@ -1,6 +1,9 @@ -from __future__ import division +from __future__ import division, annotations import collections +import re +from typing import List, Tuple + import numpy import time from .exceptions import UnknownSpecies, InvalidTaxonId, InvalidId @@ -225,6 +228,49 @@ def hog_family_nr(self): fam = 0 return fam + @LazyProperty + def _subhog_parts(self) -> List[Tuple[int, str]]: + root, *subhog_tokens = self.oma_hog.split(".") + parsed_parts = [] + for part in subhog_tokens: + if match := re.match(r"(\d+)([a-z]+)", part): + num = int(match.group(1)) + char = match.group(2) + parsed_parts.append((num, char)) + else: + raise ValueError(f"Invalid part format in '{part}'") + return parsed_parts + + def is_orthologous_to(self, other: ProteinEntry) -> bool | str: + if self.entry_nr == other.entry_nr: + return False + elif self.hog_family_nr != other.hog_family_nr: + return False + else: + common = [self.oma_hog.split(".")[0]] + for (num1, char1), (num2, char2) in zip(self._subhog_parts, other._subhog_parts): + if num1 != num2: + break + elif char1 != char2: + return False + common.append(f"{num1}{char1}") + return ".".join(common) + + def is_paralogous_to(self, other: ProteinEntry) -> bool | str: + if self.entry_nr == other.entry_nr: + return False + elif self.hog_family_nr != other.hog_family_nr: + return False + else: + common = [self.oma_hog.split(".")[0]] + for (num1, char1), (num2, char2) in zip(self._subhog_parts, other._subhog_parts): + if num1 != num2: + return False + elif char1 != char2: + return ".".join(common) + common.append(f"{num1}{char1}") + return False + @property def is_main_isoform(self): return bool(self._entry["AltSpliceVariant"] == 0 or self._entry["AltSpliceVariant"] == self._entry["EntryNr"]) diff --git a/tests/browser/test_models.py b/tests/browser/test_models.py index 86bba816..2abb2d5d 100644 --- a/tests/browser/test_models.py +++ b/tests/browser/test_models.py @@ -3,6 +3,7 @@ import sys import time import unittest +from unittest.mock import MagicMock from builtins import str import numpy @@ -94,6 +95,101 @@ def test_keyword_of_hog(self): self.assertEqual("", hog.keyword) +class TestProteinEntryOrthologyParalogy(unittest.TestCase): + def setUp(self): + # Mock database and entry data + self.mock_db = MagicMock() + # Minimal entry dicts with required fields + self.entry1 = {"EntryNr": 1, "OmaHOG": b"1.15a.2bz"} + self.entry2 = {"EntryNr": 2, "OmaHOG": b"1.15a.2b"} + self.entry3 = { + "EntryNr": 3, + "OmaHOG": b"1.15b.2b", + } + self.entry4 = { + "EntryNr": 4, + "OmaHOG": b"5.1b", + } + self.entry5 = { + "EntryNr": 5, + "OmaHOG": b"5", + } + self.entry6 = { + "EntryNr": 6, + "OmaHOG": b"", + } + self.entry7 = { + "EntryNr": 7, + "OmaHOG": b"", + } + + def hog_family_sideffect(entry): + f = entry["OmaHOG"].split(b".")[0] + if len(f) == 0: + raise pyoma.browser.exceptions.Singleton(entry) + return int(f) + + # Patch hog_family method + + self.mock_db.hog_family.side_effect = hog_family_sideffect + + def make_entry(self, entry_dict): + # Patch id_mapper and other required db methods if needed + return models.ProteinEntry(self.mock_db, entry_dict) + + def test_not_orthologous_if_same(self): + p1 = self.make_entry(self.entry1) + self.assertFalse(p1.is_orthologous_to(p1)) + + def test_not_paralogous_if_same(self): + p1 = self.make_entry(self.entry1) + self.assertFalse(p1.is_paralogous_to(p1)) + + def test_not_orthologous_but_paralogous_if_different_subhogid(self): + p1 = self.make_entry(self.entry1) + p2 = self.make_entry(self.entry2) + self.assertFalse(p1.is_orthologous_to(p2)) + self.assertFalse(p2.is_orthologous_to(p1)) + self.assertEqual("1.15a", p1.is_paralogous_to(p2)) + self.assertEqual("1.15a", p2.is_paralogous_to(p1)) + + def test_not_orthologous_if_different_family(self): + p1 = self.make_entry(self.entry1) + p4 = self.make_entry(self.entry4) + self.assertFalse(p1.is_orthologous_to(p4)) + + def test_subfam_orthologous_to_superfam(self): + p4 = self.make_entry(self.entry4) + p5 = self.make_entry(self.entry5) + self.assertEqual("5", p4.is_orthologous_to(p5)) + self.assertEqual("5", p5.is_orthologous_to(p4)) + + def test_subfam_are_not_paralogous_to_superfam(self): + p4 = self.make_entry(self.entry4) + p5 = self.make_entry(self.entry5) + self.assertFalse(p4.is_paralogous_to(p5)) + + def test_paralog_not_ortholog_on_subhog(self): + p2 = self.make_entry(self.entry2) + p3 = self.make_entry(self.entry3) + self.assertFalse(p2.is_orthologous_to(p3)) + self.assertFalse(p3.is_orthologous_to(p2)) + self.assertEqual("1", p2.is_paralogous_to(p3)) + self.assertEqual("1", p3.is_paralogous_to(p2)) + + def test_neither_paralog_nor_ortholog_on_subhog_with_empty_hogid(self): + p6 = self.make_entry(self.entry6) + p3 = self.make_entry(self.entry3) + self.assertFalse(p6.is_orthologous_to(p3)) + self.assertFalse(p3.is_paralogous_to(p6)) + + def test_orthologous_to_paralogous_on_singletons(self): + p6 = self.make_entry(self.entry6) + p7 = self.make_entry(self.entry7) + self.assertFalse(p6.is_orthologous_to(p7)) + self.assertFalse(p7.is_paralogous_to(p6)) + + class OmaGroupModelTest(TestDbBase): def test_instant_with_grpnr(self): grp = models.OmaGroup(self.db, 1384) From 1de76cf237cf63aafca9c34b8e9ebaed14ca06cf Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Thu, 26 Feb 2026 14:40:49 +0100 Subject: [PATCH 3/7] [FIX] search of xrefs with .\d\d format PomBase identifiers have often look like SPBC11B10.09. Prior to this fix, we would trim the .09 from the id and report an ambigous match. Now, we first search without truncating the numbers at the end, and only if no hits are found, we do with truncation. --- pyoma/browser/idmapper.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/pyoma/browser/idmapper.py b/pyoma/browser/idmapper.py index e2e18c4e..628de31c 100644 --- a/pyoma/browser/idmapper.py +++ b/pyoma/browser/idmapper.py @@ -99,21 +99,37 @@ def _query_prefix(self, query: bytes, entrynr_range=None, exact_match=False) -> return stmt, condvals def _prefix_reduced_search(self, query, entrynr_range, limit=None, exact=False): - query = self._version_free_query(query).lower().encode("utf-8") - if query in self.gene_name_lookup: - return self.gene_name_lookup.get_matching_xref_row_nrs(query, entrynr_range) - red_tab_it = self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range, exact)) - xref_rows = numpy.fromiter(itertools.islice((row["XRefRow"] for row in red_tab_it), limit), dtype="i4") + def _search(query): + if query in self.gene_name_lookup: + return self.gene_name_lookup.get_matching_xref_row_nrs(query, entrynr_range) + red_tab_it = self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range, exact)) + xref_rows = numpy.fromiter(itertools.islice((row["XRefRow"] for row in red_tab_it), limit), dtype="i4") + return xref_rows + + query_lower = query.lower().encode("utf-8") + xref_rows = _search(query_lower) + if len(xref_rows) == 0: + query_lower_without_version = self._version_free_query(query).lower().encode("utf-8") + if query_lower_without_version != query_lower: + xref_rows = _search(query_lower_without_version) return xref_rows def _prefix_reducecd_count(self, query, entrynr_range=None): from .db import count_elements - query = self._version_free_query(query).lower().encode("utf-8") - cnts = self.gene_name_lookup.count(query) - if cnts == 0: - cnts = count_elements(self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range))) - return cnts + def _count(query): + cnts = self.gene_name_lookup.count(query) + if cnts == 0: + cnts = count_elements(self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range))) + return cnts + + query_lower = query.lower().encode("utf-8") + count = _count(query_lower) + if count == 0: + query_lower_without_version = self._version_free_query(query).lower().encode("utf-8") + if query_lower_without_version != query_lower: + count = _count(query_lower_without_version) + return count def _suffix_search(self, query, limit=None, unspecific_exception=50000): cnts = self.fulltext_index.count(query) From ba53bba7ce63043a6dca714df3ce701373fc31fd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 02:17:16 +0000 Subject: [PATCH 4/7] Bump pypa/gh-action-pypi-publish from 1.12.2 to 1.12.4 Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.12.2 to 1.12.4. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/15c56dba361d8335944d31a2ecd17d700fc7bcbc...76f52bc884231f62b9a034ebfe128415bbaabdfc) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/upload-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upload-release.yml b/.github/workflows/upload-release.yml index 55dd8820..60501cc4 100644 --- a/.github/workflows/upload-release.yml +++ b/.github/workflows/upload-release.yml @@ -33,4 +33,4 @@ jobs: - name: Build package run: python -m build - name: Publish package - uses: pypa/gh-action-pypi-publish@897895f1e160c830e369f9779632ebc134688e1b + uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc From a1e76ca79a89bf600886609e9ac1ccf89bd9fcda Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Fri, 27 Feb 2026 07:36:44 +0100 Subject: [PATCH 5/7] Ensure Taxonomy root can be build reliably ensure backward and forward compatible way to generate or identify the root node in the taxonomy. now, this is resilient to changes in the Taxonomy schema. --- pyoma/browser/db.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/pyoma/browser/db.py b/pyoma/browser/db.py index f23e1ae2..805f1b0d 100644 --- a/pyoma/browser/db.py +++ b/pyoma/browser/db.py @@ -2972,19 +2972,29 @@ def _get_root_taxon(self): elif i2 - i1 == 1: res = self.tax_table[self.parent_key[i1]] else: - res = numpy.array([(0, -1, b"LUCA", 4250)[: len(self.tax_table.dtype)]], dtype=self.tax_table.dtype)[0] + i0 = self.tax_table["NCBITaxonId"].searchsorted(0, sorter=self.taxid_key) + res = self.tax_table[self.taxid_key[i0]] + if res["NCBITaxonId"] != 0: + res = self._create_luca() return res def _add_luca_if_needed(self): i1 = self.tax_table["ParentTaxonId"].searchsorted(0, sorter=self.parent_key) i2 = self.tax_table["ParentTaxonId"].searchsorted(0, sorter=self.parent_key, side="right") if i2 - i1 > 1: - self.tax_table = numpy.append( - self.tax_table, - numpy.array([(0, -1, b"LUCA", 4250)[: len(self.tax_table.dtype)]], dtype=self.tax_table.dtype), - ) - self.taxid_key = self.tax_table.argsort(order=("NCBITaxonId")) - self.parent_key = self.tax_table.argsort(order=("ParentTaxonId")) + i0 = self.tax_table["NCBITaxonId"].searchsorted(0, sorter=self.taxid_key) + if self.tax_table[self.taxid_key[i0]]["NCBITaxonId"] != 0: + self.tax_table = numpy.append(self.tax_table, self._create_luca()) + self.taxid_key = self.tax_table.argsort(order=("NCBITaxonId")) + self.parent_key = self.tax_table.argsort(order=("ParentTaxonId")) + + def _create_luca(self): + luca_fields = {"NCBITaxonId": 0, "ParentTaxonId": -1, "Name": b"LUCA", "IsGenome": False, "Age": 4250} + luca_row = numpy.zeros((1,), dtype=self.tax_table.dtype) + for k, v in luca_fields.items(): + if k in res.dtype.fields: + luca_row[k] = v + return luca_row def _taxon_from_numeric(self, tid): idx = self._table_idx_from_numeric(tid) From 0e886f0337ce4c5e45f7263b0e584366214c36c0 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Fri, 27 Feb 2026 07:48:48 +0100 Subject: [PATCH 6/7] =?UTF-8?q?Bump=20version:=200.13.4=20=E2=86=92=200.13?= =?UTF-8?q?.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyoma/__init__.py | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyoma/__init__.py b/pyoma/__init__.py index 582cd187..de467150 100644 --- a/pyoma/__init__.py +++ b/pyoma/__init__.py @@ -1,5 +1,5 @@ __author__ = "Adrian Altenhoff" -__version__ = "0.13.4" +__version__ = "0.13.5" def version(): diff --git a/setup.cfg b/setup.cfg index 31f8b149..8d6482b0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.13.4 +current_version = 0.13.5 commit = True tag = False From 8aaf3958e86ff626d7c7a6c6f7446a84fce95649 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 02:53:12 +0000 Subject: [PATCH 7/7] Bump pypa/gh-action-pypi-publish from 1.12.4 to 1.14.0 Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.12.4 to 1.14.0. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/76f52bc884231f62b9a034ebfe128415bbaabdfc...cef221092ed1bacb1cc03d23a2d87d1d172e277b) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-version: 1.14.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/upload-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upload-release.yml b/.github/workflows/upload-release.yml index 60501cc4..24425dab 100644 --- a/.github/workflows/upload-release.yml +++ b/.github/workflows/upload-release.yml @@ -33,4 +33,4 @@ jobs: - name: Build package run: python -m build - name: Publish package - uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc + uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b