diff --git a/.github/workflows/upload-release.yml b/.github/workflows/upload-release.yml index 55dd882..24425da 100644 --- a/.github/workflows/upload-release.yml +++ b/.github/workflows/upload-release.yml @@ -33,4 +33,4 @@ jobs: - name: Build package run: python -m build - name: Publish package - uses: pypa/gh-action-pypi-publish@897895f1e160c830e369f9779632ebc134688e1b + uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b diff --git a/pyoma/__init__.py b/pyoma/__init__.py index 582cd18..de46715 100644 --- a/pyoma/__init__.py +++ b/pyoma/__init__.py @@ -1,5 +1,5 @@ __author__ = "Adrian Altenhoff" -__version__ = "0.13.4" +__version__ = "0.13.5" def version(): diff --git a/pyoma/browser/db.py b/pyoma/browser/db.py index 7001b48..805f1b0 100644 --- a/pyoma/browser/db.py +++ b/pyoma/browser/db.py @@ -742,18 +742,16 @@ def get_hog_induced_pairwise_orthologs(self, entry): entry = self.ensure_entry(entry) genome_entry_range = self.id_mapper["OMA"].genome_range(entry["EntryNr"]) - def is_orthologous(a, b): + def is_orthologous(a: ProteinEntry, b: ProteinEntry): """genes are orthologs if their HOG id have a common prefix that is either the base id of the family or the prefix does not end with a subfamily number, ie. not a digit as common prefix. See LOFT paper for details on encoding.""" - if a["EntryNr"] == b["EntryNr"]: - return False - prefix = os.path.commonprefix((a["OmaHOG"], b["OmaHOG"])).decode() - if "." in prefix and prefix[-1].isdigit(): + prefix_or_false = a.is_orthologous_to(b) + if prefix_or_false is False: return False # count number of genes in query genome that are co-orthologs (== having the prefix) - cnts = numpy.char.startswith(hogids_of_genes_in_query_genome, prefix.encode("utf-8")).sum() + cnts = numpy.char.startswith(hogids_of_genes_in_query_genome, prefix_or_false.encode("utf-8")).sum() return cnts try: @@ -767,8 +765,9 @@ def is_orthologous(a, b): (hog_member["EntryNr"] >= genome_entry_range[0]) & (hog_member["EntryNr"] < genome_entry_range[1]) ) ]["OmaHOG"] + pe = ProteinEntry(self, entry) query_genome_genes_cnt = numpy.array( - [is_orthologous(entry, hog_member[i]) for i in range(len(hog_member))], + [is_orthologous(pe, ProteinEntry(self, hog_member[i])) for i in range(len(hog_member))], dtype="i4", ) mask = numpy.asarray(query_genome_genes_cnt, bool) @@ -811,18 +810,14 @@ def get_hog_induced_pairwise_paralogs(self, entry, start=0, stop=None): levels = levels[numpy.isin(levels["Level"], lineage)] hog_member = self._members_of_hog_id(self.format_hogid(fam)) - def is_paralogous(a, b): + def is_paralogous(a: ProteinEntry, b: ProteinEntry): """genes are orthologs if their HOG id have a common prefix that is either the base id of the family or the prefix does not end with a subfamily number, ie. not a digit as common prefix. See LOFT paper for details on encoding.""" - if a["EntryNr"] == b["EntryNr"]: - return False - prefix = os.path.commonprefix((a["OmaHOG"], b["OmaHOG"])).decode() - if "." in prefix and prefix[-1].isdigit(): - # gene is paralog. find MRCA in taxonomy of common HOGid prefix - k = prefix.rfind(".") - hog_id = prefix[:k].encode("ascii") + prefix_or_false = a.is_paralogous_to(b) + if prefix_or_false: + hog_id = prefix_or_false.encode("ascii") cand_levels = levels[numpy.where(levels["ID"] == hog_id)] sortidx = lineage.searchsorted(cand_levels["Level"], sorter=lineage_sorter) lin_idx = numpy.take(lineage_sorter, sortidx, mode="clip") @@ -830,11 +825,12 @@ def is_paralogous(a, b): # we take the first diverged lineage, meaning the duplication happened # on the branch to that level. return lineage[lin_idx[mask].min() - 1] - return None + return False def filter_candidates(entry, candidates): + pe = ProteinEntry(self, entry) for cand in candidates: - lev = is_paralogous(entry, cand) + lev = is_paralogous(pe, ProteinEntry(self, cand)) if lev: yield tuple(cand) + (lev,) @@ -2976,19 +2972,29 @@ def _get_root_taxon(self): elif i2 - i1 == 1: res = self.tax_table[self.parent_key[i1]] else: - res = numpy.array([(0, -1, b"LUCA", 4250)[: len(self.tax_table.dtype)]], dtype=self.tax_table.dtype)[0] + i0 = self.tax_table["NCBITaxonId"].searchsorted(0, sorter=self.taxid_key) + res = self.tax_table[self.taxid_key[i0]] + if res["NCBITaxonId"] != 0: + res = self._create_luca() return res def _add_luca_if_needed(self): i1 = self.tax_table["ParentTaxonId"].searchsorted(0, sorter=self.parent_key) i2 = self.tax_table["ParentTaxonId"].searchsorted(0, sorter=self.parent_key, side="right") if i2 - i1 > 1: - self.tax_table = numpy.append( - self.tax_table, - numpy.array([(0, -1, b"LUCA", 4250)[: len(self.tax_table.dtype)]], dtype=self.tax_table.dtype), - ) - self.taxid_key = self.tax_table.argsort(order=("NCBITaxonId")) - self.parent_key = self.tax_table.argsort(order=("ParentTaxonId")) + i0 = self.tax_table["NCBITaxonId"].searchsorted(0, sorter=self.taxid_key) + if self.tax_table[self.taxid_key[i0]]["NCBITaxonId"] != 0: + self.tax_table = numpy.append(self.tax_table, self._create_luca()) + self.taxid_key = self.tax_table.argsort(order=("NCBITaxonId")) + self.parent_key = self.tax_table.argsort(order=("ParentTaxonId")) + + def _create_luca(self): + luca_fields = {"NCBITaxonId": 0, "ParentTaxonId": -1, "Name": b"LUCA", "IsGenome": False, "Age": 4250} + luca_row = numpy.zeros((1,), dtype=self.tax_table.dtype) + for k, v in luca_fields.items(): + if k in res.dtype.fields: + luca_row[k] = v + return luca_row def _taxon_from_numeric(self, tid): idx = self._table_idx_from_numeric(tid) diff --git a/pyoma/browser/idmapper.py b/pyoma/browser/idmapper.py index e2e18c4..628de31 100644 --- a/pyoma/browser/idmapper.py +++ b/pyoma/browser/idmapper.py @@ -99,21 +99,37 @@ def _query_prefix(self, query: bytes, entrynr_range=None, exact_match=False) -> return stmt, condvals def _prefix_reduced_search(self, query, entrynr_range, limit=None, exact=False): - query = self._version_free_query(query).lower().encode("utf-8") - if query in self.gene_name_lookup: - return self.gene_name_lookup.get_matching_xref_row_nrs(query, entrynr_range) - red_tab_it = self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range, exact)) - xref_rows = numpy.fromiter(itertools.islice((row["XRefRow"] for row in red_tab_it), limit), dtype="i4") + def _search(query): + if query in self.gene_name_lookup: + return self.gene_name_lookup.get_matching_xref_row_nrs(query, entrynr_range) + red_tab_it = self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range, exact)) + xref_rows = numpy.fromiter(itertools.islice((row["XRefRow"] for row in red_tab_it), limit), dtype="i4") + return xref_rows + + query_lower = query.lower().encode("utf-8") + xref_rows = _search(query_lower) + if len(xref_rows) == 0: + query_lower_without_version = self._version_free_query(query).lower().encode("utf-8") + if query_lower_without_version != query_lower: + xref_rows = _search(query_lower_without_version) return xref_rows def _prefix_reducecd_count(self, query, entrynr_range=None): from .db import count_elements - query = self._version_free_query(query).lower().encode("utf-8") - cnts = self.gene_name_lookup.count(query) - if cnts == 0: - cnts = count_elements(self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range))) - return cnts + def _count(query): + cnts = self.gene_name_lookup.count(query) + if cnts == 0: + cnts = count_elements(self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range))) + return cnts + + query_lower = query.lower().encode("utf-8") + count = _count(query_lower) + if count == 0: + query_lower_without_version = self._version_free_query(query).lower().encode("utf-8") + if query_lower_without_version != query_lower: + count = _count(query_lower_without_version) + return count def _suffix_search(self, query, limit=None, unspecific_exception=50000): cnts = self.fulltext_index.count(query) diff --git a/pyoma/browser/models.py b/pyoma/browser/models.py index f13e7b8..403b75a 100644 --- a/pyoma/browser/models.py +++ b/pyoma/browser/models.py @@ -1,6 +1,9 @@ -from __future__ import division +from __future__ import division, annotations import collections +import re +from typing import List, Tuple + import numpy import time from .exceptions import UnknownSpecies, InvalidTaxonId, InvalidId @@ -225,6 +228,49 @@ def hog_family_nr(self): fam = 0 return fam + @LazyProperty + def _subhog_parts(self) -> List[Tuple[int, str]]: + root, *subhog_tokens = self.oma_hog.split(".") + parsed_parts = [] + for part in subhog_tokens: + if match := re.match(r"(\d+)([a-z]+)", part): + num = int(match.group(1)) + char = match.group(2) + parsed_parts.append((num, char)) + else: + raise ValueError(f"Invalid part format in '{part}'") + return parsed_parts + + def is_orthologous_to(self, other: ProteinEntry) -> bool | str: + if self.entry_nr == other.entry_nr: + return False + elif self.hog_family_nr != other.hog_family_nr: + return False + else: + common = [self.oma_hog.split(".")[0]] + for (num1, char1), (num2, char2) in zip(self._subhog_parts, other._subhog_parts): + if num1 != num2: + break + elif char1 != char2: + return False + common.append(f"{num1}{char1}") + return ".".join(common) + + def is_paralogous_to(self, other: ProteinEntry) -> bool | str: + if self.entry_nr == other.entry_nr: + return False + elif self.hog_family_nr != other.hog_family_nr: + return False + else: + common = [self.oma_hog.split(".")[0]] + for (num1, char1), (num2, char2) in zip(self._subhog_parts, other._subhog_parts): + if num1 != num2: + return False + elif char1 != char2: + return ".".join(common) + common.append(f"{num1}{char1}") + return False + @property def is_main_isoform(self): return bool(self._entry["AltSpliceVariant"] == 0 or self._entry["AltSpliceVariant"] == self._entry["EntryNr"]) diff --git a/pyoma/browser/tablefmt.py b/pyoma/browser/tablefmt.py index 2208d50..b82a739 100644 --- a/pyoma/browser/tablefmt.py +++ b/pyoma/browser/tablefmt.py @@ -33,9 +33,7 @@ class OrthoXmlHogTable(tables.IsDescription): HogAugmentedBufferLength = tables.UInt32Col(pos=4) -class AncestralSyntenyRels(tables.IsDescription): - HogRow1 = tables.UInt32Col(pos=0) - HogRow2 = tables.UInt32Col(pos=1) +class AbstractSyntenyRels(tables.IsDescription): Weight = tables.Float16Col(pos=2) Evidence = tables.EnumCol( tables.Enum({"linearized": 1, "parsimonious": 2, "any": 4}), @@ -46,6 +44,16 @@ class AncestralSyntenyRels(tables.IsDescription): LCA_taxid = tables.Int32Col(pos=4, dflt=-1) +class AncestralSyntenyRels(AbstractSyntenyRels): + HogRow1 = tables.UInt32Col(pos=0) + HogRow2 = tables.UInt32Col(pos=1) + + +class ExtantSyntenyRels(AbstractSyntenyRels): + EntryNr1 = tables.UInt32Col(pos=0) + EntryNr2 = tables.UInt32Col(pos=1) + + class ProteinTable(tables.IsDescription): EntryNr = tables.UInt32Col(pos=1) SeqBufferOffset = tables.UInt64Col(pos=2) diff --git a/setup.cfg b/setup.cfg index 31f8b14..8d6482b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.13.4 +current_version = 0.13.5 commit = True tag = False diff --git a/tests/browser/test_models.py b/tests/browser/test_models.py index 86bba81..2abb2d5 100644 --- a/tests/browser/test_models.py +++ b/tests/browser/test_models.py @@ -3,6 +3,7 @@ import sys import time import unittest +from unittest.mock import MagicMock from builtins import str import numpy @@ -94,6 +95,101 @@ def test_keyword_of_hog(self): self.assertEqual("", hog.keyword) +class TestProteinEntryOrthologyParalogy(unittest.TestCase): + def setUp(self): + # Mock database and entry data + self.mock_db = MagicMock() + # Minimal entry dicts with required fields + self.entry1 = {"EntryNr": 1, "OmaHOG": b"1.15a.2bz"} + self.entry2 = {"EntryNr": 2, "OmaHOG": b"1.15a.2b"} + self.entry3 = { + "EntryNr": 3, + "OmaHOG": b"1.15b.2b", + } + self.entry4 = { + "EntryNr": 4, + "OmaHOG": b"5.1b", + } + self.entry5 = { + "EntryNr": 5, + "OmaHOG": b"5", + } + self.entry6 = { + "EntryNr": 6, + "OmaHOG": b"", + } + self.entry7 = { + "EntryNr": 7, + "OmaHOG": b"", + } + + def hog_family_sideffect(entry): + f = entry["OmaHOG"].split(b".")[0] + if len(f) == 0: + raise pyoma.browser.exceptions.Singleton(entry) + return int(f) + + # Patch hog_family method + + self.mock_db.hog_family.side_effect = hog_family_sideffect + + def make_entry(self, entry_dict): + # Patch id_mapper and other required db methods if needed + return models.ProteinEntry(self.mock_db, entry_dict) + + def test_not_orthologous_if_same(self): + p1 = self.make_entry(self.entry1) + self.assertFalse(p1.is_orthologous_to(p1)) + + def test_not_paralogous_if_same(self): + p1 = self.make_entry(self.entry1) + self.assertFalse(p1.is_paralogous_to(p1)) + + def test_not_orthologous_but_paralogous_if_different_subhogid(self): + p1 = self.make_entry(self.entry1) + p2 = self.make_entry(self.entry2) + self.assertFalse(p1.is_orthologous_to(p2)) + self.assertFalse(p2.is_orthologous_to(p1)) + self.assertEqual("1.15a", p1.is_paralogous_to(p2)) + self.assertEqual("1.15a", p2.is_paralogous_to(p1)) + + def test_not_orthologous_if_different_family(self): + p1 = self.make_entry(self.entry1) + p4 = self.make_entry(self.entry4) + self.assertFalse(p1.is_orthologous_to(p4)) + + def test_subfam_orthologous_to_superfam(self): + p4 = self.make_entry(self.entry4) + p5 = self.make_entry(self.entry5) + self.assertEqual("5", p4.is_orthologous_to(p5)) + self.assertEqual("5", p5.is_orthologous_to(p4)) + + def test_subfam_are_not_paralogous_to_superfam(self): + p4 = self.make_entry(self.entry4) + p5 = self.make_entry(self.entry5) + self.assertFalse(p4.is_paralogous_to(p5)) + + def test_paralog_not_ortholog_on_subhog(self): + p2 = self.make_entry(self.entry2) + p3 = self.make_entry(self.entry3) + self.assertFalse(p2.is_orthologous_to(p3)) + self.assertFalse(p3.is_orthologous_to(p2)) + self.assertEqual("1", p2.is_paralogous_to(p3)) + self.assertEqual("1", p3.is_paralogous_to(p2)) + + def test_neither_paralog_nor_ortholog_on_subhog_with_empty_hogid(self): + p6 = self.make_entry(self.entry6) + p3 = self.make_entry(self.entry3) + self.assertFalse(p6.is_orthologous_to(p3)) + self.assertFalse(p3.is_paralogous_to(p6)) + + def test_orthologous_to_paralogous_on_singletons(self): + p6 = self.make_entry(self.entry6) + p7 = self.make_entry(self.entry7) + self.assertFalse(p6.is_orthologous_to(p7)) + self.assertFalse(p7.is_paralogous_to(p6)) + + class OmaGroupModelTest(TestDbBase): def test_instant_with_grpnr(self): grp = models.OmaGroup(self.db, 1384)