Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/upload-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ jobs:
- name: Build package
run: python -m build
- name: Publish package
uses: pypa/gh-action-pypi-publish@897895f1e160c830e369f9779632ebc134688e1b
uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b
2 changes: 1 addition & 1 deletion pyoma/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
__author__ = "Adrian Altenhoff"
__version__ = "0.13.4"
__version__ = "0.13.5"


def version():
Expand Down
54 changes: 30 additions & 24 deletions pyoma/browser/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -742,18 +742,16 @@ def get_hog_induced_pairwise_orthologs(self, entry):
entry = self.ensure_entry(entry)
genome_entry_range = self.id_mapper["OMA"].genome_range(entry["EntryNr"])

def is_orthologous(a, b):
def is_orthologous(a: ProteinEntry, b: ProteinEntry):
"""genes are orthologs if their HOG id have a common prefix that is
either the base id of the family or the prefix does not end with
a subfamily number, ie. not a digit as common prefix. See LOFT paper
for details on encoding."""
if a["EntryNr"] == b["EntryNr"]:
return False
prefix = os.path.commonprefix((a["OmaHOG"], b["OmaHOG"])).decode()
if "." in prefix and prefix[-1].isdigit():
prefix_or_false = a.is_orthologous_to(b)
if prefix_or_false is False:
return False
# count number of genes in query genome that are co-orthologs (== having the prefix)
cnts = numpy.char.startswith(hogids_of_genes_in_query_genome, prefix.encode("utf-8")).sum()
cnts = numpy.char.startswith(hogids_of_genes_in_query_genome, prefix_or_false.encode("utf-8")).sum()
return cnts

try:
Expand All @@ -767,8 +765,9 @@ def is_orthologous(a, b):
(hog_member["EntryNr"] >= genome_entry_range[0]) & (hog_member["EntryNr"] < genome_entry_range[1])
)
]["OmaHOG"]
pe = ProteinEntry(self, entry)
query_genome_genes_cnt = numpy.array(
[is_orthologous(entry, hog_member[i]) for i in range(len(hog_member))],
[is_orthologous(pe, ProteinEntry(self, hog_member[i])) for i in range(len(hog_member))],
dtype="i4",
)
mask = numpy.asarray(query_genome_genes_cnt, bool)
Expand Down Expand Up @@ -811,30 +810,27 @@ def get_hog_induced_pairwise_paralogs(self, entry, start=0, stop=None):
levels = levels[numpy.isin(levels["Level"], lineage)]
hog_member = self._members_of_hog_id(self.format_hogid(fam))

def is_paralogous(a, b):
def is_paralogous(a: ProteinEntry, b: ProteinEntry):
"""genes are orthologs if their HOG id have a common prefix that is
either the base id of the family or the prefix does not end with
a subfamily number, ie. not a digit as common prefix. See LOFT paper
for details on encoding."""
if a["EntryNr"] == b["EntryNr"]:
return False
prefix = os.path.commonprefix((a["OmaHOG"], b["OmaHOG"])).decode()
if "." in prefix and prefix[-1].isdigit():
# gene is paralog. find MRCA in taxonomy of common HOGid prefix
k = prefix.rfind(".")
hog_id = prefix[:k].encode("ascii")
prefix_or_false = a.is_paralogous_to(b)
if prefix_or_false:
hog_id = prefix_or_false.encode("ascii")
cand_levels = levels[numpy.where(levels["ID"] == hog_id)]
sortidx = lineage.searchsorted(cand_levels["Level"], sorter=lineage_sorter)
lin_idx = numpy.take(lineage_sorter, sortidx, mode="clip")
mask = lineage[lin_idx] == cand_levels["Level"]
# we take the first diverged lineage, meaning the duplication happened
# on the branch to that level.
return lineage[lin_idx[mask].min() - 1]
return None
return False

def filter_candidates(entry, candidates):
pe = ProteinEntry(self, entry)
for cand in candidates:
lev = is_paralogous(entry, cand)
lev = is_paralogous(pe, ProteinEntry(self, cand))
if lev:
yield tuple(cand) + (lev,)

Expand Down Expand Up @@ -2976,19 +2972,29 @@ def _get_root_taxon(self):
elif i2 - i1 == 1:
res = self.tax_table[self.parent_key[i1]]
else:
res = numpy.array([(0, -1, b"LUCA", 4250)[: len(self.tax_table.dtype)]], dtype=self.tax_table.dtype)[0]
i0 = self.tax_table["NCBITaxonId"].searchsorted(0, sorter=self.taxid_key)
res = self.tax_table[self.taxid_key[i0]]
if res["NCBITaxonId"] != 0:
res = self._create_luca()
return res

def _add_luca_if_needed(self):
i1 = self.tax_table["ParentTaxonId"].searchsorted(0, sorter=self.parent_key)
i2 = self.tax_table["ParentTaxonId"].searchsorted(0, sorter=self.parent_key, side="right")
if i2 - i1 > 1:
self.tax_table = numpy.append(
self.tax_table,
numpy.array([(0, -1, b"LUCA", 4250)[: len(self.tax_table.dtype)]], dtype=self.tax_table.dtype),
)
self.taxid_key = self.tax_table.argsort(order=("NCBITaxonId"))
self.parent_key = self.tax_table.argsort(order=("ParentTaxonId"))
i0 = self.tax_table["NCBITaxonId"].searchsorted(0, sorter=self.taxid_key)
if self.tax_table[self.taxid_key[i0]]["NCBITaxonId"] != 0:
self.tax_table = numpy.append(self.tax_table, self._create_luca())
self.taxid_key = self.tax_table.argsort(order=("NCBITaxonId"))
self.parent_key = self.tax_table.argsort(order=("ParentTaxonId"))

def _create_luca(self):
luca_fields = {"NCBITaxonId": 0, "ParentTaxonId": -1, "Name": b"LUCA", "IsGenome": False, "Age": 4250}
luca_row = numpy.zeros((1,), dtype=self.tax_table.dtype)
for k, v in luca_fields.items():
if k in res.dtype.fields:
luca_row[k] = v
return luca_row

def _taxon_from_numeric(self, tid):
idx = self._table_idx_from_numeric(tid)
Expand Down
36 changes: 26 additions & 10 deletions pyoma/browser/idmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,21 +99,37 @@ def _query_prefix(self, query: bytes, entrynr_range=None, exact_match=False) ->
return stmt, condvals

def _prefix_reduced_search(self, query, entrynr_range, limit=None, exact=False):
query = self._version_free_query(query).lower().encode("utf-8")
if query in self.gene_name_lookup:
return self.gene_name_lookup.get_matching_xref_row_nrs(query, entrynr_range)
red_tab_it = self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range, exact))
xref_rows = numpy.fromiter(itertools.islice((row["XRefRow"] for row in red_tab_it), limit), dtype="i4")
def _search(query):
if query in self.gene_name_lookup:
return self.gene_name_lookup.get_matching_xref_row_nrs(query, entrynr_range)
red_tab_it = self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range, exact))
xref_rows = numpy.fromiter(itertools.islice((row["XRefRow"] for row in red_tab_it), limit), dtype="i4")
return xref_rows

query_lower = query.lower().encode("utf-8")
xref_rows = _search(query_lower)
if len(xref_rows) == 0:
query_lower_without_version = self._version_free_query(query).lower().encode("utf-8")
if query_lower_without_version != query_lower:
xref_rows = _search(query_lower_without_version)
return xref_rows

def _prefix_reducecd_count(self, query, entrynr_range=None):
from .db import count_elements

query = self._version_free_query(query).lower().encode("utf-8")
cnts = self.gene_name_lookup.count(query)
if cnts == 0:
cnts = count_elements(self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range)))
return cnts
def _count(query):
cnts = self.gene_name_lookup.count(query)
if cnts == 0:
cnts = count_elements(self.reduced_xref_tab.where(*self._query_prefix(query, entrynr_range)))
return cnts

query_lower = query.lower().encode("utf-8")
count = _count(query_lower)
if count == 0:
query_lower_without_version = self._version_free_query(query).lower().encode("utf-8")
if query_lower_without_version != query_lower:
count = _count(query_lower_without_version)
return count

def _suffix_search(self, query, limit=None, unspecific_exception=50000):
cnts = self.fulltext_index.count(query)
Expand Down
48 changes: 47 additions & 1 deletion pyoma/browser/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from __future__ import division
from __future__ import division, annotations

import collections
import re
from typing import List, Tuple

import numpy
import time
from .exceptions import UnknownSpecies, InvalidTaxonId, InvalidId
Expand Down Expand Up @@ -225,6 +228,49 @@ def hog_family_nr(self):
fam = 0
return fam

@LazyProperty
def _subhog_parts(self) -> List[Tuple[int, str]]:
root, *subhog_tokens = self.oma_hog.split(".")
parsed_parts = []
for part in subhog_tokens:
if match := re.match(r"(\d+)([a-z]+)", part):
num = int(match.group(1))
char = match.group(2)
parsed_parts.append((num, char))
else:
raise ValueError(f"Invalid part format in '{part}'")
return parsed_parts

def is_orthologous_to(self, other: ProteinEntry) -> bool | str:
if self.entry_nr == other.entry_nr:
return False
elif self.hog_family_nr != other.hog_family_nr:
return False
else:
common = [self.oma_hog.split(".")[0]]
for (num1, char1), (num2, char2) in zip(self._subhog_parts, other._subhog_parts):
if num1 != num2:
break
elif char1 != char2:
return False
common.append(f"{num1}{char1}")
return ".".join(common)

def is_paralogous_to(self, other: ProteinEntry) -> bool | str:
if self.entry_nr == other.entry_nr:
return False
elif self.hog_family_nr != other.hog_family_nr:
return False
else:
common = [self.oma_hog.split(".")[0]]
for (num1, char1), (num2, char2) in zip(self._subhog_parts, other._subhog_parts):
if num1 != num2:
return False
elif char1 != char2:
return ".".join(common)
common.append(f"{num1}{char1}")
return False

@property
def is_main_isoform(self):
return bool(self._entry["AltSpliceVariant"] == 0 or self._entry["AltSpliceVariant"] == self._entry["EntryNr"])
Expand Down
14 changes: 11 additions & 3 deletions pyoma/browser/tablefmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ class OrthoXmlHogTable(tables.IsDescription):
HogAugmentedBufferLength = tables.UInt32Col(pos=4)


class AncestralSyntenyRels(tables.IsDescription):
HogRow1 = tables.UInt32Col(pos=0)
HogRow2 = tables.UInt32Col(pos=1)
class AbstractSyntenyRels(tables.IsDescription):
Weight = tables.Float16Col(pos=2)
Evidence = tables.EnumCol(
tables.Enum({"linearized": 1, "parsimonious": 2, "any": 4}),
Expand All @@ -46,6 +44,16 @@ class AncestralSyntenyRels(tables.IsDescription):
LCA_taxid = tables.Int32Col(pos=4, dflt=-1)


class AncestralSyntenyRels(AbstractSyntenyRels):
HogRow1 = tables.UInt32Col(pos=0)
HogRow2 = tables.UInt32Col(pos=1)


class ExtantSyntenyRels(AbstractSyntenyRels):
EntryNr1 = tables.UInt32Col(pos=0)
EntryNr2 = tables.UInt32Col(pos=1)


class ProteinTable(tables.IsDescription):
EntryNr = tables.UInt32Col(pos=1)
SeqBufferOffset = tables.UInt64Col(pos=2)
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.13.4
current_version = 0.13.5
commit = True
tag = False

Expand Down
96 changes: 96 additions & 0 deletions tests/browser/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import time
import unittest
from unittest.mock import MagicMock
from builtins import str

import numpy
Expand Down Expand Up @@ -94,6 +95,101 @@ def test_keyword_of_hog(self):
self.assertEqual("", hog.keyword)


class TestProteinEntryOrthologyParalogy(unittest.TestCase):
def setUp(self):
# Mock database and entry data
self.mock_db = MagicMock()
# Minimal entry dicts with required fields
self.entry1 = {"EntryNr": 1, "OmaHOG": b"1.15a.2bz"}
self.entry2 = {"EntryNr": 2, "OmaHOG": b"1.15a.2b"}
self.entry3 = {
"EntryNr": 3,
"OmaHOG": b"1.15b.2b",
}
self.entry4 = {
"EntryNr": 4,
"OmaHOG": b"5.1b",
}
self.entry5 = {
"EntryNr": 5,
"OmaHOG": b"5",
}
self.entry6 = {
"EntryNr": 6,
"OmaHOG": b"",
}
self.entry7 = {
"EntryNr": 7,
"OmaHOG": b"",
}

def hog_family_sideffect(entry):
f = entry["OmaHOG"].split(b".")[0]
if len(f) == 0:
raise pyoma.browser.exceptions.Singleton(entry)
return int(f)

# Patch hog_family method

self.mock_db.hog_family.side_effect = hog_family_sideffect

def make_entry(self, entry_dict):
# Patch id_mapper and other required db methods if needed
return models.ProteinEntry(self.mock_db, entry_dict)

def test_not_orthologous_if_same(self):
p1 = self.make_entry(self.entry1)
self.assertFalse(p1.is_orthologous_to(p1))

def test_not_paralogous_if_same(self):
p1 = self.make_entry(self.entry1)
self.assertFalse(p1.is_paralogous_to(p1))

def test_not_orthologous_but_paralogous_if_different_subhogid(self):
p1 = self.make_entry(self.entry1)
p2 = self.make_entry(self.entry2)
self.assertFalse(p1.is_orthologous_to(p2))
self.assertFalse(p2.is_orthologous_to(p1))
self.assertEqual("1.15a", p1.is_paralogous_to(p2))
self.assertEqual("1.15a", p2.is_paralogous_to(p1))

def test_not_orthologous_if_different_family(self):
p1 = self.make_entry(self.entry1)
p4 = self.make_entry(self.entry4)
self.assertFalse(p1.is_orthologous_to(p4))

def test_subfam_orthologous_to_superfam(self):
p4 = self.make_entry(self.entry4)
p5 = self.make_entry(self.entry5)
self.assertEqual("5", p4.is_orthologous_to(p5))
self.assertEqual("5", p5.is_orthologous_to(p4))

def test_subfam_are_not_paralogous_to_superfam(self):
p4 = self.make_entry(self.entry4)
p5 = self.make_entry(self.entry5)
self.assertFalse(p4.is_paralogous_to(p5))

def test_paralog_not_ortholog_on_subhog(self):
p2 = self.make_entry(self.entry2)
p3 = self.make_entry(self.entry3)
self.assertFalse(p2.is_orthologous_to(p3))
self.assertFalse(p3.is_orthologous_to(p2))
self.assertEqual("1", p2.is_paralogous_to(p3))
self.assertEqual("1", p3.is_paralogous_to(p2))

def test_neither_paralog_nor_ortholog_on_subhog_with_empty_hogid(self):
p6 = self.make_entry(self.entry6)
p3 = self.make_entry(self.entry3)
self.assertFalse(p6.is_orthologous_to(p3))
self.assertFalse(p3.is_paralogous_to(p6))

def test_orthologous_to_paralogous_on_singletons(self):
p6 = self.make_entry(self.entry6)
p7 = self.make_entry(self.entry7)
self.assertFalse(p6.is_orthologous_to(p7))
self.assertFalse(p7.is_paralogous_to(p6))


class OmaGroupModelTest(TestDbBase):
def test_instant_with_grpnr(self):
grp = models.OmaGroup(self.db, 1384)
Expand Down