Skip to content

Commit f904749

Browse files
authored
Alignments using C API (#3326)
* Implement alignments using C API * More tests for C side * Change back to iterator
1 parent 179963c commit f904749

File tree

6 files changed

+390
-65
lines changed

6 files changed

+390
-65
lines changed

c/tests/test_genotypes.c

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,6 +1297,35 @@ test_alignments_partial_isolation(void)
12971297
tsk_treeseq_free(&ts);
12981298
}
12991299

1300+
static void
1301+
test_alignments_return_code_truncated_interval(void)
1302+
{
1303+
int ret = 0;
1304+
const char *nodes = "1 0 0 -1\n"
1305+
"1 0 0 -1\n"
1306+
"0 1 0 -1\n";
1307+
/* Tree over [0,5): samples 0 and 1 under root 2.
1308+
* Tree over [5,10): only sample 1 under root 2 (sample 0 isolated). */
1309+
const char *edges = "0 5 2 0\n"
1310+
"0 10 2 1\n";
1311+
tsk_treeseq_t ts;
1312+
const tsk_id_t *samples;
1313+
tsk_size_t n;
1314+
char buf[10];
1315+
const char *ref = "NNNNNNNNNN";
1316+
1317+
tsk_treeseq_from_text(&ts, 10, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
1318+
samples = tsk_treeseq_get_samples(&ts);
1319+
n = tsk_treeseq_get_num_samples(&ts);
1320+
1321+
ret = tsk_treeseq_decode_alignments(&ts, ref, 10, samples, n, 0, 5, 'N', buf, 0);
1322+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1323+
CU_ASSERT_NSTRING_EQUAL(buf + 0 * 5, "NNNNN", 5);
1324+
CU_ASSERT_NSTRING_EQUAL(buf + 1 * 5, "NNNNN", 5);
1325+
1326+
tsk_treeseq_free(&ts);
1327+
}
1328+
13001329
static void
13011330
test_alignments_invalid_allele_length(void)
13021331
{
@@ -1367,6 +1396,64 @@ test_alignments_discrete_genome_required(void)
13671396
tsk_treeseq_free(&ts);
13681397
}
13691398

1399+
static void
1400+
test_alignments_null_reference(void)
1401+
{
1402+
int ret = 0;
1403+
tsk_treeseq_t ts;
1404+
const tsk_id_t *samples;
1405+
tsk_size_t n;
1406+
char buf[10];
1407+
1408+
build_balanced_three_example_align(&ts);
1409+
samples = tsk_treeseq_get_samples(&ts);
1410+
n = tsk_treeseq_get_num_samples(&ts);
1411+
1412+
ret = tsk_treeseq_decode_alignments(&ts, NULL, 10, samples, n, 0, 10, 'N', buf, 0);
1413+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);
1414+
tsk_treeseq_free(&ts);
1415+
}
1416+
1417+
static void
1418+
test_alignments_null_nodes_or_buf(void)
1419+
{
1420+
int ret = 0;
1421+
tsk_treeseq_t ts;
1422+
const char *ref = "NNNNNNNNNN";
1423+
const tsk_id_t *samples;
1424+
tsk_size_t n;
1425+
char buf[30];
1426+
1427+
build_balanced_three_example_align(&ts);
1428+
samples = tsk_treeseq_get_samples(&ts);
1429+
n = tsk_treeseq_get_num_samples(&ts);
1430+
1431+
ret = tsk_treeseq_decode_alignments(&ts, ref, 10, NULL, n, 0, 10, 'N', buf, 0);
1432+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);
1433+
1434+
ret = tsk_treeseq_decode_alignments(&ts, ref, 10, samples, n, 0, 10, 'N', NULL, 0);
1435+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);
1436+
1437+
tsk_treeseq_free(&ts);
1438+
}
1439+
1440+
static void
1441+
test_alignments_node_out_of_bounds(void)
1442+
{
1443+
int ret = 0;
1444+
tsk_treeseq_t ts;
1445+
const char *ref = "NNNNNNNNNN";
1446+
tsk_id_t bad_node;
1447+
char buf[10];
1448+
1449+
build_balanced_three_example_align(&ts);
1450+
bad_node = (tsk_id_t) tsk_treeseq_get_num_nodes(&ts);
1451+
1452+
ret = tsk_treeseq_decode_alignments(&ts, ref, 10, &bad_node, 1, 0, 10, 'N', buf, 0);
1453+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
1454+
tsk_treeseq_free(&ts);
1455+
}
1456+
13701457
static void
13711458
test_alignments_isolated_as_not_missing(void)
13721459
{
@@ -1600,6 +1687,8 @@ main(int argc, char **argv)
16001687
{ "test_alignments_basic_default", test_alignments_basic_default },
16011688
{ "test_alignments_reference_sequence", test_alignments_reference_sequence },
16021689
{ "test_alignments_partial_isolation", test_alignments_partial_isolation },
1690+
{ "test_alignments_return_code_truncated_interval",
1691+
test_alignments_return_code_truncated_interval },
16031692
{ "test_alignments_isolated_as_not_missing",
16041693
test_alignments_isolated_as_not_missing },
16051694
{ "test_alignments_internal_node_non_sample",
@@ -1610,6 +1699,9 @@ main(int argc, char **argv)
16101699
{ "test_alignments_non_integer_bounds", test_alignments_non_integer_bounds },
16111700
{ "test_alignments_discrete_genome_required",
16121701
test_alignments_discrete_genome_required },
1702+
{ "test_alignments_null_reference", test_alignments_null_reference },
1703+
{ "test_alignments_null_nodes_or_buf", test_alignments_null_nodes_or_buf },
1704+
{ "test_alignments_node_out_of_bounds", test_alignments_node_out_of_bounds },
16131705
{ "test_alignments_missing_char_collision",
16141706
test_alignments_missing_char_collision },
16151707
{ "test_alignments_zero_nodes_ok", test_alignments_zero_nodes_ok },

c/tskit/genotypes.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,8 @@ tsk_treeseq_decode_alignments_overlay_missing(const tsk_treeseq_t *self,
710710
}
711711
}
712712

713+
/* On success we should return 0, not TSK_TREE_OK from the last tsk_tree_next */
714+
ret = 0;
713715
out:
714716
tsk_tree_free(&tree);
715717
return ret;

python/_tskitmodule.c

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6079,6 +6079,102 @@ TreeSequence_get_individuals_nodes(TreeSequence *self)
60796079
return ret;
60806080
}
60816081

6082+
static PyObject *
6083+
TreeSequence_decode_alignments(TreeSequence *self, PyObject *args, PyObject *kwds)
6084+
{
6085+
int err;
6086+
PyObject *ret = NULL;
6087+
PyObject *py_ref, *py_nodes, *py_missing;
6088+
PyArrayObject *nodes_array = NULL;
6089+
const char *ref_seq;
6090+
Py_ssize_t ref_len, missing_len;
6091+
tsk_id_t *nodes;
6092+
tsk_size_t num_nodes;
6093+
double left, right;
6094+
char missing_char;
6095+
const char *missing_utf8;
6096+
int isolated_as_missing = 1;
6097+
tsk_flags_t options = 0;
6098+
PyObject *buf_obj = NULL;
6099+
char *buf = NULL;
6100+
6101+
static char *kwlist[] = { "reference_sequence", "nodes", "left", "right",
6102+
"missing_data_character", "isolated_as_missing", NULL };
6103+
6104+
if (TreeSequence_check_state(self) != 0) {
6105+
goto out;
6106+
}
6107+
6108+
if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOddOp", kwlist, &py_ref, &py_nodes,
6109+
&left, &right, &py_missing, &isolated_as_missing)) {
6110+
goto out;
6111+
}
6112+
6113+
if (!PyBytes_Check(py_ref)) {
6114+
PyErr_SetString(PyExc_TypeError, "reference_sequence must be bytes");
6115+
goto out;
6116+
}
6117+
if (PyBytes_AsStringAndSize(py_ref, (char **) &ref_seq, &ref_len) < 0) {
6118+
goto out;
6119+
}
6120+
6121+
if (!PyUnicode_Check(py_missing)) {
6122+
PyErr_SetString(
6123+
PyExc_TypeError, "missing_data_character must be a (length 1) string");
6124+
goto out;
6125+
}
6126+
missing_utf8 = PyUnicode_AsUTF8AndSize(py_missing, &missing_len);
6127+
if (missing_utf8 == NULL) {
6128+
goto out;
6129+
}
6130+
if (missing_len != 1) {
6131+
PyErr_SetString(
6132+
PyExc_TypeError, "missing_data_character must be a single character");
6133+
goto out;
6134+
}
6135+
missing_char = missing_utf8[0];
6136+
6137+
if (!isolated_as_missing) {
6138+
options |= TSK_ISOLATED_NOT_MISSING;
6139+
}
6140+
6141+
nodes_array = (PyArrayObject *) PyArray_FROMANY(
6142+
py_nodes, NPY_INT32, 1, 1, NPY_ARRAY_IN_ARRAY);
6143+
if (nodes_array == NULL) {
6144+
goto out;
6145+
}
6146+
num_nodes = (tsk_size_t) PyArray_DIM(nodes_array, 0);
6147+
nodes = PyArray_DATA(nodes_array);
6148+
6149+
buf_obj = PyBytes_FromStringAndSize(
6150+
NULL, (Py_ssize_t)(num_nodes * (tsk_size_t)(right - left)));
6151+
if (buf_obj == NULL) {
6152+
goto out;
6153+
}
6154+
buf = PyBytes_AS_STRING(buf_obj);
6155+
6156+
// clang-format off
6157+
Py_BEGIN_ALLOW_THREADS
6158+
err = tsk_treeseq_decode_alignments(self->tree_sequence,
6159+
ref_seq, (tsk_size_t) ref_len, nodes, num_nodes, left, right, missing_char, buf,
6160+
options);
6161+
Py_END_ALLOW_THREADS
6162+
// clang-format on
6163+
if (err != 0)
6164+
{
6165+
handle_library_error(err);
6166+
goto out;
6167+
}
6168+
6169+
ret = buf_obj;
6170+
buf_obj = NULL;
6171+
6172+
out:
6173+
Py_XDECREF(nodes_array);
6174+
Py_XDECREF(buf_obj);
6175+
return ret;
6176+
}
6177+
60826178
static PyObject *
60836179
TreeSequence_get_mutations_edge(TreeSequence *self)
60846180
{
@@ -8660,6 +8756,10 @@ static PyMethodDef TreeSequence_methods[] = {
86608756
.ml_meth = (PyCFunction) TreeSequence_get_individuals_nodes,
86618757
.ml_flags = METH_NOARGS,
86628758
.ml_doc = "Returns an array of the node ids for each individual" },
8759+
{ .ml_name = "decode_alignments",
8760+
.ml_meth = (PyCFunction) TreeSequence_decode_alignments,
8761+
.ml_flags = METH_VARARGS | METH_KEYWORDS,
8762+
.ml_doc = "Decode full alignments for given nodes and interval." },
86638763
{ .ml_name = "get_mutations_edge",
86648764
.ml_meth = (PyCFunction) TreeSequence_get_mutations_edge,
86658765
.ml_flags = METH_NOARGS,

0 commit comments

Comments
 (0)