From 2665998b097cb1c2d8629176f560dda027084b7e Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 16:37:44 +0300 Subject: [PATCH 01/51] test(datalog): cmp + arith body literals work as documented Added /datalog/cmp_const_filter proving (< W 60) filters EDB rows. Added /datalog/arith_assignment proving (= S (+ A B)) derives arithmetic results across all input rows. Both pass; confirms upstream-master Datalog body cmp + arithmetic infrastructure is fully functional via dl_rule_add_cmp_const, dl_rule_add_assign, and dl_expr_binop. --- test/test_datalog.c | 131 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/test/test_datalog.c b/test/test_datalog.c index 40611078..d979c2d1 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -175,9 +175,140 @@ static MunitResult test_source_prov_requires_flag(const void* params, void* fixt return MUNIT_OK; } +/* Verify cmp body literal filters tuples: rule keeps only rows where col0 < 60. + * + * Program: + * EDB: weight(50), weight(60), weight(75), weight(85) + * Rule: small(W) :- weight(W), (< W 60) + * + * Expected: small has exactly 1 row = 50. 
+ */ +static MunitResult test_cmp_const_filter(const void* params, void* fixture) { /* Registers a one-column EDB named weight, adds small(W) :- weight(W), W < 60, evaluates, and checks the single surviving row. */ + (void)params; (void)fixture; /* munit arguments are unused here */ + + int64_t vals[] = {50, 60, 75, 85}; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 4); /* 4 matches the number of elements in vals[] */ + munit_assert_ptr_not_null(col); + + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); /* 10 matches the length of the column-name string */ + munit_assert_false(RAY_IS_ERR(weight)); + + dl_program_t* prog = dl_program_new(); + munit_assert_ptr_not_null(prog); + + int weight_idx = dl_add_edb(prog, "weight", weight, 1); + munit_assert_int(weight_idx, ==, 0); /* the first relation added to a fresh program is expected at index 0 */ + + /* small(W) :- weight(W), (< W 60) */ + dl_rule_t rule; + dl_rule_init(&rule, "small", 1); + dl_rule_head_var(&rule, 0, 0); /* head var idx 0 = W */ + + int body = dl_rule_add_atom(&rule, "weight", 1); + dl_body_set_var(&rule, body, 0, 0); /* weight(W) */ + + int cmp = dl_rule_add_cmp_const(&rule, DL_CMP_LT, 0, 60); /* W < 60 */ + munit_assert_int(cmp, >=, 0); + + rule.n_vars = 1; /* only variable index 0 (W) is used by this rule */ + munit_assert_int(dl_add_rule(prog, &rule), ==, 0); + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* out = dl_query(prog, "small"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + + ray_t* out_col = ray_table_get_col_idx(out, 0); + munit_assert_ptr_not_null(out_col); + int64_t* od = (int64_t*)ray_data(out_col); + munit_assert_int((int)od[0], ==, 50); /* 50 is the only input value below 60 */ + + dl_program_free(prog); + ray_release(weight); + ray_release(col); + return MUNIT_OK; +} + +/* Verify arithmetic assignment derives a new variable from input columns. + * + * Program: + * EDB: pair(2, 3), pair(5, 7), pair(10, 1) + * Rule: sum_rel(A, B, S) :- pair(A, B), (= S (+ A B)) + * + * Expected: sum_rel has 3 rows: (2,3,5), (5,7,12), (10,1,11). 
+ */ +static MunitResult test_arith_assignment(const void* params, void* fixture) { /* Registers a two-column EDB named pair, adds sum_rel(A, B, S) :- pair(A, B), S = A + B, evaluates, and checks the derived S column. */ + (void)params; (void)fixture; /* munit arguments are unused here */ + + int64_t a_vals[] = {2, 5, 10}; + int64_t b_vals[] = {3, 7, 1}; + ray_t* a_col = ray_vec_from_raw(RAY_I64, a_vals, 3); /* 3 matches the number of elements in a_vals[] and b_vals[] */ + ray_t* b_col = ray_vec_from_raw(RAY_I64, b_vals, 3); + munit_assert_ptr_not_null(a_col); + munit_assert_ptr_not_null(b_col); + + ray_t* pair = ray_table_new(2); + pair = ray_table_add_col(pair, ray_sym_intern("pair__c0", 8), a_col); /* 8 matches the length of the column-name string */ + munit_assert_false(RAY_IS_ERR(pair)); + pair = ray_table_add_col(pair, ray_sym_intern("pair__c1", 8), b_col); + munit_assert_false(RAY_IS_ERR(pair)); + + dl_program_t* prog = dl_program_new(); + munit_assert_ptr_not_null(prog); + munit_assert_int(dl_add_edb(prog, "pair", pair, 2), ==, 0); + + /* sum_rel(A, B, S) :- pair(A, B), (= S (+ A B)) */ + dl_rule_t rule; + dl_rule_init(&rule, "sum_rel", 3); + dl_rule_head_var(&rule, 0, 0); /* A */ + dl_rule_head_var(&rule, 1, 1); /* B */ + dl_rule_head_var(&rule, 2, 2); /* S */ + + int body = dl_rule_add_atom(&rule, "pair", 2); + dl_body_set_var(&rule, body, 0, 0); /* A */ + dl_body_set_var(&rule, body, 1, 1); /* B */ + + /* expr = (+ A B) */ + dl_expr_t* expr = dl_expr_binop(OP_ADD, dl_expr_var(0), dl_expr_var(1)); + munit_assert_ptr_not_null(expr); + int as = dl_rule_add_assign(&rule, 2, DL_OP_EQ, expr); /* S = A + B */ + munit_assert_int(as, >=, 0); + + rule.n_vars = 3; /* variable indices in use: 0 = A, 1 = B, 2 = S */ + munit_assert_int(dl_add_rule(prog, &rule), ==, 0); + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* out = dl_query(prog, "sum_rel"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 3); + + ray_t* s_col = ray_table_get_col_idx(out, 2); + munit_assert_ptr_not_null(s_col); + int64_t* sd = (int64_t*)ray_data(s_col); + /* Sums must include 5, 12, 11 (order may differ). 
*/ + int saw5 = 0, saw12 = 0, saw11 = 0; + for (int i = 0; i < 3; i++) { /* 3 rows, as asserted above; membership check only, the pairing of each sum with its (A, B) row is not verified */ + if (sd[i] == 5) saw5 = 1; + if (sd[i] == 12) saw12 = 1; + if (sd[i] == 11) saw11 = 1; + } + munit_assert_int(saw5, ==, 1); + munit_assert_int(saw12, ==, 1); + munit_assert_int(saw11, ==, 1); + + dl_program_free(prog); + ray_release(pair); + ray_release(a_col); + ray_release(b_col); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, + { "/cmp_const_filter", test_cmp_const_filter, datalog_setup, datalog_teardown, 0, NULL }, + { "/arith_assignment", test_arith_assignment, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 717fab867b911fc139a2c37a379dd4041bcdc119 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 16:41:40 +0300 Subject: [PATCH 02/51] docs(plans): datalog aggregates + ray-exomem consumer refactor Two-phase plan covering: - Phase A (this branch): add count/sum/min/max/avg body literals, float constants in expressions, and (between ?x lo hi) parser sugar to the Datalog engine. - Phase B (ray-exomem branch): delete native_derived_relations procedural Rust and replace with declarative rules that exercise the cmp + arith + neg + (eventually) aggregate features the engine already provides. Phase A's verification baseline is the cmp_const_filter + arith_assignment tests landed in 3ebc6ba. 
--- ...04-18-datalog-aggregates-and-onboarding.md | 1012 +++++++++++++++++ 1 file changed, 1012 insertions(+) create mode 100644 docs/plans/2026-04-18-datalog-aggregates-and-onboarding.md diff --git a/docs/plans/2026-04-18-datalog-aggregates-and-onboarding.md b/docs/plans/2026-04-18-datalog-aggregates-and-onboarding.md new file mode 100644 index 00000000..1556ac09 --- /dev/null +++ b/docs/plans/2026-04-18-datalog-aggregates-and-onboarding.md @@ -0,0 +1,1012 @@ +# Datalog Aggregates + ray-exomem Consumer Refactor — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add aggregate body literals (`count`, `sum`, `min`, `max`, `avg`), float constants in expressions, and a `between` sugar to the rayforce2 Datalog engine; then delete the procedural Rust derivations in ray-exomem `system_schema.rs::native_derived_relations` and replace them with declarative rules expressed entirely in the engine. + +**Architecture:** +- Phase A (rayforce2, branch `feature/datalog-aggregates`): extend `dl_body_t` with a new literal type `DL_AGG`; lower `(count ?N pred)` etc. to a post-fixpoint aggregation pass within the containing rule's stratum. Introduce a tagged constant kind (`DL_EXPR_CONST` / `DL_EXPR_CONST_F64`, see Task A6) so an expression can hold either an integer constant (`dl_expr_const`) or a float constant (`dl_expr_const_f64`). Add `(between ?x lo hi)` as parser sugar lowering to two `DL_CMP` literals. +- Phase B (ray-exomem, separate branch `feature/declarative-derivations`): consume the new operators. Express health-band derivations as Datalog rules attached during exom seeding, and remove the procedural `native_derived_relations` / `known_derived_samples` paths. The procedural lookup tables in `web.rs` for `health/recommended-water-ml` etc. are deleted. 
+ +**Tech Stack:** rayforce2 C engine (datalog.h/datalog.c), munit, ray-exomem Rust (`brain.rs`, `system_schema.rs`, `server.rs`, `auth/routes.rs`). + +**Branch state at plan write time:** `feature/datalog-aggregates` HEAD = commit `3ebc6ba` (`test(datalog): cmp + arith body literals work as documented`) which proves cmp + arith already work in upstream master and is the baseline this plan extends. + +--- + +## Phase A — rayforce2: Datalog++ extensions + +### Task A1: Define aggregate body-literal type + constants + +**Files:** +- Modify: `src/ops/datalog.h` (add `DL_AGG`, the `DL_AGG_*` operator constants, fields on `dl_body_t`) + +- [ ] **Step 1: Add `DL_AGG` to body literal type constants** + +In `src/ops/datalog.h`, after the existing `DL_INTERVAL 5` line, add: + +```c +#define DL_AGG 6 /* aggregate: (count ?N pred), (sum ?S pred col), ... */ +``` + +- [ ] **Step 2: Add aggregate operator constants** + +In `src/ops/datalog.h`, after the comparison-operator block, add: + +```c +/* ===== Aggregate operators (for DL_AGG) ===== */ +#define DL_AGG_COUNT 0 +#define DL_AGG_SUM 1 +#define DL_AGG_MIN 2 +#define DL_AGG_MAX 3 +#define DL_AGG_AVG 4 +``` + +- [ ] **Step 3: Extend `dl_body_t` with aggregate fields** + +In `src/ops/datalog.h`, inside the `dl_body_t` struct (after the existing `interval_*` fields), add: + +```c + int agg_op; /* aggregate operator (for DL_AGG) */ + int agg_target_var; /* variable that receives the aggregate result */ + char agg_pred[64]; /* predicate name being aggregated over */ + int agg_arity; /* arity of agg_pred */ + int agg_value_col; /* column index inside agg_pred to aggregate (sum/min/max/avg) */ +``` + +- [ ] **Step 4: Add public builder API** + +In `src/ops/datalog.h`, after `dl_rule_add_interval(...)`, add: + +```c +/* Add an aggregate body literal: (op ?target pred col) + * - op: DL_AGG_COUNT (col is ignored), DL_AGG_SUM/MIN/MAX/AVG + * - target_var: variable that receives the aggregate result + * - pred: predicate to aggregate over + * - pred_arity: 
arity of that predicate + * - value_col: which column to aggregate (ignored for COUNT) + * Returns body literal index. */ +int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var, + const char* pred, int pred_arity, int value_col); +``` + +- [ ] **Step 5: Compile to verify header parses** + +Run: `make lib` +Expected: clean build, no `-Werror` failures. + +- [ ] **Step 6: Commit** + +```bash +git add src/ops/datalog.h +git commit -m "feat(datalog): declare DL_AGG body literal + aggregate operators" +``` + +### Task A2: Implement `dl_rule_add_agg` builder + +**Files:** +- Modify: `src/ops/datalog.c` (add builder near existing `dl_rule_add_interval`) +- Test: `test/test_datalog.c` + +- [ ] **Step 1: Write the failing test** + +Append to `test/test_datalog.c` (before the `datalog_tests[]` array): + +```c +/* Verify dl_rule_add_agg populates body fields correctly. */ +static MunitResult test_agg_builder(const void* params, void* fixture) { + (void)params; (void)fixture; + dl_rule_t rule; + dl_rule_init(&rule, "stats", 1); + dl_rule_head_var(&rule, 0, 0); + + int idx = dl_rule_add_agg(&rule, DL_AGG_COUNT, 0, "weight", 1, 0); + munit_assert_int(idx, ==, 0); + munit_assert_int(rule.body[0].type, ==, DL_AGG); + munit_assert_int(rule.body[0].agg_op, ==, DL_AGG_COUNT); + munit_assert_int(rule.body[0].agg_target_var, ==, 0); + munit_assert_string_equal(rule.body[0].agg_pred, "weight"); + munit_assert_int(rule.body[0].agg_arity, ==, 1); + munit_assert_int(rule.body[0].agg_value_col, ==, 0); + return MUNIT_OK; +} +``` + +Add to the `datalog_tests[]` array: + +```c + { "/agg_builder", test_agg_builder, datalog_setup, datalog_teardown, 0, NULL }, +``` + +- [ ] **Step 2: Run test, expect failure** + +Run: `make test 2>&1 | grep -E "agg_builder|error"` +Expected: link error `undefined symbol dl_rule_add_agg`. 
+ +- [ ] **Step 3: Implement the builder** + +In `src/ops/datalog.c`, after `dl_rule_add_interval` (around line 363), add: + +```c +int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var, + const char* pred, int pred_arity, int value_col) { + if (rule->n_body >= DL_MAX_BODY) return -1; + int idx = rule->n_body++; + dl_body_t* b = &rule->body[idx]; + memset(b, 0, sizeof(*b)); + b->type = DL_AGG; + b->agg_op = op; + b->agg_target_var = target_var; + snprintf(b->agg_pred, sizeof(b->agg_pred), "%s", pred); + b->agg_arity = pred_arity; + b->agg_value_col = value_col; + return idx; +} +``` + +- [ ] **Step 4: Run test, expect pass** + +Run: `make test 2>&1 | grep -E "agg_builder"` +Expected: `/datalog/agg_builder [ OK ]`. + +- [ ] **Step 5: Commit** + +```bash +git add src/ops/datalog.c test/test_datalog.c +git commit -m "feat(datalog): dl_rule_add_agg builder for aggregate literals" +``` + +### Task A3: Stratification — treat aggregates as non-monotonic + +Aggregates are non-monotonic (adding a fact can change `min`); they must live in a higher stratum than the predicate they aggregate. The existing `dl_stratify` topological sort treats negation as the only non-monotonic edge — extend it to treat aggregate dependencies the same way. + +**Files:** +- Modify: `src/ops/datalog.c` (function building the negation edge set, currently driven by `DL_NEG`) + +- [ ] **Step 1: Locate stratification edge construction** + +Run: `grep -n "DL_NEG" src/ops/datalog.c | head` +Identify the loop that pushes negative dependency edges (typically inside `dl_stratify` or a helper named `dl_build_dep_graph`). + +- [ ] **Step 2: Write the failing test** + +Append to `test/test_datalog.c`: + +```c +/* Aggregates over an IDB must be evaluated in a strictly higher stratum + * than the IDB itself. Program: + * EDB: edge(1,2), edge(2,3) + * Rule R0: path(X,Y) :- edge(X,Y) + * Rule R1: path_count(N) :- (count ?N path) + * After stratification: R1.stratum > R0.stratum. 
*/ +static MunitResult test_agg_stratifies_above_source(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t s_vals[] = {1, 2}; + int64_t d_vals[] = {2, 3}; + ray_t* sc = ray_vec_from_raw(RAY_I64, s_vals, 2); + ray_t* dc = ray_vec_from_raw(RAY_I64, d_vals, 2); + ray_t* edge = ray_table_new(2); + edge = ray_table_add_col(edge, ray_sym_intern("edge__c0", 8), sc); + edge = ray_table_add_col(edge, ray_sym_intern("edge__c1", 8), dc); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "edge", edge, 2); + + dl_rule_t r0; dl_rule_init(&r0, "path", 2); + dl_rule_head_var(&r0, 0, 0); dl_rule_head_var(&r0, 1, 1); + int b = dl_rule_add_atom(&r0, "edge", 2); + dl_body_set_var(&r0, b, 0, 0); dl_body_set_var(&r0, b, 1, 1); + r0.n_vars = 2; + dl_add_rule(prog, &r0); + + dl_rule_t r1; dl_rule_init(&r1, "path_count", 1); + dl_rule_head_var(&r1, 0, 0); + dl_rule_add_agg(&r1, DL_AGG_COUNT, 0, "path", 2, 0); + r1.n_vars = 1; + dl_add_rule(prog, &r1); + + munit_assert_int(dl_stratify(prog), ==, 0); + munit_assert_int(prog->rules[1].stratum, >, prog->rules[0].stratum); + + dl_program_free(prog); + ray_release(edge); ray_release(sc); ray_release(dc); + return MUNIT_OK; +} +``` + +Add `{ "/agg_stratifies_above_source", test_agg_stratifies_above_source, ... }` to `datalog_tests[]`. + +- [ ] **Step 3: Run test, expect failure** + +Run: `make test 2>&1 | grep agg_stratifies` +Expected: assertion failure `r1.stratum > r0.stratum` because the aggregate edge is currently invisible to the stratifier (or the program is mis-stratified). 
+ +- [ ] **Step 4: Add aggregate edges to dependency graph** + +In the dependency-graph builder identified in Step 1, add a branch for `DL_AGG` mirroring the `DL_NEG` branch: + +```c +} else if (b->type == DL_AGG) { + int src_idx = dl_find_rel(prog, b->agg_pred); + if (src_idx >= 0) { + /* aggregate creates non-monotonic dependency: head depends negatively on source */ + add_dep_edge(graph, head_rel_idx, src_idx, /*negative=*/true); + } +} +``` + +(Use the helper names actually present in `datalog.c` — adapt the snippet to match.) + +- [ ] **Step 5: Run test, expect pass** + +Run: `make test 2>&1 | grep agg_stratifies` +Expected: `[ OK ]`. + +- [ ] **Step 6: Commit** + +```bash +git add src/ops/datalog.c test/test_datalog.c +git commit -m "feat(datalog): aggregates participate in stratification (non-monotonic)" +``` + +### Task A4: Evaluate aggregates inside `dl_compile_rule` + +Aggregates fire after the source predicate's stratum reaches fixpoint. The simplest implementation: when compiling a rule whose body contains a `DL_AGG` literal, generate a graph node that scans the source IDB/EDB and reduces it. + +**Files:** +- Modify: `src/ops/datalog.c` — extend `dl_compile_rule` + +- [ ] **Step 1: Write failing test (count over EDB)** + +Append: + +```c +/* (count ?N weight) where weight has 4 rows -> N = 4. 
*/ +static MunitResult test_agg_count_edb(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t vals[] = {50, 60, 75, 85}; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 4); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wcount", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_COUNT, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wcount"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 4); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} +``` + +Add to `datalog_tests[]`. + +- [ ] **Step 2: Run test, expect failure** + +Run: `make test 2>&1 | grep agg_count_edb` +Expected: assertion failure (`wcount` table has 0 rows because compiler ignores `DL_AGG`). + +- [ ] **Step 3: Extend `dl_compile_rule`** + +Locate `dl_compile_rule` in `src/ops/datalog.c` (function declared at `datalog.h:278`). After the loop that emits nodes for atoms / cmp / assign, add a branch: + +```c +case DL_AGG: { + int src_idx = dl_find_rel(prog, b->agg_pred); + if (src_idx < 0) return NULL; /* unknown predicate */ + ray_t* src_table = prog->rels[src_idx].table; + int64_t nrows = src_table ? ray_table_nrows(src_table) : 0; + + int64_t result; + switch (b->agg_op) { + case DL_AGG_COUNT: result = nrows; break; + case DL_AGG_SUM: + case DL_AGG_MIN: + case DL_AGG_MAX: + case DL_AGG_AVG: { + ray_t* vc = ray_table_get_col_idx(src_table, b->agg_value_col); + int64_t* vd = vc ? 
(int64_t*)ray_data(vc) : NULL; + if (!vd || nrows == 0) { result = 0; break; } + int64_t acc = vd[0]; + for (int64_t i = 1; i < nrows; i++) { + if (b->agg_op == DL_AGG_SUM) acc += vd[i]; + else if (b->agg_op == DL_AGG_MIN) { if (vd[i] < acc) acc = vd[i]; } + else if (b->agg_op == DL_AGG_MAX) { if (vd[i] > acc) acc = vd[i]; } + else /* AVG */ acc += vd[i]; + } + result = (b->agg_op == DL_AGG_AVG) ? acc / nrows : acc; + break; + } + default: return NULL; + } + /* Bind result to the aggregate's target variable in the head's binding row. */ + bind_const_to_var(g, b->agg_target_var, result); + break; +} +``` + +(Adapt `bind_const_to_var` to whatever helper the existing `DL_ASSIGN` path uses.) + +- [ ] **Step 4: Run test, expect pass** + +Run: `make test 2>&1 | grep agg_count_edb` +Expected: `[ OK ]`. + +- [ ] **Step 5: Add SUM/MIN/MAX/AVG tests** + +Append four parallel tests using the same `weight` EDB (50, 60, 75, 85): +- `test_agg_sum`: expect 270 +- `test_agg_min`: expect 50 +- `test_agg_max`: expect 85 +- `test_agg_avg`: expect 67 (integer div: 270/4) + +Each rule looks like: +```c +dl_rule_add_agg(&r, DL_AGG_SUM, 0, "weight", 1, 0); +``` + +- [ ] **Step 6: Run all aggregate tests** + +Run: `make test 2>&1 | grep '/datalog/agg'` +Expected: all `[ OK ]`. + +- [ ] **Step 7: Commit** + +```bash +git add src/ops/datalog.c test/test_datalog.c +git commit -m "feat(datalog): evaluate count/sum/min/max/avg aggregates" +``` + +### Task A5: Surface-syntax parser for aggregates + +Allow Rayfall users to write `(count ?N weight)` and `(sum ?S weight 0)` in rule bodies, parsed by `dl_parse_body_clause` (`src/ops/datalog.c:2336`). + +**Files:** +- Modify: `src/ops/datalog.c` (add `dl_is_aggregate`, branch inside `dl_parse_body_clause`) + +- [ ] **Step 1: Write failing test through surface parser** + +Append a test that goes through `ray_rule_fn` (the special form). Use existing test_lang harnesses as a template — e.g. 
find how `test_lang_rf.inc` calls `eval_rf_string("(rule ...)")`. + +Skeleton: + +```c +static MunitResult test_agg_parse_count(const void* params, void* fixture) { + (void)params; (void)fixture; + /* (rule (wcount ?N) (count ?N weight)) */ + /* Run via lang eval helper, then dl_query. */ + ... +} +``` + +If a lang harness is unavailable from `test_datalog.c`, the equivalent integration test must live alongside the rule-evaluation tests in `test/test_lang_rf.inc`. + +- [ ] **Step 2: Add `dl_is_aggregate` helper** + +In `src/ops/datalog.c` near `dl_is_assignment` (around line 2281): + +```c +static bool dl_is_aggregate(ray_t* clause) { + if (!is_list(clause) || ray_len(clause) < 3) return false; + ray_t** ce = (ray_t**)ray_data(clause); + if (ce[0]->type != -RAY_SYM) return false; + ray_t* name = ray_sym_str(ce[0]->i64); + if (!name) return false; + const char* n = ray_str_ptr(name); + return strcmp(n, "count") == 0 || strcmp(n, "sum") == 0 + || strcmp(n, "min") == 0 || strcmp(n, "max") == 0 + || strcmp(n, "avg") == 0; +} + +static int dl_agg_op_from_name(const char* n) { + if (strcmp(n, "count") == 0) return DL_AGG_COUNT; + if (strcmp(n, "sum") == 0) return DL_AGG_SUM; + if (strcmp(n, "min") == 0) return DL_AGG_MIN; + if (strcmp(n, "max") == 0) return DL_AGG_MAX; + if (strcmp(n, "avg") == 0) return DL_AGG_AVG; + return -1; +} +``` + +- [ ] **Step 3: Branch in `dl_parse_body_clause`** + +In `src/ops/datalog.c`, immediately before the assignment branch (line 2400), insert: + +```c +/* -- Aggregate: (count ?N pred) | (sum ?S pred col) | ... 
-- */ +if (dl_is_aggregate(clause)) { + ray_t* op_name = ray_sym_str(ce[0]->i64); + int op = dl_agg_op_from_name(ray_str_ptr(op_name)); + + if (!is_dl_var(ce[1])) + return ray_error("type", "aggregate target must be a ?variable"); + int target_vi = dl_var_get_or_create(vars, ce[1]->i64); + + if (ce[2]->type != -RAY_SYM) + return ray_error("type", "aggregate source predicate must be a symbol"); + ray_t* pred_str = ray_sym_str(ce[2]->i64); + + int value_col = 0; + if (op != DL_AGG_COUNT) { + if (clen < 4 || ce[3]->type != -RAY_I64) + return ray_error("type", "sum/min/max/avg requires explicit column index"); + value_col = (int)ce[3]->i64; + } + + /* Need predicate arity — best effort: use existing relation if registered, + * otherwise default to 1 (engine will validate at eval time). */ + int rel_idx = dl_find_rel(rule_prog_unused_param, ray_str_ptr(pred_str)); + int arity = (rel_idx >= 0) ? rule_prog_unused_param->rels[rel_idx].arity : 1; + + dl_rule_add_agg(rule, op, target_vi, ray_str_ptr(pred_str), arity, value_col); + return NULL; +} +``` + +Note: `dl_parse_body_clause` does not currently take the `dl_program_t*`. Adding it requires updating its signature plus all callers (`dl_parse_inline_rule`, `ray_rule_fn`, etc.). Either thread the program through or store the pred name + look up arity at compile time. Pick one approach in this step, document it in the commit. + +- [ ] **Step 4: Run test, expect pass** + +Run: `make test 2>&1 | grep -E "agg_parse"` +Expected: `[ OK ]`. + +- [ ] **Step 5: Commit** + +```bash +git add src/ops/datalog.c test/test_datalog.c test/test_lang_rf.inc +git commit -m "feat(datalog): surface syntax (count|sum|min|max|avg ?v pred [col])" +``` + +### Task A6: Float constants in `dl_expr_t` + +Today `dl_expr_t.const_val` is `int64_t` and `dl_expr_const(int64_t)` is the only way to build a constant. Add a sibling `dl_expr_const_f64(double)` and a tag field so the evaluator and arithmetic builtins know which representation to use. 
+ +**Files:** +- Modify: `src/ops/datalog.h` (extend `dl_expr_t`, add `DL_EXPR_CONST_F64` enum value) +- Modify: `src/ops/datalog.c` (extend `dl_build_expr`, add `dl_expr_const_f64`) +- Test: `test/test_datalog.c` + +- [ ] **Step 1: Extend the AST** + +In `src/ops/datalog.h`, change `dl_expr_kind_t`: + +```c +typedef enum { + DL_EXPR_CONST, /* integer constant (back-compat) */ + DL_EXPR_CONST_F64, /* float constant */ + DL_EXPR_VAR, + DL_EXPR_BINOP, +} dl_expr_kind_t; +``` + +And the struct: + +```c +typedef struct dl_expr { + dl_expr_kind_t kind; + int64_t const_val; /* DL_EXPR_CONST */ + double const_f64; /* DL_EXPR_CONST_F64 */ + int var_idx; + int binop; + struct dl_expr *left, *right; +} dl_expr_t; +``` + +- [ ] **Step 2: Builder + parser hook** + +In `src/ops/datalog.c` add: + +```c +dl_expr_t* dl_expr_const_f64(double v) { + dl_expr_t* e = calloc(1, sizeof(*e)); + if (!e) return NULL; + e->kind = DL_EXPR_CONST_F64; + e->const_f64 = v; + return e; +} +``` + +In `dl_build_expr` (line 2203), accept `RAY_F64` literals: + +```c +if (node->type == -RAY_F64) + return dl_expr_const_f64(node->f64); +``` + +- [ ] **Step 3: Evaluator coverage** + +Locate the expression evaluator (likely `dl_eval_expr` or inlined inside `dl_compile_rule`). Add a `DL_EXPR_CONST_F64` arm that returns the float as the materialized value. Coerce when mixed-arithmetic is requested (promote i64 → f64 if either side is f64). + +- [ ] **Step 4: Test float assignment** + +Append: + +```c +/* Float arithmetic: (= ?z (+ 1.5 2.5)) -> 4.0 (assuming i64 coercion to f64). 
*/ +static MunitResult test_arith_assign_f64(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t one[] = { 1 }; + ray_t* col = ray_vec_from_raw(RAY_I64, one, 1); + ray_t* trig = ray_table_new(1); + trig = ray_table_add_col(trig, ray_sym_intern("trig__c0", 8), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "trig", trig, 1); + + dl_rule_t r; dl_rule_init(&r, "fres", 2); + dl_rule_head_var(&r, 0, 0); dl_rule_head_var(&r, 1, 1); + int b = dl_rule_add_atom(&r, "trig", 1); + dl_body_set_var(&r, b, 0, 0); + + dl_expr_t* e = dl_expr_binop(OP_ADD, dl_expr_const_f64(1.5), dl_expr_const_f64(2.5)); + dl_rule_add_assign(&r, 1, DL_OP_EQ, e); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "fres"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + /* Assert the result column type or value depending on coercion choice. */ + + dl_program_free(prog); + ray_release(trig); ray_release(col); + return MUNIT_OK; +} +``` + +- [ ] **Step 5: Commit** + +```bash +git add src/ops/datalog.h src/ops/datalog.c test/test_datalog.c +git commit -m "feat(datalog): float constants in expressions; mixed-mode arithmetic" +``` + +### Task A7: `(between ?x lo hi)` parser sugar + +Pure parser-level rewrite: `(between ?x lo hi)` lowers to `(>= ?x lo)` + `(<= ?x hi)` — two `DL_CMP` body literals. No new evaluator code. + +**Files:** +- Modify: `src/ops/datalog.c` (`dl_parse_body_clause` — add branch) +- Test: `test/test_datalog.c` + +- [ ] **Step 1: Failing test** + +```c +/* (rule (mid ?w) (weight ?w) (between ?w 60 80)) + * weight has 50, 60, 75, 85 -> mid has 60, 75. 
*/ +static MunitResult test_between_sugar(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t vals[] = {50, 60, 75, 85}; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 4); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + /* Manually build to mirror what the parser will produce; once the + * parser is in, swap to a (rule ...) eval call. */ + dl_rule_t r; dl_rule_init(&r, "mid", 1); + dl_rule_head_var(&r, 0, 0); + int b = dl_rule_add_atom(&r, "weight", 1); dl_body_set_var(&r, b, 0, 0); + dl_rule_add_cmp_const(&r, DL_CMP_GE, 0, 60); + dl_rule_add_cmp_const(&r, DL_CMP_LE, 0, 80); + r.n_vars = 1; + dl_add_rule(prog, &r); + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "mid"); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} +``` + +This test passes immediately because it sidesteps the parser. Then add a parallel surface-syntax test that requires the parser change. 
+ +- [ ] **Step 2: Implement parser sugar** + +In `dl_parse_body_clause`, before the assignment branch: + +```c +if (clen == 4 && ce[0]->type == -RAY_SYM) { + ray_t* nm = ray_sym_str(ce[0]->i64); + if (nm && strcmp(ray_str_ptr(nm), "between") == 0) { + if (!is_dl_var(ce[1])) + return ray_error("type", "between target must be a ?variable"); + int vi = dl_var_get_or_create(vars, ce[1]->i64); + if (ce[2]->type != -RAY_I64 || ce[3]->type != -RAY_I64) + return ray_error("type", "between bounds must be integer constants"); + dl_rule_add_cmp_const(rule, DL_CMP_GE, vi, ce[2]->i64); + dl_rule_add_cmp_const(rule, DL_CMP_LE, vi, ce[3]->i64); + return NULL; + } +} +``` + +- [ ] **Step 3: Test surface parsing via lang harness** + +Add to `test/test_lang_rf.inc`: + +```c +/* Confirm (between ?x lo hi) lowers to two cmp literals at the parser. */ +``` + +(Use the project's existing `eval_rf_string` / rule-eval helper; mirror nearby tests.) + +- [ ] **Step 4: Run + commit** + +```bash +make test +git add src/ops/datalog.c test/test_datalog.c test/test_lang_rf.inc +git commit -m "feat(datalog): (between ?x lo hi) parser sugar lowers to two cmps" +``` + +### Task A8: Push Phase A branch + +- [ ] **Step 1: Push to origin** + +```bash +git push origin feature/datalog-aggregates +``` + +- [ ] **Step 2: Open draft PR against `RayforceDB/rayforce2:master`** + +Use `gh pr create --draft --base master --head theaspirational:feature/datalog-aggregates`. Title: "feat(datalog): aggregates, float constants, and `between` sugar". Body lists Task IDs from this plan. + +--- + +## Phase B — ray-exomem: declarative health derivations + +Phase B runs on a separate branch `feature/declarative-derivations` in `/Users/aspirational/Documents/code/lynx/Teide/ray-exomem`. It depends on Phase A's float support only if templates use float thresholds; the integer-only path can land first. + +**Note on dependency direction:** `ray-exomem` consumes `rayforce2` via the sibling checkout (`build.rs:97`). 
To use Phase A features locally, switch the rayforce2 checkout to `feature/datalog-aggregates` before building ray-exomem. To consume from the upstream merge, wait until the PR from Task A8 is merged into `RayforceDB/rayforce2:master` and the sibling is back on `master`. + +### Task B1: Capture current native_derived_relations behavior in tests + +**Files:** +- Test: `src/system_schema.rs` (extend the existing `tests` module that contains `native_derived_relations` cases — see line 816) + +- [ ] **Step 1: Read the existing test** + +```bash +sed -n '810,860p' /Users/aspirational/Documents/code/lynx/Teide/ray-exomem/src/system_schema.rs +``` + +Note the inputs (age=30, height=175, weight=75) and the expected derived relations. + +- [ ] **Step 2: Add a regression test that pins the public-facing rule strings** + +In the existing test module, add: + +```rust +#[test] +fn native_derived_water_band_for_default_profile() { + let exom = "alice/personal/health/main"; + let mut brain = Brain::new(); + let ctx = MutationContext::default(); + brain.assert_fact(HEALTH_PROFILE_HEIGHT_CM_FACT_ID, PROFILE_HEIGHT_CM, "175", + 1.0, "test", None, None, &ctx).unwrap(); + brain.assert_fact(HEALTH_PROFILE_WEIGHT_KG_FACT_ID, PROFILE_WEIGHT_KG, "75", + 1.0, "test", None, None, &ctx).unwrap(); + + let rels = native_derived_relations(exom, &brain); + let band = rels.iter().find(|r| r.name == HEALTH_WATER_BAND).unwrap(); + assert_eq!(band.sample_tuples, vec![vec!["medium".to_string()]]); +} +``` + +Repeat for `step_band` with age 30 → "high". 
+ +- [ ] **Step 3: Run + commit** + +```bash +cargo test -p ray-exomem system_schema::tests::native_derived_water_band_for_default_profile +git add src/system_schema.rs +git commit -m "test(system_schema): pin water/step band derivations before refactor" +``` + +### Task B2: Express water_band as a Datalog rule + +The current Rust: + +```rust +let band = if weight_kg < 60 && height_cm < 170 { "small" } + else if weight_kg >= 85 || height_cm >= 185 { "large" } + else { "medium" }; +``` + +Four rules in Datalog — the `||` branch becomes two separate `large` rules (using `<`/`>=`/`not`, already supported by the engine): + +```scheme +(rule {exom} + (health/water-band "small") + (?w_id 'health/profile/weight_kg ?w) + (?h_id 'health/profile/height_cm ?h) + (< ?w 60) + (< ?h 170)) + +(rule {exom} + (health/water-band "large") + (?w_id 'health/profile/weight_kg ?w) + (?h_id 'health/profile/height_cm ?h) + (>= ?w 85)) + +(rule {exom} + (health/water-band "large") + (?w_id 'health/profile/weight_kg ?w) + (?h_id 'health/profile/height_cm ?h) + (>= ?h 185)) + +(rule {exom} + (health/water-band "medium") + (?w_id 'health/profile/weight_kg ?w) + (?h_id 'health/profile/height_cm ?h) + (not (health/water-band "small")) + (not (health/water-band "large"))) +``` + +The fourth rule uses negation; this requires the existing stratifier (already in upstream) to keep `medium` in a higher stratum than `small`/`large`. NOTE(review): all four rules share the head predicate `health/water-band`, and `dl_stratify` tracks dependencies per relation, so the negated `health/water-band` atoms make the relation depend negatively on itself — confirm the stratifier accepts this; if it reports a negative cycle, derive `small`/`large` into a helper predicate and negate that instead. 
+ +**Files:** +- Modify: `src/auth/routes.rs` (`health_bootstrap_rules`, lines 204-225) + +- [ ] **Step 1: Replace `health_bootstrap_rules` body with the four rules above** + +Show full replacement code (the existing function returns a `Vec<String>`, format the new rules with `format!` so the `{exom}` token is interpolated): + +```rust +fn health_bootstrap_rules(exom: &str) -> Vec<String> { + vec![ + format!( + r#"(rule {exom} (health/water-band "small") \ + (?w_id 'health/profile/weight_kg ?w) \ + (?h_id 'health/profile/height_cm ?h) \ + (< ?w 60) (< ?h 170))"# + ), + format!( + r#"(rule {exom} (health/water-band "large") \ + (?w_id 'health/profile/weight_kg ?w) \ + (?h_id 'health/profile/height_cm ?h) \ + (>= ?w 85))"# + ), + format!( + r#"(rule {exom} (health/water-band "large") \ + (?w_id 'health/profile/weight_kg ?w) \ + (?h_id 'health/profile/height_cm ?h) \ + (>= ?h 185))"# + ), + format!( + r#"(rule {exom} (health/water-band "medium") \ + (?w_id 'health/profile/weight_kg ?w) \ + (?h_id 'health/profile/height_cm ?h) \ + (not (health/water-band "small")) \ + (not (health/water-band "large")))"# + ), + // step bands inserted in Task B3 + ] +} +``` + +(Remove the embedded escaped newlines once you confirm the rule parser handles single-line strings; the `\` is just for readability here.) + +- [ ] **Step 2: Run cargo build to surface format errors** + +```bash +cargo build --release --features postgres +``` + +- [ ] **Step 3: Verify against running daemon** + +(per CLAUDE.md: "When adding or modifying any db/rayfall interactions test them against the running ray-exomem daemon.") + +```bash +set -a; source .env; set +a +ray-exomem stop +ray-exomem serve --bind 127.0.0.1:9780 \ + --auth-provider google --google-client-id "$GOOGLE_CLIENT_ID" \ + --allowed-domains "$ALLOWED_DOMAINS" --database-url "$DATABASE_URL" & +SERVE_PID=$! 
+sleep 3 +# As an authenticated session, query water-band: +curl -s 'http://127.0.0.1:9780/ray-exomem/api/query' \ + -H 'Cookie: <session-cookie>' \ + -d '{"exom":"<user>/personal/health/main","rayfall":"(query (?b) (health/water-band ?b))"}' +kill $SERVE_PID +``` + +Expected: returns `["medium"]` for the default profile (weight=75, height=175). + +- [ ] **Step 4: Commit** + +```bash +git add src/auth/routes.rs +git commit -m "refactor(onboarding): water_band as declarative Datalog rules" +``` + +### Task B3: Express step_band as a Datalog rule + +Mirror Task B2 for step bands — three rules covering `age < 30 → high`, `age < 50 → medium`, otherwise `gentle`. + +- [ ] **Step 1: Append to `health_bootstrap_rules`** + +```rust + format!(r#"(rule {exom} (health/step-band "high") + (?id 'health/profile/age ?a) (< ?a 30))"#), + format!(r#"(rule {exom} (health/step-band "medium") + (?id 'health/profile/age ?a) (>= ?a 30) (< ?a 50))"#), + format!(r#"(rule {exom} (health/step-band "gentle") + (?id 'health/profile/age ?a) (>= ?a 50))"#), +``` + +- [ ] **Step 2: Test against running daemon (same harness as B2)** + +Expected: `(query (?b) (health/step-band ?b))` returns `["medium"]` for age=30. Boundary check: the original Rust uses `if age < 30 → "high"`, so age 30 matches `>= 30` and `< 50` but **NOT** `< 30`, giving `"medium"`, not `"high"`. The rules from Step 1 already encode this boundary; they are repeated here for reference: + +```rust + format!(r#"(rule {exom} (health/step-band "high") + (?id 'health/profile/age ?a) (< ?a 30))"#), + format!(r#"(rule {exom} (health/step-band "medium") + (?id 'health/profile/age ?a) (>= ?a 30) (< ?a 50))"#), + format!(r#"(rule {exom} (health/step-band "gentle") + (?id 'health/profile/age ?a) (>= ?a 50))"#), +``` + +The existing Rust returns "medium" for age=30 (because `30 < 30` is false → falls through to next branch which is `< 50` → "medium"). 
Confirm the rules match by re-reading `system_schema.rs:676`: + +```rust +let band = if age < 30 { "high" } + else if age < 50 { "medium" } + else { "gentle" }; +``` + +So age=30 → "medium". The rule set above matches. + +- [ ] **Step 3: Commit** + +```bash +git add src/auth/routes.rs +git commit -m "refactor(onboarding): step_band as declarative Datalog rules" +``` + +### Task B4: Delete `native_derived_relations` and consumers + +**Files:** +- Modify: `src/system_schema.rs` (delete `native_derived_relations` function and its call sites) +- Modify: `src/server.rs` (delete `known_derived_samples` block at line 3583-3641) + +- [ ] **Step 1: Inventory call sites** + +```bash +grep -rn "native_derived_relations\|known_derived_samples" src/ +``` + +Expected hits: `system_schema.rs:640,703,741,816`, `server.rs:3585,3598`. + +- [ ] **Step 2: Remove call sites in `server.rs`** + +In `src/server.rs`, delete lines 3583-3641 (the entire `known_derived_samples` block including the hardcoded "small"/"medium"/"large" → "2000"/"2500"/"3000" lookup). Replace with: + +```rust +let known_derived_samples: HashMap<String, Vec<Vec<String>>> = HashMap::new(); +``` + +(or remove the variable entirely if downstream code can be updated to skip the field.) + +- [ ] **Step 3: Remove `native_derived_relations` from `system_schema.rs`** + +Delete: +- The `native_derived_relations` function (lines 640-698) +- Its inclusion in `builtin_rule_specs` (line 702-706) +- The test `native_derived_water_band_for_default_profile` only if it now relies on deleted code; otherwise rewrite it to query the engine end-to-end. 
+ +Also delete now-unused constants if no other consumer remains: +- `HEALTH_WATER_BAND`, `HEALTH_STEP_BAND` (lines 80-81) +- `HEALTH_PROFILE_*_FACT_ID` constants +- `latest_active_fact` helper (if unused after deletion) + +- [ ] **Step 4: Build + test** + +```bash +cargo build --release --features postgres +cargo test +``` + +Expected: green; the regression test from B1 now goes through the engine because `health_bootstrap_rules` does the work. + +- [ ] **Step 5: Run end-to-end against daemon** + +Repeat the query harness from Task B2/B3. Confirm both `health/water-band` and `health/step-band` return correct values without any procedural Rust in the path. + +- [ ] **Step 6: Commit** + +```bash +git add src/system_schema.rs src/server.rs +git commit -m "refactor(onboarding): delete procedural native_derived_relations" +``` + +### Task B5: Replace hardcoded recommended-water/steps lookup + +The bootstrap rules at `auth/routes.rs:207-223` already express water-ml/steps-per-day as Datalog. After Task B2/B3 the engine itself derives `water-band` and `step-band`, so these existing rules naturally become reachable. Verify and clean up: + +- [ ] **Step 1: Confirm bootstrap rules still work** + +Run the daemon with a fresh DB; log in; query: + +```scheme +(query (?ml) (health/recommended-water-ml ?ml)) +(query (?sp) (health/recommended-steps-per-day ?sp)) +``` + +Expected: returns the value matching the derived band. (E.g. default profile → `medium` band → `"2500"` ml.) + +- [ ] **Step 2: If broken, debug by querying intermediate predicates** + +```scheme +(query (?b) (health/water-band ?b)) +(query (?b) (health/step-band ?b)) +``` + +This isolates whether B2/B3 rules fired correctly. 
+ +- [ ] **Step 3: Commit any fixes** + +```bash +git commit -am "fix(onboarding): wire derived bands into recommended-* lookups" +``` + +### Task B6: Pin rayforce2 dependency in CLAUDE.md + +If Phase A features are needed (aggregates / float / between), the consumer must check out the matching rayforce2 branch. + +**Files:** +- Modify: `/Users/aspirational/Documents/code/lynx/Teide/ray-exomem/CLAUDE.md` (Important gotchas section) + +- [ ] **Step 1: Add a note** + +Insert under "Important gotchas": + +```markdown +- The bootstrap health rules use `<` / `>=` / `not` in Datalog rule bodies. These are + available in upstream rayforce2 master (commit 287aebf+). If aggregate-based rules + are added (count/sum/min/max/avg) or float thresholds are introduced, the sibling + rayforce2 checkout must be on `feature/datalog-aggregates` or a master that + includes the merged PR. +``` + +- [ ] **Step 2: Commit** + +```bash +git add CLAUDE.md +git commit -m "docs: note rayforce2 datalog feature dependency" +``` + +--- + +## Out of scope + +- Onboarding template system (separate plan): TOML-driven exom seeding, `--onboarding-templates` CLI flag, `/welcome` UI, multi-template catalog. That work consumes the artifacts of this plan but is independently scoped. +- ASP overlay / weak constraints / disjunctive rules: parked. +- Group-by aggregates (e.g. `count` per partition): only ungrouped aggregates in scope; group-by is a follow-up. 
+ +## Verification checklist + +- [ ] `make test` passes in rayforce2 with all `/datalog/*` tests green +- [ ] `cargo test` passes in ray-exomem with no procedural derivations remaining +- [ ] Live daemon returns correct `water-band` / `step-band` / `recommended-water-ml` / `recommended-steps-per-day` for default profile (weight=75, height=175, age=30) → `medium` / `medium` / `2500` / `9000` +- [ ] Boundary smoke test: weight=85 → `large` band → `3000` ml; age=29 → `high` step band → `10000` steps +- [ ] `grep -r "native_derived_relations" src/` returns no hits in ray-exomem From 875e1029c76ce3454433950f01235b7ab8fb9629 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:03:23 +0300 Subject: [PATCH 03/51] feat(datalog): declare DL_AGG body literal + aggregate operators Made-with: Cursor --- src/ops/datalog.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/ops/datalog.h b/src/ops/datalog.h index c6012ce1..8a4499f7 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -42,6 +42,7 @@ #define DL_ASSIGN 3 /* assignment: X = expr */ #define DL_BUILTIN 4 /* builtin predicate */ #define DL_INTERVAL 5 /* interval bind: F @[S, E] */ +#define DL_AGG 6 /* aggregate: (count ?N pred), (sum ?S ?expr pred), ... 
*/ /* ===== Comparison operators (for DL_CMP) ===== */ #define DL_CMP_EQ 0 @@ -51,6 +52,13 @@ #define DL_CMP_GT 4 #define DL_CMP_GE 5 +/* ===== Aggregate operators (for DL_AGG) ===== */ +#define DL_AGG_COUNT 0 +#define DL_AGG_SUM 1 +#define DL_AGG_MIN 2 +#define DL_AGG_MAX 3 +#define DL_AGG_AVG 4 + /* ===== Assignment operators (for DL_ASSIGN) ===== */ #define DL_OP_EQ 0 /* simple assignment: X = expr */ @@ -115,6 +123,11 @@ typedef struct { int interval_fact_var; /* fact variable index (for DL_INTERVAL) */ int interval_start_var; /* start variable index (for DL_INTERVAL) */ int interval_end_var; /* end variable index (for DL_INTERVAL) */ + int agg_op; /* aggregate operator (for DL_AGG) */ + int agg_target_var; /* variable that receives the aggregate result */ + char agg_pred[64]; /* predicate name being aggregated over */ + int agg_arity; /* arity of agg_pred */ + int agg_value_col; /* column index inside agg_pred to aggregate (sum/min/max/avg) */ } dl_body_t; /* ===== Datalog rule: head :- body ===== */ @@ -251,6 +264,16 @@ int dl_rule_add_cmp_expr(dl_rule_t* rule, int cmp_op, dl_expr_t* lhs, dl_expr_t* * position into start_var and end_var. Returns body literal index. */ int dl_rule_add_interval(dl_rule_t* rule, int fact_var, int start_var, int end_var); +/* Add an aggregate body literal: (op ?target pred col) + * - op: DL_AGG_COUNT (col is ignored), DL_AGG_SUM/MIN/MAX/AVG + * - target_var: variable that receives the aggregate result + * - pred: predicate to aggregate over + * - pred_arity: arity of that predicate + * - value_col: which column to aggregate (ignored for COUNT) + * Returns body literal index. 
*/ +int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var, + const char* pred, int pred_arity, int value_col); + /* ===== Expression tree builders ===== */ /* Create a constant expression */ From 6a5dc03618f90451631ab309f9cd393a8aa14874 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:06:52 +0300 Subject: [PATCH 04/51] feat(datalog): dl_rule_add_agg builder for aggregate literals Made-with: Cursor --- src/ops/datalog.c | 15 +++++++++++++++ test/test_datalog.c | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index d50547f2..3002fde3 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -375,6 +375,21 @@ int dl_rule_add_interval(dl_rule_t* rule, int fact_var, int start_var, int end_v return idx; } +int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var, + const char* pred, int pred_arity, int value_col) { + if (rule->n_body >= DL_MAX_BODY) return -1; + int idx = rule->n_body++; + dl_body_t* b = &rule->body[idx]; + memset(b, 0, sizeof(*b)); + b->type = DL_AGG; + b->agg_op = op; + b->agg_target_var = target_var; + snprintf(b->agg_pred, sizeof(b->agg_pred), "%s", pred); + b->agg_arity = pred_arity; + b->agg_value_col = value_col; + return idx; +} + /* ======================================================================== * Stratification — topological sort on negation dependency graph * ======================================================================== */ diff --git a/test/test_datalog.c b/test/test_datalog.c index d979c2d1..d72e123a 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -304,11 +304,30 @@ static MunitResult test_arith_assignment(const void* params, void* fixture) { return MUNIT_OK; } +/* Verify dl_rule_add_agg populates body fields correctly. 
*/ +static MunitResult test_agg_builder(const void* params, void* fixture) { + (void)params; (void)fixture; + dl_rule_t rule; + dl_rule_init(&rule, "stats", 1); + dl_rule_head_var(&rule, 0, 0); + + int idx = dl_rule_add_agg(&rule, DL_AGG_COUNT, 0, "weight", 1, 0); + munit_assert_int(idx, ==, 0); + munit_assert_int(rule.body[0].type, ==, DL_AGG); + munit_assert_int(rule.body[0].agg_op, ==, DL_AGG_COUNT); + munit_assert_int(rule.body[0].agg_target_var, ==, 0); + munit_assert_string_equal(rule.body[0].agg_pred, "weight"); + munit_assert_int(rule.body[0].agg_arity, ==, 1); + munit_assert_int(rule.body[0].agg_value_col, ==, 0); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, { "/cmp_const_filter", test_cmp_const_filter, datalog_setup, datalog_teardown, 0, NULL }, { "/arith_assignment", test_arith_assignment, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_builder", test_agg_builder, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 6a56026cfbd6c802914faa7e82ce39c8327fe48f Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:09:53 +0300 Subject: [PATCH 05/51] fix(datalog): dl_rule_add_agg bumps rule->n_vars to match sibling builders --- src/ops/datalog.c | 1 + test/test_datalog.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 3002fde3..c811764e 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -387,6 +387,7 @@ int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var, snprintf(b->agg_pred, sizeof(b->agg_pred), "%s", pred); b->agg_arity = pred_arity; b->agg_value_col = value_col; + if (target_var + 1 > rule->n_vars) rule->n_vars = target_var + 1; return idx; } diff --git a/test/test_datalog.c b/test/test_datalog.c index 
d72e123a..25dc0856 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -319,6 +319,18 @@ static MunitResult test_agg_builder(const void* params, void* fixture) { munit_assert_string_equal(rule.body[0].agg_pred, "weight"); munit_assert_int(rule.body[0].agg_arity, ==, 1); munit_assert_int(rule.body[0].agg_value_col, ==, 0); + munit_assert_int(rule.n_vars, ==, 1); + + dl_rule_t rule2; + dl_rule_init(&rule2, "sum_stats", 1); + dl_rule_head_var(&rule2, 0, 3); + int idx2 = dl_rule_add_agg(&rule2, DL_AGG_SUM, 3, "readings", 4, 2); + munit_assert_int(idx2, ==, 0); + munit_assert_int(rule2.body[0].agg_op, ==, DL_AGG_SUM); + munit_assert_int(rule2.body[0].agg_target_var, ==, 3); + munit_assert_int(rule2.body[0].agg_arity, ==, 4); + munit_assert_int(rule2.body[0].agg_value_col, ==, 2); + munit_assert_int(rule2.n_vars, ==, 4); return MUNIT_OK; } From d37388dcfe75f457f882e619af5eb5504c163023 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:11:41 +0300 Subject: [PATCH 06/51] feat(datalog): aggregates participate in stratification (non-monotonic) --- src/ops/datalog.c | 8 ++++++++ test/test_datalog.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index c811764e..58cc4560 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -412,6 +412,14 @@ int dl_stratify(dl_program_t* prog) { for (int b = 0; b < rule->n_body; b++) { dl_body_t* body = &rule->body[b]; + if (body->type == DL_AGG) { + /* Aggregates are non-monotonic: head must live in a higher + * stratum than the predicate being aggregated. 
*/ + int body_idx = dl_find_rel(prog, body->agg_pred); + if (body_idx < 0) continue; + dep[head_idx][body_idx] = 2; /* negative (non-monotonic) dep */ + continue; + } if (body->type != DL_POS && body->type != DL_NEG) continue; int body_idx = dl_find_rel(prog, body->pred); if (body_idx < 0) continue; diff --git a/test/test_datalog.c b/test/test_datalog.c index 25dc0856..9e46466d 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -334,12 +334,53 @@ static MunitResult test_agg_builder(const void* params, void* fixture) { return MUNIT_OK; } +/* Aggregates over an IDB must be evaluated in a strictly higher stratum + * than the IDB itself. Program: + * EDB: edge(1,2), edge(2,3) + * Rule R0: path(X,Y) :- edge(X,Y) + * Rule R1: path_count(N) :- (count ?N path) + * After stratification: R1.stratum > R0.stratum. */ +static MunitResult test_agg_stratifies_above_source(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t s_vals[] = {1, 2}; + int64_t d_vals[] = {2, 3}; + ray_t* sc = ray_vec_from_raw(RAY_I64, s_vals, 2); + ray_t* dc = ray_vec_from_raw(RAY_I64, d_vals, 2); + ray_t* edge = ray_table_new(2); + edge = ray_table_add_col(edge, ray_sym_intern("edge__c0", 8), sc); + edge = ray_table_add_col(edge, ray_sym_intern("edge__c1", 8), dc); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "edge", edge, 2); + + dl_rule_t r0; dl_rule_init(&r0, "path", 2); + dl_rule_head_var(&r0, 0, 0); dl_rule_head_var(&r0, 1, 1); + int b = dl_rule_add_atom(&r0, "edge", 2); + dl_body_set_var(&r0, b, 0, 0); dl_body_set_var(&r0, b, 1, 1); + r0.n_vars = 2; + dl_add_rule(prog, &r0); + + dl_rule_t r1; dl_rule_init(&r1, "path_count", 1); + dl_rule_head_var(&r1, 0, 0); + dl_rule_add_agg(&r1, DL_AGG_COUNT, 0, "path", 2, 0); + r1.n_vars = 1; + dl_add_rule(prog, &r1); + + munit_assert_int(dl_stratify(prog), ==, 0); + munit_assert_int(prog->rules[1].stratum, >, prog->rules[0].stratum); + + dl_program_free(prog); + ray_release(edge); ray_release(sc); 
ray_release(dc); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, { "/cmp_const_filter", test_cmp_const_filter, datalog_setup, datalog_teardown, 0, NULL }, { "/arith_assignment", test_arith_assignment, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_builder", test_agg_builder, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_stratifies_above_source", test_agg_stratifies_above_source, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 279a6d39c33c3a9c051aa63b2fcb026621ea474c Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:16:46 +0300 Subject: [PATCH 07/51] feat(datalog): evaluate count/sum/min/max/avg aggregates Made-with: Cursor --- src/ops/datalog.c | 107 ++++++++++++++++++++++++++++++++ test/test_datalog.c | 146 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 58cc4560..d3d0a31f 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -929,6 +929,31 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } } + /* Rules with only aggregates (no positive body atoms) still need a + * one-row binding environment so aggregate results can be projected. 
*/ + if (!accum) { + bool has_agg = false; + for (int bi = 0; bi < rule->n_body; bi++) { + if (rule->body[bi].type == DL_AGG) { + has_agg = true; + break; + } + } + if (!has_agg) + return NULL; + ray_t* one_val = ray_vec_new(RAY_I64, 1); + if (!one_val || RAY_IS_ERR(one_val)) + return NULL; + one_val->len = 1; + ((int64_t*)ray_data(one_val))[0] = 0; + accum = ray_table_new(1); + int64_t unit_sym = ray_sym_intern("_unit", 5); + accum = ray_table_add_col(accum, unit_sym, one_val); + ray_release(one_val); + if (!accum || RAY_IS_ERR(accum)) + return NULL; + } + if (!accum) return NULL; /* Process non-join body literals in declared order. @@ -995,6 +1020,88 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, break; } + case DL_AGG: { + int src_idx = dl_find_rel(prog, body->agg_pred); + if (src_idx < 0) { + ray_release(accum); + return NULL; + } + ray_t* src_table = prog->rels[src_idx].table; + int64_t src_nrows = (src_table && !RAY_IS_ERR(src_table)) + ? ray_table_nrows(src_table) + : 0; + + int64_t result = 0; + switch (body->agg_op) { + case DL_AGG_COUNT: + result = src_nrows; + break; + case DL_AGG_SUM: + case DL_AGG_MIN: + case DL_AGG_MAX: + case DL_AGG_AVG: + if (src_nrows <= 0) { + result = 0; + } else { + ray_t* val_col = + ray_table_get_col_idx(src_table, body->agg_value_col); + if (!val_col) { + result = 0; + } else { + int64_t* vd = (int64_t*)ray_data(val_col); + if (body->agg_op == DL_AGG_SUM) { + result = 0; + for (int64_t i = 0; i < src_nrows; i++) + result += vd[i]; + } else if (body->agg_op == DL_AGG_MIN) { + result = vd[0]; + for (int64_t i = 1; i < src_nrows; i++) { + if (vd[i] < result) + result = vd[i]; + } + } else if (body->agg_op == DL_AGG_MAX) { + result = vd[0]; + for (int64_t i = 1; i < src_nrows; i++) { + if (vd[i] > result) + result = vd[i]; + } + } else { /* DL_AGG_AVG */ + int64_t acc = 0; + for (int64_t i = 0; i < src_nrows; i++) + acc += vd[i]; + result = acc / src_nrows; + } + } + } + break; + default: + break; + } + + 
int64_t nrows = ray_table_nrows(accum); + if (nrows == 0) + break; + ray_t* new_col = ray_vec_new(RAY_I64, nrows); + if (!new_col || RAY_IS_ERR(new_col)) + break; + new_col->len = nrows; + int64_t* nd = (int64_t*)ray_data(new_col); + for (int64_t r = 0; r < nrows; r++) + nd[r] = result; + + int new_col_idx = (int)ray_table_ncols(accum); + char colname[32]; + snprintf(colname, sizeof(colname), "_g%d", body->agg_target_var); + ray_t* new_accum = dl_table_add_computed_col(accum, new_col, colname); + ray_release(new_col); + ray_release(accum); + accum = new_accum; + + var_bound[body->agg_target_var] = true; + var_col[body->agg_target_var] = new_col_idx; + break; + } + case DL_BUILTIN: { switch (body->builtin_id) { case DL_BUILTIN_BEFORE: { diff --git a/test/test_datalog.c b/test/test_datalog.c index 9e46466d..24641691 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -374,6 +374,147 @@ static MunitResult test_agg_stratifies_above_source(const void* params, void* fi return MUNIT_OK; } +/* (count ?N weight) where weight has 4 rows -> N = 4. 
*/ +static MunitResult test_agg_count_edb(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t vals[] = {50, 60, 75, 85}; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 4); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wcount", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_COUNT, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wcount"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 4); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} + +static ray_t* make_weight_edb(void) { + int64_t vals[] = {50, 60, 75, 85}; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 4); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); + return weight; +} + +static MunitResult test_agg_sum(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* weight = make_weight_edb(); + ray_t* col = ray_table_get_col_idx(weight, 0); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wsum", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_SUM, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wsum"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 270); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + 
return MUNIT_OK; +} + +static MunitResult test_agg_min(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* weight = make_weight_edb(); + ray_t* col = ray_table_get_col_idx(weight, 0); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wmin", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_MIN, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wmin"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 50); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} + +static MunitResult test_agg_max(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* weight = make_weight_edb(); + ray_t* col = ray_table_get_col_idx(weight, 0); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wmax", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_MAX, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wmax"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 85); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} + +static MunitResult test_agg_avg(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* weight = make_weight_edb(); + ray_t* col = ray_table_get_col_idx(weight, 0); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wavg", 1); + dl_rule_head_var(&r, 0, 0); + 
dl_rule_add_agg(&r, DL_AGG_AVG, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wavg"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 67); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -381,6 +522,11 @@ static MunitTest datalog_tests[] = { { "/arith_assignment", test_arith_assignment, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_builder", test_agg_builder, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_stratifies_above_source", test_agg_stratifies_above_source, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_count_edb", test_agg_count_edb, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_sum", test_agg_sum, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_min", test_agg_min, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_max", test_agg_max, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_avg", test_agg_avg, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From d211a5b00a19aeac3d0229b7ed37f5ddc94fa9e6 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:22:23 +0300 Subject: [PATCH 08/51] fix(datalog): guard non-i64 columns in aggregate fold; fix test refcounts Concern 1: DL_AGG SUM/MIN/MAX/AVG cast the value column's raw data to int64_t* without checking the column type. A future caller that passes a RAY_F64 / RAY_SYM column would silently misinterpret bytes. 
Tighten the existing null-check to also require val_col->type == RAY_I64, preserving the "empty/missing -> 0" contract. Concern 2: test_agg_sum/min/max/avg were obtaining the value column via ray_table_get_col_idx (which returns a BORROWED reference) and then calling ray_release on it. This balanced out only because the helper make_weight_edb leaked its own ray_vec_from_raw ref -- the stray release cancelled the leak by accident. Fix by: - make_weight_edb now releases its local col ref after ray_table_add_col retains it (mirroring test_agg_count_edb). - The four aggregate tests drop the bogus borrowed-ref release. Verified with `make test` (663/663 passing under -fsanitize=address,undefined). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ops/datalog.c | 2 +- test/test_datalog.c | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index d3d0a31f..3eb99fe2 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1045,7 +1045,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } else { ray_t* val_col = ray_table_get_col_idx(src_table, body->agg_value_col); - if (!val_col) { + if (!val_col || val_col->type != RAY_I64) { result = 0; } else { int64_t* vd = (int64_t*)ray_data(val_col); diff --git a/test/test_datalog.c b/test/test_datalog.c index 24641691..c83cbb9a 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -408,13 +408,13 @@ static ray_t* make_weight_edb(void) { ray_t* col = ray_vec_from_raw(RAY_I64, vals, 4); ray_t* weight = ray_table_new(1); weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); + ray_release(col); return weight; } static MunitResult test_agg_sum(const void* params, void* fixture) { (void)params; (void)fixture; ray_t* weight = make_weight_edb(); - ray_t* col = ray_table_get_col_idx(weight, 0); dl_program_t* prog = dl_program_new(); dl_add_edb(prog, "weight", weight, 1); @@ -433,14 +433,13 @@ static MunitResult 
test_agg_sum(const void* params, void* fixture) { munit_assert_int((int)od[0], ==, 270); dl_program_free(prog); - ray_release(weight); ray_release(col); + ray_release(weight); return MUNIT_OK; } static MunitResult test_agg_min(const void* params, void* fixture) { (void)params; (void)fixture; ray_t* weight = make_weight_edb(); - ray_t* col = ray_table_get_col_idx(weight, 0); dl_program_t* prog = dl_program_new(); dl_add_edb(prog, "weight", weight, 1); @@ -459,14 +458,13 @@ static MunitResult test_agg_min(const void* params, void* fixture) { munit_assert_int((int)od[0], ==, 50); dl_program_free(prog); - ray_release(weight); ray_release(col); + ray_release(weight); return MUNIT_OK; } static MunitResult test_agg_max(const void* params, void* fixture) { (void)params; (void)fixture; ray_t* weight = make_weight_edb(); - ray_t* col = ray_table_get_col_idx(weight, 0); dl_program_t* prog = dl_program_new(); dl_add_edb(prog, "weight", weight, 1); @@ -485,14 +483,13 @@ static MunitResult test_agg_max(const void* params, void* fixture) { munit_assert_int((int)od[0], ==, 85); dl_program_free(prog); - ray_release(weight); ray_release(col); + ray_release(weight); return MUNIT_OK; } static MunitResult test_agg_avg(const void* params, void* fixture) { (void)params; (void)fixture; ray_t* weight = make_weight_edb(); - ray_t* col = ray_table_get_col_idx(weight, 0); dl_program_t* prog = dl_program_new(); dl_add_edb(prog, "weight", weight, 1); @@ -511,7 +508,7 @@ static MunitResult test_agg_avg(const void* params, void* fixture) { munit_assert_int((int)od[0], ==, 67); dl_program_free(prog); - ray_release(weight); ray_release(col); + ray_release(weight); return MUNIT_OK; } From c78a34be0d98f15d9511710fdf5a3a89afdeadee Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:45:57 +0300 Subject: [PATCH 09/51] feat(datalog): MIN/MAX/AVG over empty source emit no row (match core semantics) Made-with: Cursor --- src/ops/datalog.c | 9 +++++++ test/test_datalog.c | 58 
+++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 3eb99fe2..a06c06e7 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1031,6 +1031,15 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, ? ray_table_nrows(src_table) : 0; + if (src_nrows == 0 && (body->agg_op == DL_AGG_MIN + || body->agg_op == DL_AGG_MAX + || body->agg_op == DL_AGG_AVG)) { + /* Empty-source: MIN/MAX/AVG emit no row (matches rayforce core's domain + * error / typed-null semantics). COUNT and SUM keep their identities (0). */ + ray_release(accum); + return NULL; + } + int64_t result = 0; switch (body->agg_op) { case DL_AGG_COUNT: diff --git a/test/test_datalog.c b/test/test_datalog.c index c83cbb9a..321d58c8 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -512,6 +512,62 @@ static MunitResult test_agg_avg(const void* params, void* fixture) { return MUNIT_OK; } +/* MIN over empty source -> rule produces no row. */ +static MunitResult test_agg_min_empty(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t dummy = 0; + ray_t* empty_vec = ray_vec_from_raw(RAY_I64, &dummy, 0); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), empty_vec); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wmin", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_MIN, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wmin"); + /* Either NULL (no rel) or a 0-row table is acceptable "no row" semantics. */ + if (out) munit_assert_int((int)ray_table_nrows(out), ==, 0); + + dl_program_free(prog); + ray_release(weight); ray_release(empty_vec); + return MUNIT_OK; +} + +/* COUNT over empty source -> 1 row with value 0 (well-defined). 
*/ +static MunitResult test_agg_count_empty(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t dummy = 0; + ray_t* empty_vec = ray_vec_from_raw(RAY_I64, &dummy, 0); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), empty_vec); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wcnt", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_COUNT, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wcnt"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 0); + + dl_program_free(prog); + ray_release(weight); ray_release(empty_vec); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -524,6 +580,8 @@ static MunitTest datalog_tests[] = { { "/agg_min", test_agg_min, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_max", test_agg_max, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_avg", test_agg_avg, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_min_empty", test_agg_min_empty, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_count_empty", test_agg_count_empty, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From faaee1f55cfdd5654f835446ad65b87de4d84500 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:54:12 +0300 Subject: [PATCH 10/51] feat(datalog): grouped aggregation via ray_group Extend DL_AGG to support (op ?result pred value_col by ?key_var key_col ...) by delegating to rayforce core's ray_group. 
Adds dl_rule_agg_set_group builder helper and DL_AGG_MAX_KEYS=4; scalar path is unchanged when no group keys are attached. Grouped branch builds a sub-graph that scans the source relation's key and value columns by their interned names, runs ray_group + ray_execute, then rebinds var_col so head projection reads keys from columns 0..nk-1 and the aggregate from column nk. Covers COUNT and SUM in tests; AVG grouped is deferred until A6 promotes the scalar output to f64 so both paths agree. Task A4.6 of the Datalog Aggregates plan. --- src/ops/datalog.c | 96 +++++++++++++++++++++++++++++++++++++++++++++ src/ops/datalog.h | 12 ++++++ test/test_datalog.c | 92 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 200 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index a06c06e7..edab5423 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -387,10 +387,27 @@ int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var, snprintf(b->agg_pred, sizeof(b->agg_pred), "%s", pred); b->agg_arity = pred_arity; b->agg_value_col = value_col; + b->agg_n_group_keys = 0; if (target_var + 1 > rule->n_vars) rule->n_vars = target_var + 1; return idx; } +int dl_rule_agg_set_group(dl_rule_t* rule, int body_idx, + const int* key_vars, const int* key_cols, int n_keys) { + if (!rule || body_idx < 0 || body_idx >= rule->n_body) return -1; + if (n_keys < 0 || n_keys > DL_AGG_MAX_KEYS) return -1; + dl_body_t* b = &rule->body[body_idx]; + if (b->type != DL_AGG) return -1; + b->agg_n_group_keys = n_keys; + for (int i = 0; i < n_keys; i++) { + b->agg_group_key_vars[i] = key_vars[i]; + b->agg_group_key_cols[i] = key_cols[i]; + if (key_vars[i] + 1 > rule->n_vars) + rule->n_vars = key_vars[i] + 1; + } + return 0; +} + /* ======================================================================== * Stratification — topological sort on negation dependency graph * ======================================================================== */ @@ -1021,6 +1038,85 @@ ray_op_t* 
dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } case DL_AGG: { + if (body->agg_n_group_keys > 0) { + /* Grouped aggregation: use rayforce's ray_group on src_table. */ + int src_idx = dl_find_rel(prog, body->agg_pred); + if (src_idx < 0) { ray_release(accum); return NULL; } + ray_t* src_table = prog->rels[src_idx].table; + int64_t src_nrows = (src_table && !RAY_IS_ERR(src_table)) + ? ray_table_nrows(src_table) : 0; + if (src_nrows == 0) { + /* No source rows -> no groups -> rule produces no head tuples. */ + ray_release(accum); + return NULL; + } + + dl_rel_t* src_rel = &prog->rels[src_idx]; + int nk = body->agg_n_group_keys; + + /* Build a sub-graph that SCANs src_table's columns by symbol name. */ + ray_retain(src_table); + ray_graph_t* gg = ray_graph_new(src_table); + if (!gg) { ray_release(src_table); ray_release(accum); return NULL; } + + ray_op_t* keys_ops[DL_AGG_MAX_KEYS]; + for (int i = 0; i < nk; i++) { + int64_t sym = src_rel->col_names[body->agg_group_key_cols[i]]; + ray_t* s = ray_sym_str(sym); + keys_ops[i] = ray_scan(gg, ray_str_ptr(s)); + } + + /* Agg input: value column (for COUNT we still pass a column; any + * column works since COUNT only counts rows). 
*/ + int value_col = body->agg_value_col; + if (value_col < 0 || value_col >= src_rel->arity) value_col = 0; + ray_t* vs = ray_sym_str(src_rel->col_names[value_col]); + ray_op_t* agg_in = ray_scan(gg, ray_str_ptr(vs)); + + uint16_t op_code; + switch (body->agg_op) { + case DL_AGG_COUNT: op_code = OP_COUNT; break; + case DL_AGG_SUM: op_code = OP_SUM; break; + case DL_AGG_MIN: op_code = OP_MIN; break; + case DL_AGG_MAX: op_code = OP_MAX; break; + case DL_AGG_AVG: op_code = OP_AVG; break; + default: + ray_graph_free(gg); ray_release(src_table); + ray_release(accum); return NULL; + } + + ray_op_t* ag_ins[1] = { agg_in }; + ray_op_t* root = ray_group(gg, keys_ops, (uint8_t)nk, &op_code, ag_ins, 1); + ray_t* group_tbl = ray_execute(gg, root); + ray_graph_free(gg); + ray_release(src_table); + + if (!group_tbl || RAY_IS_ERR(group_tbl)) { + if (group_tbl) ray_release(group_tbl); + ray_release(accum); + return NULL; + } + + /* Replace accum with group_tbl (schema: key0..key{nk-1}, agg). + * This is valid because the DL_AGG case for aggregate-only rules + * created a singleton _unit accum that we can discard. Mixed + * rules (body atoms + grouped agg) are not supported here; they + * would require a join on shared vars and fall under A5/later. 
*/ + ray_release(accum); + accum = group_tbl; + + /* Bind key variables to the key columns in the group output */ + for (int i = 0; i < nk; i++) { + int kv = body->agg_group_key_vars[i]; + var_bound[kv] = true; + var_col[kv] = i; + } + /* Bind target variable to the aggregate column (last column) */ + var_bound[body->agg_target_var] = true; + var_col[body->agg_target_var] = nk; /* agg column immediately follows keys */ + break; + } + /* -------- existing scalar path below unchanged -------- */ int src_idx = dl_find_rel(prog, body->agg_pred); if (src_idx < 0) { ray_release(accum); diff --git a/src/ops/datalog.h b/src/ops/datalog.h index 8a4499f7..339edc1a 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -59,6 +59,8 @@ #define DL_AGG_MAX 3 #define DL_AGG_AVG 4 +#define DL_AGG_MAX_KEYS 4 + /* ===== Assignment operators (for DL_ASSIGN) ===== */ #define DL_OP_EQ 0 /* simple assignment: X = expr */ @@ -128,6 +130,9 @@ typedef struct { char agg_pred[64]; /* predicate name being aggregated over */ int agg_arity; /* arity of agg_pred */ int agg_value_col; /* column index inside agg_pred to aggregate (sum/min/max/avg) */ + int agg_n_group_keys; /* 0 = scalar; >0 = grouped */ + int agg_group_key_vars[DL_AGG_MAX_KEYS]; + int agg_group_key_cols[DL_AGG_MAX_KEYS]; } dl_body_t; /* ===== Datalog rule: head :- body ===== */ @@ -274,6 +279,13 @@ int dl_rule_add_interval(dl_rule_t* rule, int fact_var, int start_var, int end_v int dl_rule_add_agg(dl_rule_t* rule, int op, int target_var, const char* pred, int pred_arity, int value_col); +/* Attach group-by keys to an aggregate body literal previously added via + * dl_rule_add_agg. body_idx is that builder's return value. + * key_vars and key_cols have n_keys entries (<= DL_AGG_MAX_KEYS). + * Returns 0 on success, -1 if n_keys is out of range. 
*/ +int dl_rule_agg_set_group(dl_rule_t* rule, int body_idx, + const int* key_vars, const int* key_cols, int n_keys); + /* ===== Expression tree builders ===== */ /* Create a constant expression */ diff --git a/test/test_datalog.c b/test/test_datalog.c index 321d58c8..5291b3ef 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -568,6 +568,96 @@ static MunitResult test_agg_count_empty(const void* params, void* fixture) { return MUNIT_OK; } +/* weight_by_user(user_id, kg): (1,50), (1,60), (2,75), (2,85) + * Rule: user_count(?u, ?n) :- count(?n, weight_by_user) by (?u, col 0) + * Expected: (1,2), (2,2) */ +static MunitResult test_agg_count_grouped(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t users[] = {1, 1, 2, 2}; + int64_t weights[] = {50, 60, 75, 85}; + ray_t* u_col = ray_vec_from_raw(RAY_I64, users, 4); + ray_t* w_col = ray_vec_from_raw(RAY_I64, weights, 4); + ray_t* tbl = ray_table_new(2); + tbl = ray_table_add_col(tbl, ray_sym_intern("weight_by_user__c0", 18), u_col); + tbl = ray_table_add_col(tbl, ray_sym_intern("weight_by_user__c1", 18), w_col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight_by_user", tbl, 2); + + dl_rule_t r; dl_rule_init(&r, "user_count", 2); + dl_rule_head_var(&r, 0, 0); /* u */ + dl_rule_head_var(&r, 1, 1); /* n */ + int idx = dl_rule_add_agg(&r, DL_AGG_COUNT, 1, "weight_by_user", 2, 0); + int key_vars[] = { 0 }; + int key_cols[] = { 0 }; + munit_assert_int(dl_rule_agg_set_group(&r, idx, key_vars, key_cols, 1), ==, 0); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "user_count"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + + /* Rows may appear in any order; check both keys present with count == 2. 
*/ + int64_t* uo = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + int64_t* no = (int64_t*)ray_data(ray_table_get_col_idx(out, 1)); + int seen_u1 = 0, seen_u2 = 0; + for (int i = 0; i < 2; i++) { + if (uo[i] == 1) { munit_assert_int((int)no[i], ==, 2); seen_u1 = 1; } + else if (uo[i] == 2) { munit_assert_int((int)no[i], ==, 2); seen_u2 = 1; } + } + munit_assert_int(seen_u1 && seen_u2, ==, 1); + + dl_program_free(prog); + ray_release(tbl); ray_release(u_col); ray_release(w_col); + return MUNIT_OK; +} + +/* Rule: user_sum(?u, ?s) :- sum(?s, weight_by_user col 1) by (?u, col 0) + * Expected: (1, 110), (2, 160) */ +static MunitResult test_agg_sum_grouped(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t users[] = {1, 1, 2, 2}; + int64_t weights[] = {50, 60, 75, 85}; + ray_t* u_col = ray_vec_from_raw(RAY_I64, users, 4); + ray_t* w_col = ray_vec_from_raw(RAY_I64, weights, 4); + ray_t* tbl = ray_table_new(2); + tbl = ray_table_add_col(tbl, ray_sym_intern("weight_by_user__c0", 18), u_col); + tbl = ray_table_add_col(tbl, ray_sym_intern("weight_by_user__c1", 18), w_col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight_by_user", tbl, 2); + + dl_rule_t r; dl_rule_init(&r, "user_sum", 2); + dl_rule_head_var(&r, 0, 0); /* u */ + dl_rule_head_var(&r, 1, 1); /* s */ + int idx = dl_rule_add_agg(&r, DL_AGG_SUM, 1, "weight_by_user", 2, 1); + int key_vars[] = { 0 }; + int key_cols[] = { 0 }; + munit_assert_int(dl_rule_agg_set_group(&r, idx, key_vars, key_cols, 1), ==, 0); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "user_sum"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + + int64_t* uo = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + int64_t* so = (int64_t*)ray_data(ray_table_get_col_idx(out, 1)); + int seen_u1 = 0, seen_u2 = 0; + for (int i = 0; i < 2; i++) { + if (uo[i] == 1) { munit_assert_int((int)so[i], 
==, 110); seen_u1 = 1; } + else if (uo[i] == 2) { munit_assert_int((int)so[i], ==, 160); seen_u2 = 1; } + } + munit_assert_int(seen_u1 && seen_u2, ==, 1); + + dl_program_free(prog); + ray_release(tbl); ray_release(u_col); ray_release(w_col); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -582,6 +672,8 @@ static MunitTest datalog_tests[] = { { "/agg_avg", test_agg_avg, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_min_empty", test_agg_min_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_count_empty", test_agg_count_empty, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_count_grouped", test_agg_count_grouped, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_sum_grouped", test_agg_sum_grouped, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From e5c156ae34f4bbb68831064fcdb74c85ce8e13a0 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 18:59:56 +0300 Subject: [PATCH 11/51] =?UTF-8?q?refactor(datalog):=20A4.6=20polish=20?= =?UTF-8?q?=E2=80=94=20DL=5FAGG=5FMAX=5FKEYS=3D8,=20drop=20redundant=20ref?= =?UTF-8?q?s,=20guard=20mixed=20rules?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/ops/datalog.c | 27 +++++++++++++++++++++------ src/ops/datalog.h | 2 +- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index edab5423..76c5f20a 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1039,7 +1039,23 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, case DL_AGG: { if (body->agg_n_group_keys > 0) { - /* Grouped aggregation: use rayforce's ray_group on src_table. */ + /* Grouped aggregation: use rayforce's ray_group on src_table. 
+ * + * Mixed-rule guard: this path assumes accum is the singleton + * _unit placeholder created for aggregate-only rules. If the + * rule has real positive body atoms, accum carries bound + * variables from a prior join that we would need to intersect + * against the group result — not yet supported. Bail early. */ + bool has_pos = false; + for (int bi = 0; bi < rule->n_body; bi++) { + if (rule->body[bi].type == DL_POS) { has_pos = true; break; } + } + if (has_pos) { + fprintf(stderr, "dl: grouped aggregate with positive body atoms not yet supported\n"); + ray_release(accum); + return NULL; + } + int src_idx = dl_find_rel(prog, body->agg_pred); if (src_idx < 0) { ray_release(accum); return NULL; } ray_t* src_table = prog->rels[src_idx].table; @@ -1054,10 +1070,10 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, dl_rel_t* src_rel = &prog->rels[src_idx]; int nk = body->agg_n_group_keys; - /* Build a sub-graph that SCANs src_table's columns by symbol name. */ - ray_retain(src_table); + /* Build a sub-graph that SCANs src_table's columns by symbol name. + * ray_graph_new retains src_table internally; no extra retain needed. 
*/ ray_graph_t* gg = ray_graph_new(src_table); - if (!gg) { ray_release(src_table); ray_release(accum); return NULL; } + if (!gg) { ray_release(accum); return NULL; } ray_op_t* keys_ops[DL_AGG_MAX_KEYS]; for (int i = 0; i < nk; i++) { @@ -1081,7 +1097,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, case DL_AGG_MAX: op_code = OP_MAX; break; case DL_AGG_AVG: op_code = OP_AVG; break; default: - ray_graph_free(gg); ray_release(src_table); + ray_graph_free(gg); ray_release(accum); return NULL; } @@ -1089,7 +1105,6 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, ray_op_t* root = ray_group(gg, keys_ops, (uint8_t)nk, &op_code, ag_ins, 1); ray_t* group_tbl = ray_execute(gg, root); ray_graph_free(gg); - ray_release(src_table); if (!group_tbl || RAY_IS_ERR(group_tbl)) { if (group_tbl) ray_release(group_tbl); diff --git a/src/ops/datalog.h b/src/ops/datalog.h index 339edc1a..93ae8b6d 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -59,7 +59,7 @@ #define DL_AGG_MAX 3 #define DL_AGG_AVG 4 -#define DL_AGG_MAX_KEYS 4 +#define DL_AGG_MAX_KEYS 8 /* ===== Assignment operators (for DL_ASSIGN) ===== */ #define DL_OP_EQ 0 /* simple assignment: X = expr */ From 0c3652d9a4eda6fdf5e73d31e5fa5ba33a0133d4 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 19:04:43 +0300 Subject: [PATCH 12/51] feat(datalog): surface syntax (count|sum|min|max|avg ?v pred [col] [by ?k col ...]) Made-with: Cursor --- src/ops/datalog.c | 128 +++++++++++++++++++++++++++++++++++++++++--- src/ops/datalog.h | 3 ++ test/test_datalog.c | 120 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+), 8 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 76c5f20a..27b78f70 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -2413,6 +2413,12 @@ static int is_dl_var(ray_t* x) { static dl_rule_t g_dl_rules[DL_MAX_RULES]; static int g_dl_n_rules = 0; +void dl_append_global_rules(dl_program_t* prog) { + if (!prog) 
return; + for (int i = 0; i < g_dl_n_rules; i++) + dl_add_rule(prog, &g_dl_rules[i]); +} + /* Variable name -> index map for parsing a single rule or query body */ typedef struct { int64_t syms[DL_MAX_ARITY * DL_MAX_BODY]; @@ -2539,6 +2545,33 @@ static bool dl_is_assignment(ray_t* clause) { return is_dl_var(ce[1]); } +static bool dl_is_aggregate(ray_t* clause) { + if (!is_list(clause) || ray_len(clause) < 3) return false; + ray_t** ce = (ray_t**)ray_data(clause); + if (ce[0]->type != -RAY_SYM) return false; + ray_t* name = ray_sym_str(ce[0]->i64); + if (!name) return false; + const char* n = ray_str_ptr(name); + return strcmp(n, "count") == 0 || strcmp(n, "sum") == 0 + || strcmp(n, "min") == 0 || strcmp(n, "max") == 0 + || strcmp(n, "avg") == 0; +} + +static int dl_agg_op_from_name(const char* n) { + if (strcmp(n, "count") == 0) return DL_AGG_COUNT; + if (strcmp(n, "sum") == 0) return DL_AGG_SUM; + if (strcmp(n, "min") == 0) return DL_AGG_MIN; + if (strcmp(n, "max") == 0) return DL_AGG_MAX; + if (strcmp(n, "avg") == 0) return DL_AGG_AVG; + return -1; +} + +static bool dl_sym_is_name(ray_t* sym, const char* lit) { + if (!sym || sym->type != -RAY_SYM) return false; + ray_t* s = ray_sym_str(sym->i64); + return s && strcmp(ray_str_ptr(s), lit) == 0; +} + /* Resolve an AST node to a variable or constant in a body atom. * Sets the body position to either a variable or constant. * For expressions like (quote x), evaluates them first. */ @@ -2585,7 +2618,7 @@ static ray_t* dl_set_body_pos(dl_rule_t* rule, int bidx, int pos, * Handles triple patterns, negations, comparisons, assignments, * and rule invocations (positive atoms). 
*/ static ray_t* dl_parse_body_clause(dl_rule_t* rule, ray_t* clause, - dl_var_map_t* vars) { + dl_var_map_t* vars, dl_program_t* prog) { if (!is_list(clause) || ray_len(clause) < 1) return ray_error("type", "rule/query: body clause must be a list"); @@ -2648,6 +2681,85 @@ static ray_t* dl_parse_body_clause(dl_rule_t* rule, ray_t* clause, return NULL; } + /* -- Aggregate: (count ?N pred) | (sum ?S pred col) | ... [by ?k col ...] -- */ + if (dl_is_aggregate(clause)) { + ray_t* op_str = ray_sym_str(ce[0]->i64); + if (!op_str) return ray_error("type", "aggregate: bad operator"); + int op = dl_agg_op_from_name(ray_str_ptr(op_str)); + if (op < 0) return ray_error("type", "aggregate: unknown operator"); + + if (!is_dl_var(ce[1])) + return ray_error("type", "aggregate: first argument must be ?variable"); + int target_vi = dl_var_get_or_create(vars, ce[1]->i64); + if (target_vi < 0) + return ray_error("domain", "aggregate: too many variables"); + + if (ce[2]->type != -RAY_SYM) + return ray_error("type", "aggregate: predicate must be a symbol"); + ray_t* pred_sym = ray_sym_str(ce[2]->i64); + if (!pred_sym) + return ray_error("type", "aggregate: cannot resolve predicate name"); + const char* pred_name = ray_str_ptr(pred_sym); + + int pred_arity = 1; + if (prog) { + int ri = dl_find_rel(prog, pred_name); + if (ri >= 0) pred_arity = prog->rels[ri].arity; + } + + int i = 3; + bool has_value_col = false; + int value_col = 0; + int key_vars[DL_AGG_MAX_KEYS]; + int key_cols[DL_AGG_MAX_KEYS]; + int n_keys = 0; + + while (i < clen) { + if (dl_sym_is_name(ce[i], "by")) { + i++; + while (i < clen) { + if (!is_dl_var(ce[i])) + return ray_error("type", "aggregate: group key must be ?variable"); + if (n_keys >= DL_AGG_MAX_KEYS) + return ray_error("domain", "aggregate: too many group keys"); + key_vars[n_keys] = dl_var_get_or_create(vars, ce[i]->i64); + i++; + if (i >= clen || ce[i]->type != -RAY_I64) + return ray_error("type", "aggregate: group key column must be integer"); + 
key_cols[n_keys] = (int)ce[i]->i64; + i++; + n_keys++; + } + break; + } + if (ce[i]->type == -RAY_I64) { + if (has_value_col) + return ray_error("type", "aggregate: at most one value column index"); + has_value_col = true; + value_col = (int)ce[i]->i64; + i++; + continue; + } + return ray_error("type", "aggregate: unexpected token in aggregate clause"); + } + + if (op == DL_AGG_COUNT) { + if (has_value_col) + return ray_error("type", "aggregate: count does not take a value column"); + } else { + if (!has_value_col) + return ray_error("type", "aggregate: sum/min/max/avg require a value column index"); + } + + int bidx = dl_rule_add_agg(rule, op, target_vi, pred_name, pred_arity, has_value_col ? value_col : 0); + if (bidx < 0) return ray_error("domain", "rule: too many body literals"); + if (n_keys > 0) { + if (dl_rule_agg_set_group(rule, bidx, key_vars, key_cols, n_keys) != 0) + return ray_error("domain", "aggregate: cannot attach group keys"); + } + return NULL; + } + /* -- Assignment: (= ?var expr) -- */ if (dl_is_assignment(clause)) { int target_vi = dl_var_get_or_create(vars, ce[1]->i64); @@ -2725,7 +2837,7 @@ static ray_t* dl_parse_body_clause(dl_rule_t* rule, ray_t* clause, /* Parse head + body clauses into out (shared by rule and query inline rules). */ static ray_t* dl_parse_rule_from_head_and_body(dl_rule_t* out, ray_t* head, ray_t** body_args, int64_t n_body, - dl_var_map_t* vars) { + dl_var_map_t* vars, dl_program_t* prog) { if (!is_list(head) || ray_len(head) < 1) return ray_error("type", "rule: head must be (name ?var ...)"); @@ -2760,7 +2872,7 @@ static ray_t* dl_parse_rule_from_head_and_body(dl_rule_t* out, ray_t* head, } for (int64_t i = 0; i < n_body; i++) { - ray_t* err = dl_parse_body_clause(out, body_args[i], vars); + ray_t* err = dl_parse_body_clause(out, body_args[i], vars, prog); if (err) return err; } @@ -2769,7 +2881,7 @@ static ray_t* dl_parse_rule_from_head_and_body(dl_rule_t* out, ray_t* head, } /* One inline rule: ((head-name ?a ...) 
body1 body2 ...) */ -static ray_t* dl_parse_inline_rule(dl_rule_t* out, ray_t* rule_list) { +static ray_t* dl_parse_inline_rule(dl_rule_t* out, ray_t* rule_list, dl_program_t* prog) { if (!is_list(rule_list) || ray_len(rule_list) < 1) return ray_error("type", "query: each (rules ...) entry must be a non-empty list"); @@ -2777,7 +2889,7 @@ static ray_t* dl_parse_inline_rule(dl_rule_t* out, ray_t* rule_list) { int64_t rlen = ray_len(rule_list); dl_var_map_t vars; memset(&vars, 0, sizeof(vars)); - return dl_parse_rule_from_head_and_body(out, re[0], &re[1], rlen - 1, &vars); + return dl_parse_rule_from_head_and_body(out, re[0], &re[1], rlen - 1, &vars, prog); } /* (rule (head-name ?v1 ?v2 ...) clause1 clause2 ...) @@ -2793,7 +2905,7 @@ ray_t* ray_rule_fn(ray_t** args, int64_t n) { dl_var_map_t vars; memset(&vars, 0, sizeof(vars)); dl_rule_t rule; - ray_t* perr = dl_parse_rule_from_head_and_body(&rule, args[0], &args[1], n - 1, &vars); + ray_t* perr = dl_parse_rule_from_head_and_body(&rule, args[0], &args[1], n - 1, &vars, NULL); if (perr) return perr; memcpy(&g_dl_rules[g_dl_n_rules++], &rule, sizeof(dl_rule_t)); @@ -2903,7 +3015,7 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { /* Parse body clauses into the query rule */ for (int64_t i = 1; i < where_len; i++) { - ray_t* err = dl_parse_body_clause(&qrule, where_elems[i], &vars); + ray_t* err = dl_parse_body_clause(&qrule, where_elems[i], &vars, NULL); if (err) { ray_release(db); return err; } } qrule.n_vars = vars.n; @@ -2945,7 +3057,7 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { int64_t rlen = ray_len(rules_clause); for (int64_t i = 1; i < rlen; i++) { dl_rule_t irule; - ray_t* rerr = dl_parse_inline_rule(&irule, re[i]); + ray_t* rerr = dl_parse_inline_rule(&irule, re[i], prog); if (rerr) { dl_program_free(prog); ray_release(db); diff --git a/src/ops/datalog.h b/src/ops/datalog.h index 93ae8b6d..91fe8163 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -179,6 +179,9 @@ dl_program_t* 
dl_program_new(void); /* Free a Datalog program and release all owned tables */ void dl_program_free(dl_program_t* prog); +/** Append rules registered via the Rayfall (rule ...) special form into a program. */ +void dl_append_global_rules(dl_program_t* prog); + /* Register an EDB (extensional) relation backed by an existing table. * Column names are auto-generated as "c0", "c1", ... unless the table * already has named columns. */ diff --git a/test/test_datalog.c b/test/test_datalog.c index 5291b3ef..d67a6c0e 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -29,8 +29,15 @@ #include #include "mem/heap.h" #include "ops/datalog.h" +#include "lang/eval.h" #include +struct ray_runtime_s; +typedef struct ray_runtime_s ray_runtime_t; +extern ray_runtime_t* ray_runtime_create(int argc, char** argv); +extern void ray_runtime_destroy(ray_runtime_t* rt); +extern ray_runtime_t* __RUNTIME; + static void* datalog_setup(const void* params, void* user_data) { (void)params; (void)user_data; ray_heap_init(); @@ -44,6 +51,18 @@ static void datalog_teardown(void* fixture) { ray_heap_destroy(); } +/* Full runtime — required for ray_eval_str("(rule ...)") surface-syntax tests. */ +static void* datalog_rf_setup(const void* params, void* user_data) { + (void)params; (void)user_data; + ray_runtime_create(0, NULL); + return NULL; +} + +static void datalog_rf_teardown(void* fixture) { + (void)fixture; + ray_runtime_destroy(__RUNTIME); +} + /* Verify that dl_get_provenance_src_offsets and dl_get_provenance_src_data * are populated correctly for a simple one-rule derivation. 
* @@ -658,6 +677,104 @@ static MunitResult test_agg_sum_grouped(const void* params, void* fixture) { return MUNIT_OK; } +/* Surface syntax: (rule (wcount ?n) (count ?n weight)) */ +static MunitResult test_agg_parse_count_scalar(const void* params, void* fixture) { + (void)params; (void)fixture; + + ray_t* ok = ray_eval_str("(rule (wcount ?n) (count ?n weight))"); + munit_assert_ptr_not_null(ok); + munit_assert(!RAY_IS_ERR(ok)); + ray_release(ok); + + int64_t vals[] = {50, 60, 75, 85}; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 4); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + dl_append_global_rules(prog); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wcount"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 4); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} + +/* Surface syntax: (rule (wsum ?s) (sum ?s weight 0)) */ +static MunitResult test_agg_parse_sum_scalar(const void* params, void* fixture) { + (void)params; (void)fixture; + + ray_t* ok = ray_eval_str("(rule (wsum ?s) (sum ?s weight 0))"); + munit_assert_ptr_not_null(ok); + munit_assert(!RAY_IS_ERR(ok)); + ray_release(ok); + + ray_t* weight = make_weight_edb(); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + dl_append_global_rules(prog); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wsum"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 270); + + dl_program_free(prog); + ray_release(weight); + return MUNIT_OK; +} + +/* Surface syntax: 
(count ?n weight_by_user by ?u 0) */ +static MunitResult test_agg_parse_count_grouped(const void* params, void* fixture) { + (void)params; (void)fixture; + + ray_t* ok = ray_eval_str( + "(rule (user_count ?u ?n) (count ?n weight_by_user by ?u 0))"); + munit_assert_ptr_not_null(ok); + munit_assert(!RAY_IS_ERR(ok)); + ray_release(ok); + + int64_t users[] = {1, 1, 2, 2}; + int64_t weights[] = {50, 60, 75, 85}; + ray_t* u_col = ray_vec_from_raw(RAY_I64, users, 4); + ray_t* w_col = ray_vec_from_raw(RAY_I64, weights, 4); + ray_t* tbl = ray_table_new(2); + tbl = ray_table_add_col(tbl, ray_sym_intern("weight_by_user__c0", 18), u_col); + tbl = ray_table_add_col(tbl, ray_sym_intern("weight_by_user__c1", 18), w_col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight_by_user", tbl, 2); + dl_append_global_rules(prog); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "user_count"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + + int64_t* uo = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + int64_t* no = (int64_t*)ray_data(ray_table_get_col_idx(out, 1)); + int seen_u1 = 0, seen_u2 = 0; + for (int i = 0; i < 2; i++) { + if (uo[i] == 1) { munit_assert_int((int)no[i], ==, 2); seen_u1 = 1; } + else if (uo[i] == 2) { munit_assert_int((int)no[i], ==, 2); seen_u2 = 1; } + } + munit_assert_int(seen_u1 && seen_u2, ==, 1); + + dl_program_free(prog); + ray_release(tbl); ray_release(u_col); ray_release(w_col); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -674,6 +791,9 @@ static MunitTest datalog_tests[] = { { "/agg_count_empty", test_agg_count_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_count_grouped", test_agg_count_grouped, datalog_setup, datalog_teardown, 0, NULL }, 
{ "/agg_sum_grouped", test_agg_sum_grouped, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_parse_count_scalar", test_agg_parse_count_scalar, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/agg_parse_sum_scalar", test_agg_parse_sum_scalar, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/agg_parse_count_grouped", test_agg_parse_count_grouped, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 6793bdae682878d5e676aa7937c2b0cc992ec50a Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 19:13:13 +0300 Subject: [PATCH 13/51] feat(datalog): float constants in expressions; AVG output promoted to f64 Made-with: Cursor --- src/ops/datalog.c | 91 +++++++++++++++++++++++++++++++++++++++++---- src/ops/datalog.h | 11 ++++-- test/test_datalog.c | 43 ++++++++++++++++++++- 3 files changed, 132 insertions(+), 13 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 27b78f70..f56bfc2f 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -284,6 +284,14 @@ dl_expr_t* dl_expr_const(int64_t val) { return e; } +dl_expr_t* dl_expr_const_f64(double val) { + dl_expr_t* e = dl_expr_alloc(); + if (!e) return NULL; + e->kind = DL_EXPR_CONST_F64; + e->const_f64 = val; + return e; +} + dl_expr_t* dl_expr_var(int var_idx) { dl_expr_t* e = dl_expr_alloc(); if (!e) return NULL; @@ -513,8 +521,26 @@ int dl_stratify(dl_program_t* prog) { * Expression evaluation — compute column from expression tree * ======================================================================== */ +/* Helper: materialize a column of the given type/size as a copy or promotion + * of src. If target==RAY_F64 and src is RAY_I64, promote. Returns new owned column. 
*/ +static ray_t* dl_col_as_f64(ray_t* src, int64_t nrows) { + ray_t* out = ray_vec_new(RAY_F64, nrows); + if (!out || RAY_IS_ERR(out)) return NULL; + out->len = nrows; + double* od = (double*)ray_data(out); + if (src->type == RAY_F64) { + memcpy(od, ray_data(src), (size_t)nrows * sizeof(double)); + } else { /* RAY_I64 */ + int64_t* sd = (int64_t*)ray_data(src); + for (int64_t r = 0; r < nrows; r++) od[r] = (double)sd[r]; + } + return out; +} + /* Evaluate an expression tree against the accumulator table. - * Returns a new owned I64 vector of length nrows. */ + * Returns a new owned vector of length nrows. The element type is RAY_F64 + * if the expression involves any float constant or any RAY_F64 source column, + * otherwise RAY_I64. */ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, int* var_col, int64_t nrows) { if (!expr) return NULL; @@ -529,14 +555,25 @@ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, d[r] = expr->const_val; return col; } + case DL_EXPR_CONST_F64: { + ray_t* col = ray_vec_new(RAY_F64, nrows); + if (!col || RAY_IS_ERR(col)) return NULL; + col->len = nrows; + double* d = (double*)ray_data(col); + for (int64_t r = 0; r < nrows; r++) + d[r] = expr->const_f64; + return col; + } case DL_EXPR_VAR: { int ci = var_col[expr->var_idx]; ray_t* src = ray_table_get_col_idx(accum, ci); if (!src) return NULL; - ray_t* dst = ray_vec_new(RAY_I64, nrows); + int8_t t = (src->type == RAY_F64) ? RAY_F64 : RAY_I64; + size_t elem = (t == RAY_F64) ? 
sizeof(double) : sizeof(int64_t); + ray_t* dst = ray_vec_new(t, nrows); if (!dst || RAY_IS_ERR(dst)) return NULL; dst->len = nrows; - memcpy(ray_data(dst), ray_data(src), (size_t)nrows * sizeof(int64_t)); + memcpy(ray_data(dst), ray_data(src), (size_t)nrows * elem); return dst; } case DL_EXPR_BINOP: { @@ -547,6 +584,36 @@ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, if (rv) ray_release(rv); return NULL; } + bool is_f64 = (lv->type == RAY_F64) || (rv->type == RAY_F64); + if (is_f64) { + ray_t* lf = dl_col_as_f64(lv, nrows); + ray_t* rf = dl_col_as_f64(rv, nrows); + ray_release(lv); ray_release(rv); + if (!lf || !rf) { + if (lf) ray_release(lf); + if (rf) ray_release(rf); + return NULL; + } + ray_t* out = ray_vec_new(RAY_F64, nrows); + if (!out || RAY_IS_ERR(out)) { + ray_release(lf); ray_release(rf); return NULL; + } + out->len = nrows; + double* ld = (double*)ray_data(lf); + double* rd = (double*)ray_data(rf); + double* od = (double*)ray_data(out); + for (int64_t r = 0; r < nrows; r++) { + switch (expr->binop) { + case OP_ADD: od[r] = ld[r] + rd[r]; break; + case OP_SUB: od[r] = ld[r] - rd[r]; break; + case OP_MUL: od[r] = ld[r] * rd[r]; break; + case OP_DIV: od[r] = rd[r] != 0.0 ? 
ld[r] / rd[r] : 0.0; break; + default: od[r] = 0.0; break; + } + } + ray_release(lf); ray_release(rf); + return out; + } ray_t* out = ray_vec_new(RAY_I64, nrows); if (!out || RAY_IS_ERR(out)) { ray_release(lv); ray_release(rv); return NULL; @@ -1152,6 +1219,8 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } int64_t result = 0; + double favg = 0.0; + bool is_avg = (body->agg_op == DL_AGG_AVG); switch (body->agg_op) { case DL_AGG_COUNT: result = src_nrows; @@ -1189,7 +1258,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, int64_t acc = 0; for (int64_t i = 0; i < src_nrows; i++) acc += vd[i]; - result = acc / src_nrows; + favg = (double)acc / (double)src_nrows; } } } @@ -1201,13 +1270,17 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, int64_t nrows = ray_table_nrows(accum); if (nrows == 0) break; - ray_t* new_col = ray_vec_new(RAY_I64, nrows); + ray_t* new_col = ray_vec_new(is_avg ? RAY_F64 : RAY_I64, nrows); if (!new_col || RAY_IS_ERR(new_col)) break; new_col->len = nrows; - int64_t* nd = (int64_t*)ray_data(new_col); - for (int64_t r = 0; r < nrows; r++) - nd[r] = result; + if (is_avg) { + double* nd = (double*)ray_data(new_col); + for (int64_t r = 0; r < nrows; r++) nd[r] = favg; + } else { + int64_t* nd = (int64_t*)ray_data(new_col); + for (int64_t r = 0; r < nrows; r++) nd[r] = result; + } int new_col_idx = (int)ray_table_ncols(accum); char colname[32]; @@ -2461,6 +2534,8 @@ static dl_expr_t* dl_build_expr(ray_t* node, dl_var_map_t* vars) { if (!node) return NULL; if (node->type == -RAY_I64) return dl_expr_const(node->i64); + if (node->type == -RAY_F64) + return dl_expr_const_f64(node->f64); if (node->type == -RAY_SYM && is_dl_var(node)) { int vi = dl_var_get_or_create(vars, node->i64); return (vi >= 0) ? 
dl_expr_var(vi) : NULL; diff --git a/src/ops/datalog.h b/src/ops/datalog.h index 91fe8163..e361a1c1 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -71,14 +71,16 @@ /* ===== Expression AST for assignments ===== */ typedef enum { - DL_EXPR_CONST, /* integer constant */ - DL_EXPR_VAR, /* bound variable reference */ - DL_EXPR_BINOP, /* binary op: +, -, *, / */ + DL_EXPR_CONST, /* integer constant (back-compat) */ + DL_EXPR_CONST_F64, /* float constant */ + DL_EXPR_VAR, /* bound variable reference */ + DL_EXPR_BINOP, /* binary op: +, -, *, / */ } dl_expr_kind_t; typedef struct dl_expr { dl_expr_kind_t kind; int64_t const_val; /* for DL_EXPR_CONST */ + double const_f64; /* for DL_EXPR_CONST_F64 */ int var_idx; /* for DL_EXPR_VAR */ int binop; /* for DL_EXPR_BINOP: OP_ADD, OP_SUB, etc. */ struct dl_expr *left; /* for DL_EXPR_BINOP */ @@ -294,6 +296,9 @@ int dl_rule_agg_set_group(dl_rule_t* rule, int body_idx, /* Create a constant expression */ dl_expr_t* dl_expr_const(int64_t val); +/* Create a float constant expression */ +dl_expr_t* dl_expr_const_f64(double val); + /* Create a variable reference expression */ dl_expr_t* dl_expr_var(int var_idx); diff --git a/test/test_datalog.c b/test/test_datalog.c index d67a6c0e..bfd943fd 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -323,6 +323,41 @@ static MunitResult test_arith_assignment(const void* params, void* fixture) { return MUNIT_OK; } +/* Float arithmetic: (rule (fres ?x ?z) (trig ?x) (= ?z (+ 1.5 2.5))) -> z = 4.0 */ +static MunitResult test_arith_assign_f64(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t one[] = { 1 }; + ray_t* col = ray_vec_from_raw(RAY_I64, one, 1); + ray_t* trig = ray_table_new(1); + trig = ray_table_add_col(trig, ray_sym_intern("trig__c0", 8), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "trig", trig, 1); + + dl_rule_t r; dl_rule_init(&r, "fres", 2); + dl_rule_head_var(&r, 0, 0); dl_rule_head_var(&r, 1, 1); + int b = 
dl_rule_add_atom(&r, "trig", 1); + dl_body_set_var(&r, b, 0, 0); + + dl_expr_t* e = dl_expr_binop(OP_ADD, dl_expr_const_f64(1.5), dl_expr_const_f64(2.5)); + dl_rule_add_assign(&r, 1, DL_OP_EQ, e); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "fres"); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + ray_t* z_col = ray_table_get_col_idx(out, 1); + munit_assert_ptr_not_null(z_col); + munit_assert_int(z_col->type, ==, RAY_F64); + double* zd = (double*)ray_data(z_col); + munit_assert_double_equal(zd[0], 4.0, 4); + + dl_program_free(prog); + ray_release(trig); ray_release(col); + return MUNIT_OK; +} + /* Verify dl_rule_add_agg populates body fields correctly. */ static MunitResult test_agg_builder(const void* params, void* fixture) { (void)params; (void)fixture; @@ -523,8 +558,11 @@ static MunitResult test_agg_avg(const void* params, void* fixture) { ray_t* out = dl_query(prog, "wavg"); munit_assert_ptr_not_null(out); munit_assert_int((int)ray_table_nrows(out), ==, 1); - int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); - munit_assert_int((int)od[0], ==, 67); + ray_t* avg_col = ray_table_get_col_idx(out, 0); + munit_assert_ptr_not_null(avg_col); + munit_assert_int(avg_col->type, ==, RAY_F64); + double* od = (double*)ray_data(avg_col); + munit_assert_double_equal(od[0], 67.5, 4); dl_program_free(prog); ray_release(weight); @@ -780,6 +818,7 @@ static MunitTest datalog_tests[] = { { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, { "/cmp_const_filter", test_cmp_const_filter, datalog_setup, datalog_teardown, 0, NULL }, { "/arith_assignment", test_arith_assignment, datalog_setup, datalog_teardown, 0, NULL }, + { "/arith_assign_f64", test_arith_assign_f64, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_builder", test_agg_builder, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_stratifies_above_source", 
test_agg_stratifies_above_source, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_count_edb", test_agg_count_edb, datalog_setup, datalog_teardown, 0, NULL }, From e8c43b7ee83e9bc2dc000f71b13b3e6119f9268a Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 19:16:42 +0300 Subject: [PATCH 14/51] feat(datalog): (between ?x lo hi) parser sugar lowers to two cmps Made-with: Cursor --- src/ops/datalog.c | 19 +++++++++++++++++++ test/test_datalog.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index f56bfc2f..0df91a19 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -2835,6 +2835,25 @@ static ray_t* dl_parse_body_clause(dl_rule_t* rule, ray_t* clause, return NULL; } + /* -- Between sugar: (between ?x lo hi) -> (>= ?x lo) and (<= ?x hi) -- */ + if (clen == 4 && ce[0]->type == -RAY_SYM) { + ray_t* nm = ray_sym_str(ce[0]->i64); + if (nm && strcmp(ray_str_ptr(nm), "between") == 0) { + if (!is_dl_var(ce[1])) + return ray_error("type", "between target must be a ?variable"); + int vi = dl_var_get_or_create(vars, ce[1]->i64); + if (vi < 0) + return ray_error("domain", "between: too many variables"); + if (ce[2]->type != -RAY_I64 || ce[3]->type != -RAY_I64) + return ray_error("type", "between bounds must be integer constants"); + if (dl_rule_add_cmp_const(rule, DL_CMP_GE, vi, ce[2]->i64) < 0) + return ray_error("domain", "rule: too many body literals"); + if (dl_rule_add_cmp_const(rule, DL_CMP_LE, vi, ce[3]->i64) < 0) + return ray_error("domain", "rule: too many body literals"); + return NULL; + } + } + /* -- Assignment: (= ?var expr) -- */ if (dl_is_assignment(clause)) { int target_vi = dl_var_get_or_create(vars, ce[1]->i64); diff --git a/test/test_datalog.c b/test/test_datalog.c index bfd943fd..f4fcd551 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -813,6 +813,43 @@ static MunitResult test_agg_parse_count_grouped(const void* params, void* fixtur return 
MUNIT_OK; } +/* Surface syntax: (between ?w lo hi) -> two cmp literals; weight 50,60,75,85 -> mid: 60, 75 */ +static MunitResult test_between_sugar_parse(const void* params, void* fixture) { + (void)params; (void)fixture; + + ray_t* ok = ray_eval_str( + "(rule (mid ?w) (weight ?w) (between ?w 60 80))"); + munit_assert_ptr_not_null(ok); + munit_assert(!RAY_IS_ERR(ok)); + ray_release(ok); + + int64_t vals[] = {50, 60, 75, 85}; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 4); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + dl_append_global_rules(prog); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "mid"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + int seen60 = 0, seen75 = 0; + for (int i = 0; i < 2; i++) { + if (od[i] == 60) seen60 = 1; + else if (od[i] == 75) seen75 = 1; + } + munit_assert_int(seen60 && seen75, ==, 1); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -833,6 +870,7 @@ static MunitTest datalog_tests[] = { { "/agg_parse_count_scalar", test_agg_parse_count_scalar, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_parse_sum_scalar", test_agg_parse_sum_scalar, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_parse_count_grouped", test_agg_parse_count_grouped, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/between_sugar_parse", test_between_sugar_parse, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 
c1933f8aa86114aa11e5d4c18a1198151e8a8af0 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 19:19:16 +0300 Subject: [PATCH 15/51] test(datalog): aggregate empty-source and parser error-path tests Made-with: Cursor --- test/test_datalog.c | 124 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/test/test_datalog.c b/test/test_datalog.c index f4fcd551..0da8e0d3 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -625,6 +625,87 @@ static MunitResult test_agg_count_empty(const void* params, void* fixture) { return MUNIT_OK; } +/* MAX over empty source -> rule produces no row. */ +static MunitResult test_agg_max_empty(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t dummy = 0; + ray_t* empty_vec = ray_vec_from_raw(RAY_I64, &dummy, 0); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), empty_vec); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wmax", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_MAX, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wmax"); + if (out) munit_assert_int((int)ray_table_nrows(out), ==, 0); + + dl_program_free(prog); + ray_release(weight); ray_release(empty_vec); + return MUNIT_OK; +} + +/* SUM over empty source -> one row with value 0 (additive identity). 
*/ +static MunitResult test_agg_sum_empty(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t dummy = 0; + ray_t* empty_vec = ray_vec_from_raw(RAY_I64, &dummy, 0); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), empty_vec); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wsum", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_SUM, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wsum"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + int64_t* od = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + munit_assert_int((int)od[0], ==, 0); + + dl_program_free(prog); + ray_release(weight); ray_release(empty_vec); + return MUNIT_OK; +} + +/* AVG over empty source -> rule produces no row. */ +static MunitResult test_agg_avg_empty(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t dummy = 0; + ray_t* empty_vec = ray_vec_from_raw(RAY_I64, &dummy, 0); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), empty_vec); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + dl_rule_t r; dl_rule_init(&r, "wavg", 1); + dl_rule_head_var(&r, 0, 0); + dl_rule_add_agg(&r, DL_AGG_AVG, 0, "weight", 1, 0); + r.n_vars = 1; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "wavg"); + if (out) munit_assert_int((int)ray_table_nrows(out), ==, 0); + + dl_program_free(prog); + ray_release(weight); ray_release(empty_vec); + return MUNIT_OK; +} + /* weight_by_user(user_id, kg): (1,50), (1,60), (2,75), (2,85) * Rule: user_count(?u, ?n) :- count(?n, weight_by_user) by (?u, col 0) * Expected: (1,2), (2,2) */ @@ -850,6 +931,42 @@ static MunitResult 
test_between_sugar_parse(const void* params, void* fixture) { return MUNIT_OK; } +/* A5 aggregate parser rejects COUNT with explicit value column. */ +static MunitResult test_agg_parse_reject_count_with_col(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* r = ray_eval_str("(rule (w ?n) (count ?n weight 0))"); + munit_assert_true(RAY_IS_ERR(r)); + ray_release(r); + return MUNIT_OK; +} + +/* A5 aggregate parser rejects SUM without value column index. */ +static MunitResult test_agg_parse_reject_sum_without_col(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* r = ray_eval_str("(rule (w ?s) (sum ?s weight))"); + munit_assert_true(RAY_IS_ERR(r)); + ray_release(r); + return MUNIT_OK; +} + +/* A5 aggregate parser rejects incomplete `by` clause (missing column after key var). */ +static MunitResult test_agg_parse_reject_by_missing_col(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* r = ray_eval_str("(rule (w ?n) (count ?n weight by ?k))"); + munit_assert_true(RAY_IS_ERR(r)); + ray_release(r); + return MUNIT_OK; +} + +/* A5 aggregate parser rejects non-variable aggregate target. 
*/ +static MunitResult test_agg_parse_reject_non_var_target(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* r = ray_eval_str("(rule (w ?x) (count 5 weight))"); + munit_assert_true(RAY_IS_ERR(r)); + ray_release(r); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -864,12 +981,19 @@ static MunitTest datalog_tests[] = { { "/agg_max", test_agg_max, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_avg", test_agg_avg, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_min_empty", test_agg_min_empty, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_max_empty", test_agg_max_empty, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_sum_empty", test_agg_sum_empty, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_avg_empty", test_agg_avg_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_count_empty", test_agg_count_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_count_grouped", test_agg_count_grouped, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_sum_grouped", test_agg_sum_grouped, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_parse_count_scalar", test_agg_parse_count_scalar, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_parse_sum_scalar", test_agg_parse_sum_scalar, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_parse_count_grouped", test_agg_parse_count_grouped, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/agg_parse_reject_count_with_col", test_agg_parse_reject_count_with_col, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/agg_parse_reject_sum_without_col", test_agg_parse_reject_sum_without_col, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/agg_parse_reject_by_missing_col", test_agg_parse_reject_by_missing_col, datalog_rf_setup, 
datalog_rf_teardown, 0, NULL }, + { "/agg_parse_reject_non_var_target", test_agg_parse_reject_non_var_target, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/between_sugar_parse", test_between_sugar_parse, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From c783977ee3373aafd2484aea0d0e8a4de8b94027 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 19:19:27 +0300 Subject: [PATCH 16/51] test(datalog): mixed i64+f64 arithmetic promotion Made-with: Cursor --- test/test_datalog.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/test_datalog.c b/test/test_datalog.c index 0da8e0d3..8eeb15bf 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -931,6 +931,41 @@ static MunitResult test_between_sugar_parse(const void* params, void* fixture) { return MUNIT_OK; } +/* Mixed i64 + f64: (= ?z (+ 1.5 ?x)) promotes to RAY_F64. */ +static MunitResult test_arith_assign_f64_mixed(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t one[] = { 1 }; + ray_t* col = ray_vec_from_raw(RAY_I64, one, 1); + ray_t* trig = ray_table_new(1); + trig = ray_table_add_col(trig, ray_sym_intern("trig__c0", 8), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "trig", trig, 1); + + dl_rule_t r; dl_rule_init(&r, "fres", 2); + dl_rule_head_var(&r, 0, 0); dl_rule_head_var(&r, 1, 1); + int b = dl_rule_add_atom(&r, "trig", 1); + dl_body_set_var(&r, b, 0, 0); + + dl_expr_t* e = dl_expr_binop(OP_ADD, dl_expr_const_f64(1.5), dl_expr_var(0)); + dl_rule_add_assign(&r, 1, DL_OP_EQ, e); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "fres"); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + ray_t* z_col = ray_table_get_col_idx(out, 1); + munit_assert_ptr_not_null(z_col); + munit_assert_int(z_col->type, ==, RAY_F64); + double* zd = (double*)ray_data(z_col); + 
munit_assert_double_equal(zd[0], 2.5, 4); + + dl_program_free(prog); + ray_release(trig); ray_release(col); + return MUNIT_OK; +} + /* A5 aggregate parser rejects COUNT with explicit value column. */ static MunitResult test_agg_parse_reject_count_with_col(const void* params, void* fixture) { (void)params; (void)fixture; @@ -973,6 +1008,7 @@ static MunitTest datalog_tests[] = { { "/cmp_const_filter", test_cmp_const_filter, datalog_setup, datalog_teardown, 0, NULL }, { "/arith_assignment", test_arith_assignment, datalog_setup, datalog_teardown, 0, NULL }, { "/arith_assign_f64", test_arith_assign_f64, datalog_setup, datalog_teardown, 0, NULL }, + { "/arith_assign_f64_mixed", test_arith_assign_f64_mixed, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_builder", test_agg_builder, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_stratifies_above_source", test_agg_stratifies_above_source, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_count_edb", test_agg_count_edb, datalog_setup, datalog_teardown, 0, NULL }, From 7351910b2c4765591bc9dfb9da3360ea776164a3 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sat, 18 Apr 2026 19:19:27 +0300 Subject: [PATCH 17/51] refactor(datalog): scope DL_AGG empty guard to SUM; doc arity re-resolution Made-with: Cursor --- src/ops/datalog.c | 4 +--- src/ops/datalog.h | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 0df91a19..543bb834 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1229,9 +1229,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, case DL_AGG_MIN: case DL_AGG_MAX: case DL_AGG_AVG: - if (src_nrows <= 0) { - result = 0; - } else { + if (src_nrows > 0) { ray_t* val_col = ray_table_get_col_idx(src_table, body->agg_value_col); if (!val_col || val_col->type != RAY_I64) { diff --git a/src/ops/datalog.h b/src/ops/datalog.h index e361a1c1..64af41d8 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -274,6 +274,7 @@ int 
dl_rule_add_cmp_expr(dl_rule_t* rule, int cmp_op, dl_expr_t* lhs, dl_expr_t* * position into start_var and end_var. Returns body literal index. */ int dl_rule_add_interval(dl_rule_t* rule, int fact_var, int start_var, int end_var); +/* pred_arity is advisory; evaluator re-resolves against program EDB/IDB at compile time. */ /* Add an aggregate body literal: (op ?target pred col) * - op: DL_AGG_COUNT (col is ignored), DL_AGG_SUM/MIN/MAX/AVG * - target_var: variable that receives the aggregate result From 4724a6ef9d4a6107e0ae71b45a9222ad5c4d6bfc Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sun, 19 Apr 2026 01:42:34 +0300 Subject: [PATCH 18/51] feat(datalog): auto-register env-bound EDBs for query rule bodies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ray_query_fn scans rules after they're added to the temporary program, resolves any unknown body predicate against the ray env, and registers matching RAY_TABLE bindings as additional EDBs. SYM columns are converted to I64 the same way the primary eav table is, so cmp / agg over their value columns behaves natively. Unlocks per-type fact sub-tables in ray-exomem (facts_i64 / facts_str / facts_sym) and any similar pattern in other consumers — no C changes needed per relation. --- src/ops/datalog.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 543bb834..3ed0694a 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -3169,6 +3169,57 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { /* Add the synthetic query rule */ dl_add_rule(prog, &qrule); + /* Auto-register env-bound EDB tables referenced from rule bodies. + * + * Rationale: the primary `db` argument becomes the `eav` EDB (above). + * User rules can also reference additional relations by name + * (e.g. `(facts_i64 ?e ?a ?v)`). 
Rather than force callers to pre-declare + * every EDB, scan the program's rule bodies for positive / negative atom + * predicates that are not yet known as a relation, look them up in the + * global ray env, and register them when they resolve to a RAY_TABLE of + * matching arity. SYM columns are converted to I64 (same treatment as + * the primary `eav` table). + * + * The built-in synthetic "__query" / "eav" names are skipped. */ + for (int ri = 0; ri < prog->n_rules; ri++) { + dl_rule_t* rr = &prog->rules[ri]; + for (int bi = 0; bi < rr->n_body; bi++) { + dl_body_t* bd = &rr->body[bi]; + if (bd->type != DL_POS && bd->type != DL_NEG) continue; + if (bd->pred[0] == '\0') continue; + if (strcmp(bd->pred, "eav") == 0) continue; + if (dl_find_rel(prog, bd->pred) >= 0) continue; + + int64_t env_sym = ray_sym_intern(bd->pred, strlen(bd->pred)); + ray_t* env_val = ray_env_get(env_sym); + if (!env_val || env_val->type != RAY_TABLE) continue; + int64_t ncols = ray_table_ncols(env_val); + if (ncols != bd->arity) continue; + + int64_t nrows_env = ray_table_nrows(env_val); + ray_t* clean = ray_table_new(bd->arity); + for (int c = 0; c < bd->arity; c++) { + ray_t* col = ray_table_get_col_idx(env_val, c); + if (!col) continue; + if (col->type == RAY_SYM) { + ray_t* i64col = ray_vec_new(RAY_I64, nrows_env); + if (i64col && !RAY_IS_ERR(i64col)) { + i64col->len = nrows_env; + int64_t* d = (int64_t*)ray_data(i64col); + for (int64_t r = 0; r < nrows_env; r++) + d[r] = ray_read_sym(ray_data(col), r, col->type, col->attrs); + clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), i64col); + ray_release(i64col); + } + } else { + clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), col); + } + } + dl_add_edb(prog, bd->pred, clean, bd->arity); + ray_release(clean); + } + } + /* Stratify and evaluate */ if (dl_stratify(prog) != 0) { dl_program_free(prog); From a626ccb6e9897f5fb1679154578b9d07637bd804 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sun, 19 Apr 2026 
13:36:08 +0300 Subject: [PATCH 19/51] feat(datalog): typed head constants with broadcast projection Extend dl_rule_t with head_const_types[DL_MAX_ARITY] so each head slot can carry a RAY_I64, RAY_SYM, or RAY_F64 literal instead of a variable. dl_rule_head_const now takes a type tag; dl_rule_head_const_f64 wraps the double-to-int64 bitcast. dl_project broadcasts constant slots into fully-owned ray_vec_new columns (refcount 1, handed to the output table which retains; caller releases after add). This avoids the cross-IDB dangling-pointer corruption the previous attempt tripped on by giving each constant column real heap-backed storage that outlives the producing rule's scratch. dl_add_rule now aligns IDB column types to typed head constants so ray_vec_concat (inside table_union) accepts the merge when the IDB's head slot is SYM. Semi-naive prev_table init reads the relation's current column types instead of hard-coding RAY_I64 so SYM/F64 heads work across the stratum fixpoint. Surface syntax now accepts unquoted numbers, quoted strings (interned to SYM), float literals, and SYM atoms in rule heads. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ops/datalog.c | 168 +++++++++++++++++++++++++++++++++++++++++----- src/ops/datalog.h | 13 +++- 2 files changed, 162 insertions(+), 19 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 3ed0694a..f6940001 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -157,6 +157,51 @@ int dl_ensure_idb(dl_program_t* prog, const char* name, int arity) { * Rule management * ======================================================================== */ +/* When a rule has a typed head constant at slot c, the IDB relation's + * column c must be of that type so ray_vec_concat (used by table_union) + * doesn't reject the merge. Rebuilds matching columns on an *empty* IDB + * table in-place. Safe because schema is established before evaluation. 
*/ +static void dl_idb_align_head_const_types(dl_program_t* prog, const dl_rule_t* rule) { + int rel_idx = dl_find_rel(prog, rule->head_pred); + if (rel_idx < 0) return; + dl_rel_t* rel = &prog->rels[rel_idx]; + if (!rel->is_idb) return; + if (!rel->table || RAY_IS_ERR(rel->table)) return; + if (ray_table_nrows(rel->table) != 0) return; /* types already committed */ + + int ncols = (int)ray_table_ncols(rel->table); + if (ncols != rel->arity) return; + + bool any_change = false; + int8_t desired[DL_MAX_ARITY]; + for (int c = 0; c < rel->arity; c++) { + ray_t* col = ray_table_get_col_idx(rel->table, c); + int8_t cur = col ? col->type : RAY_I64; + int8_t want = rule->head_const_types[c]; + if (want == 0) { + /* No constant hint for this slot — keep current type. */ + desired[c] = cur; + } else { + desired[c] = want; + if (want != cur) any_change = true; + } + } + if (!any_change) return; + + /* Rebuild the table with typed empty columns. */ + ray_t* fresh = ray_table_new(rel->arity); + if (!fresh || RAY_IS_ERR(fresh)) return; + for (int c = 0; c < rel->arity; c++) { + ray_t* empty_col = ray_vec_new(desired[c], 0); + if (!empty_col || RAY_IS_ERR(empty_col)) { ray_release(fresh); return; } + fresh = ray_table_add_col(fresh, rel->col_names[c], empty_col); + ray_release(empty_col); + if (RAY_IS_ERR(fresh)) return; + } + ray_release(rel->table); + rel->table = fresh; +} + int dl_add_rule(dl_program_t* prog, const dl_rule_t* rule) { if (!prog || !rule || prog->n_rules >= DL_MAX_RULES) return -1; @@ -167,6 +212,10 @@ int dl_add_rule(dl_program_t* prog, const dl_rule_t* rule) { /* Ensure IDB relation exists for the head predicate */ dl_ensure_idb(prog, rule->head_pred, rule->head_arity); + /* Align IDB column types to any typed head constants in this rule. + * Must run before evaluation so table_union/concat see matching types. 
*/ + dl_idb_align_head_const_types(prog, rule); + return idx; } @@ -191,13 +240,25 @@ void dl_rule_init(dl_rule_t* rule, const char* head_pred, int head_arity) { void dl_rule_head_var(dl_rule_t* rule, int pos, int var_idx) { if (pos < 0 || pos >= rule->head_arity) return; rule->head_vars[pos] = var_idx; + rule->head_const_types[pos] = 0; if (var_idx + 1 > rule->n_vars) rule->n_vars = var_idx + 1; } -void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val) { +void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val, int8_t type) { if (pos < 0 || pos >= rule->head_arity) return; + /* Default to RAY_I64 if an unrecognized type sneaks through; keeps + * old-callers-with-no-type compat when writing to the slot. */ + if (type != RAY_I64 && type != RAY_SYM && type != RAY_F64) + type = RAY_I64; rule->head_vars[pos] = DL_CONST; rule->head_consts[pos] = val; + rule->head_const_types[pos] = type; +} + +void dl_rule_head_const_f64(dl_rule_t* rule, int pos, double val) { + int64_t bits; + memcpy(&bits, &val, sizeof(bits)); + dl_rule_head_const(rule, pos, bits, RAY_F64); } int dl_rule_add_atom(dl_rule_t* rule, const char* pred, int arity) { @@ -887,23 +948,81 @@ static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) { return out; } -/* Helper: project table to selected columns, producing output with head relation naming */ +/* Helper: build a fully-owned broadcast column for a constant head slot. + * + * Returns a fresh ray_t* vec with refcount 1, caller-owned. The caller is + * expected to hand the ref to a table via ray_table_add_col (which retains) + * and then ray_release our owning ref, leaving the table as sole owner. + * + * Correctness note: this must be a real, heap-allocated vec — not a view + * onto rule-local scratch — so that the IDB relation table can outlive the + * per-iteration scratch that built it. Cross-IDB reads at subsequent + * strata borrow from this column via ray_table_get_col_idx. 
*/ +static ray_t* dl_broadcast_const_col(int64_t nrows, int8_t type, int64_t val) { + if (type != RAY_I64 && type != RAY_SYM && type != RAY_F64) { + return ray_error("type", NULL); + } + ray_t* v = ray_vec_new(type, nrows); + if (!v || RAY_IS_ERR(v)) return v; + v->len = nrows; + + if (type == RAY_SYM) { + /* Default sym width from ray_vec_new is W64 → 8-byte entries. */ + uint8_t esz = ray_sym_elem_size(v->type, v->attrs); + (void)esz; + /* Use the generic writer so it handles any adaptive width. */ + void* data = ray_data(v); + for (int64_t i = 0; i < nrows; i++) { + ray_write_sym(data, i, (uint64_t)val, v->type, v->attrs); + } + } else if (type == RAY_F64) { + double d; + memcpy(&d, &val, sizeof(d)); + double* data = (double*)ray_data(v); + for (int64_t i = 0; i < nrows; i++) data[i] = d; + } else { /* RAY_I64 */ + int64_t* data = (int64_t*)ray_data(v); + for (int64_t i = 0; i < nrows; i++) data[i] = val; + } + return v; +} + +/* Helper: project table to selected columns, producing output with head relation naming. + * + * For each output slot c: + * - if col_indices[c] >= 0, copy that column from `tbl` + * - else (constant slot), synthesize a broadcast column from head_consts[c] + * with type head_const_types[c]. 
*/ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, - dl_rel_t* head_rel) { + dl_rel_t* head_rel, const int64_t* head_consts, + const int8_t* head_const_types) { if (!tbl || RAY_IS_ERR(tbl)) return tbl; int64_t nrows = ray_table_nrows(tbl); ray_t* out = ray_table_new(n_out); for (int c = 0; c < n_out; c++) { int src_idx = col_indices[c]; - if (src_idx < 0) continue; /* constant — handled separately */ - ray_t* src = ray_table_get_col_idx(tbl, src_idx); - if (!src) continue; - ray_t* dst = ray_vec_new(src->type, nrows); - if (!dst || RAY_IS_ERR(dst)) continue; - dst->len = nrows; - memcpy(ray_data(dst), ray_data(src), (size_t)nrows * sizeof(int64_t)); - out = ray_table_add_col(out, head_rel->col_names[c], dst); - ray_release(dst); + if (src_idx >= 0) { + ray_t* src = ray_table_get_col_idx(tbl, src_idx); + if (!src) continue; + ray_t* dst = ray_vec_new(src->type, nrows); + if (!dst || RAY_IS_ERR(dst)) continue; + dst->len = nrows; + /* Use element size from the source vec so SYM with any width, + * I64, and F64 all copy correctly. */ + uint8_t esz = ray_sym_elem_size(src->type, src->attrs); + if (esz == 0) { ray_release(dst); continue; } + memcpy(ray_data(dst), ray_data(src), (size_t)nrows * (size_t)esz); + out = ray_table_add_col(out, head_rel->col_names[c], dst); + ray_release(dst); + } else { + /* Constant head slot: materialize an owned broadcast column. */ + int8_t ctype = head_const_types ? 
head_const_types[c] : 0; + if (ctype == 0) continue; /* legacy/unset */ + ray_t* bcast = dl_broadcast_const_col(nrows, ctype, head_consts[c]); + if (!bcast || RAY_IS_ERR(bcast)) continue; + out = ray_table_add_col(out, head_rel->col_names[c], bcast); + ray_release(bcast); + } } return out; } @@ -1462,7 +1581,8 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } } - ray_t* projected = dl_project(accum, proj_cols, rule->head_arity, head_rel); + ray_t* projected = dl_project(accum, proj_cols, rule->head_arity, head_rel, + rule->head_consts, rule->head_const_types); ray_release(accum); /* Store result in the graph as a const_table so the caller can execute */ @@ -1938,10 +2058,15 @@ int dl_eval(dl_program_t* prog) { if (rel->is_idb) { ray_retain(rel->table); delta_tables[rel_idx] = rel->table; - /* prev = empty table with same schema as the relation */ + /* prev = empty table with same schema as the relation. + * Column types must match rel->table so later ray_vec_concat + * calls don't reject the merge when the relation has + * non-i64 columns (e.g. RAY_SYM from head-constant slots). */ prev_tables[rel_idx] = ray_table_new(rel->arity); for (int c = 0; c < rel->arity && c < DL_MAX_ARITY; c++) { - ray_t* empty_col = ray_vec_new(RAY_I64, 0); + ray_t* src = ray_table_get_col_idx(rel->table, c); + int8_t ctype = src ? 
src->type : RAY_I64; + ray_t* empty_col = ray_vec_new(ctype, 0); if (empty_col && !RAY_IS_ERR(empty_col)) { prev_tables[rel_idx] = ray_table_add_col( prev_tables[rel_idx], rel->col_names[c], empty_col); @@ -2955,9 +3080,18 @@ static ray_t* dl_parse_rule_from_head_and_body(dl_rule_t* out, ray_t* head, int vi = dl_var_get_or_create(vars, harg->i64); dl_rule_head_var(out, i, vi); } else if (harg->type == -RAY_I64) { - dl_rule_head_const(out, i, harg->i64); + dl_rule_head_const(out, i, harg->i64, RAY_I64); } else if (harg->type == -RAY_SYM) { - dl_rule_head_const(out, i, harg->i64); + dl_rule_head_const(out, i, harg->i64, RAY_SYM); + } else if (harg->type == -RAY_F64) { + int64_t bits; + memcpy(&bits, &harg->f64, sizeof(bits)); + dl_rule_head_const(out, i, bits, RAY_F64); + } else if (harg->type == -RAY_STR) { + /* Intern the string as a sym so it can be stored in a RAY_SYM + * column. Matches the body-literal parser convention. */ + int64_t sym = ray_sym_intern(ray_str_ptr(harg), ray_str_len(harg)); + dl_rule_head_const(out, i, sym, RAY_SYM); } else { return ray_error("type", "rule: head arguments must be ?variables or constants"); } diff --git a/src/ops/datalog.h b/src/ops/datalog.h index 64af41d8..c188a555 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -143,6 +143,9 @@ typedef struct { int head_arity; int head_vars[DL_MAX_ARITY]; /* variable indices in head */ int64_t head_consts[DL_MAX_ARITY]; /* constants (when head_vars[i] == DL_CONST) */ + int8_t head_const_types[DL_MAX_ARITY]; /* ray type tag per head slot: + * RAY_I64 / RAY_SYM / RAY_F64 when head_vars[i] == DL_CONST, + * 0 when head_vars[i] is a variable. 
*/ int n_body; /* number of body literals */ dl_body_t body[DL_MAX_BODY]; int n_vars; /* total distinct variable count in rule */ @@ -237,8 +240,14 @@ void dl_rule_init(dl_rule_t* rule, const char* head_pred, int head_arity); /* Set a head argument to a variable */ void dl_rule_head_var(dl_rule_t* rule, int pos, int var_idx); -/* Set a head argument to a constant */ -void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val); +/* Set a head argument to a typed constant. + * type must be RAY_I64, RAY_SYM, or RAY_F64. + * For RAY_F64 callers should pass a double reinterpreted via memcpy/union + * into val's int64 slot; dl_rule_head_const_f64 is the safe wrapper. */ +void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val, int8_t type); + +/* Convenience wrapper: set a head argument to a RAY_F64 constant. */ +void dl_rule_head_const_f64(dl_rule_t* rule, int pos, double val); /* Add a positive body atom. Returns body literal index. */ int dl_rule_add_atom(dl_rule_t* rule, const char* pred, int arity); From a3fe8f3d33fd8d98b2fcc53fa814a9d67065e50f Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sun, 19 Apr 2026 13:36:16 +0300 Subject: [PATCH 20/51] test(datalog): head-const single-rule, i64, cross-IDB coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three munit tests for dl_rule_head_const, mirroring the failure surface that sank the prior attempt: - rule_head_const_single_rule: (rule (band "small") (weight ?W) (< ?W 60)) — confirms a SYM-typed head slot lands in the output table as a properly-typed RAY_SYM column. - rule_head_const_i64: (rule (ev ?X 1) (pair ?X ?Y)) — mixed variable + i64-constant head. - rule_head_const_cross_idb: (rule (foo "small") (edge ...)) feeding (rule (bar ?B) (foo ?B)) — the cross-IDB case whose broadcast-column lifetime used to trigger heap-use-after-free. Must pass under ASan+UBSan. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_datalog.c | 161 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/test/test_datalog.c b/test/test_datalog.c index 8eeb15bf..7ed2cc35 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -29,6 +29,7 @@ #include #include "mem/heap.h" #include "ops/datalog.h" +#include "table/sym.h" /* ray_read_sym for SYM column inspection */ #include "lang/eval.h" #include @@ -1002,6 +1003,163 @@ static MunitResult test_agg_parse_reject_non_var_target(const void* params, void return MUNIT_OK; } +/* ===================================================================== + * Head-constant rules (Phase B dep: rayforce2 rule heads may contain + * RAY_SYM / RAY_I64 / RAY_F64 literals alongside variables). These + * tests exist to prevent regression of a previously-reverted attempt + * that corrupted memory across IDB boundaries. + * ===================================================================== */ + +/* (rule (band "small") (weight ?W) (< ?W 60)) — head slot 0 is a SYM constant. + * Analogous to test_cmp_const_filter but uses dl_rule_head_const for a + * one-slot symbolic band label so the output table has a SYM column. 
*/ +static MunitResult test_rule_head_const_single_rule(const void* params, void* fixture) { + (void)params; (void)fixture; + + int64_t vals[] = { 50, 70, 90 }; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 3); + munit_assert_ptr_not_null(col); + + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), col); + munit_assert_false(RAY_IS_ERR(weight)); + + dl_program_t* prog = dl_program_new(); + munit_assert_ptr_not_null(prog); + munit_assert_int(dl_add_edb(prog, "weight", weight, 1), ==, 0); + + /* (rule (band "small") (weight ?W) (< ?W 60)) */ + dl_rule_t rule; + dl_rule_init(&rule, "band", 1); + int64_t sym_small = ray_sym_intern("small", 5); + dl_rule_head_const(&rule, 0, sym_small, RAY_SYM); + + int body = dl_rule_add_atom(&rule, "weight", 1); + dl_body_set_var(&rule, body, 0, 0); /* binds ?W = col 0 */ + int cmp = dl_rule_add_cmp_const(&rule, DL_CMP_LT, 0, 60); + munit_assert_int(cmp, >=, 0); + + rule.n_vars = 1; + munit_assert_int(dl_add_rule(prog, &rule), ==, 0); + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* out = dl_query(prog, "band"); + munit_assert_ptr_not_null(out); + /* One row (50 < 60). Duplicate elimination must leave a single + * ("small",) tuple because all surviving weights broadcast the + * same head constant. */ + munit_assert_int((int)ray_table_nrows(out), ==, 1); + ray_t* oc = ray_table_get_col_idx(out, 0); + munit_assert_ptr_not_null(oc); + munit_assert_int(oc->type, ==, RAY_SYM); + int64_t got = ray_read_sym(ray_data(oc), 0, oc->type, oc->attrs); + munit_assert_int((int)got, ==, (int)sym_small); + + dl_program_free(prog); + ray_release(weight); ray_release(col); + return MUNIT_OK; +} + +/* Head slot holds an I64 constant alongside a variable. 
+ * (rule (ev X 1) (pair ?X ?_)) */ +static MunitResult test_rule_head_const_i64(const void* params, void* fixture) { + (void)params; (void)fixture; + + int64_t a_vals[] = { 10, 20 }; + int64_t b_vals[] = { 1, 2 }; + ray_t* a = ray_vec_from_raw(RAY_I64, a_vals, 2); + ray_t* b = ray_vec_from_raw(RAY_I64, b_vals, 2); + ray_t* pair = ray_table_new(2); + pair = ray_table_add_col(pair, ray_sym_intern("pair__c0", 8), a); + pair = ray_table_add_col(pair, ray_sym_intern("pair__c1", 8), b); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "pair", pair, 2); + + /* (rule (ev ?X 1) (pair ?X ?Y)) */ + dl_rule_t r; dl_rule_init(&r, "ev", 2); + dl_rule_head_var(&r, 0, 0); + dl_rule_head_const(&r, 1, 1, RAY_I64); + int bi = dl_rule_add_atom(&r, "pair", 2); + dl_body_set_var(&r, bi, 0, 0); + dl_body_set_var(&r, bi, 1, 1); + r.n_vars = 2; + munit_assert_int(dl_add_rule(prog, &r), ==, 0); + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* out = dl_query(prog, "ev"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + ray_t* c1 = ray_table_get_col_idx(out, 1); + munit_assert_int(c1->type, ==, RAY_I64); + int64_t* d = (int64_t*)ray_data(c1); + munit_assert_int((int)d[0], ==, 1); + munit_assert_int((int)d[1], ==, 1); + + dl_program_free(prog); + ray_release(pair); ray_release(a); ray_release(b); + return MUNIT_OK; +} + +/* THE FAILURE CASE the previous attempt blew up on: + * R1: (foo "small") :- (edge ?U ?V) head = constant SYM + * R2: (bar ?B) :- (foo ?B) reads R1's constant-head IDB + * Expected: bar contains one row "small". + * Previously: cross-IDB broadcast column dangled; crash or UB. 
+ */ +static MunitResult test_rule_head_const_cross_idb(const void* params, void* fixture) { + (void)params; (void)fixture; + + int64_t u_vals[] = { 1, 2 }; + int64_t v_vals[] = { 2, 3 }; + ray_t* u = ray_vec_from_raw(RAY_I64, u_vals, 2); + ray_t* v = ray_vec_from_raw(RAY_I64, v_vals, 2); + ray_t* edge = ray_table_new(2); + edge = ray_table_add_col(edge, ray_sym_intern("edge__c0", 8), u); + edge = ray_table_add_col(edge, ray_sym_intern("edge__c1", 8), v); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "edge", edge, 2); + + int64_t sym_small = ray_sym_intern("small", 5); + + /* R1: (foo "small") :- (edge ?U ?V) */ + dl_rule_t r1; dl_rule_init(&r1, "foo", 1); + dl_rule_head_const(&r1, 0, sym_small, RAY_SYM); + int r1b = dl_rule_add_atom(&r1, "edge", 2); + dl_body_set_var(&r1, r1b, 0, 0); + dl_body_set_var(&r1, r1b, 1, 1); + r1.n_vars = 2; + munit_assert_int(dl_add_rule(prog, &r1), >=, 0); + + /* R2: (bar ?B) :- (foo ?B) */ + dl_rule_t r2; dl_rule_init(&r2, "bar", 1); + dl_rule_head_var(&r2, 0, 0); + int r2b = dl_rule_add_atom(&r2, "foo", 1); + dl_body_set_var(&r2, r2b, 0, 0); + r2.n_vars = 1; + munit_assert_int(dl_add_rule(prog, &r2), >=, 0); + + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* foo = dl_query(prog, "foo"); + munit_assert_ptr_not_null(foo); + munit_assert_int((int)ray_table_nrows(foo), ==, 1); + + ray_t* bar = dl_query(prog, "bar"); + munit_assert_ptr_not_null(bar); + munit_assert_int((int)ray_table_nrows(bar), ==, 1); + ray_t* bc = ray_table_get_col_idx(bar, 0); + munit_assert_ptr_not_null(bc); + munit_assert_int(bc->type, ==, RAY_SYM); + int64_t got = ray_read_sym(ray_data(bc), 0, bc->type, bc->attrs); + munit_assert_int((int)got, ==, (int)sym_small); + + dl_program_free(prog); + ray_release(edge); ray_release(u); ray_release(v); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", 
test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -1031,6 +1189,9 @@ static MunitTest datalog_tests[] = { { "/agg_parse_reject_by_missing_col", test_agg_parse_reject_by_missing_col, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_parse_reject_non_var_target", test_agg_parse_reject_non_var_target, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/between_sugar_parse", test_between_sugar_parse, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/rule_head_const_single_rule", test_rule_head_const_single_rule, datalog_setup, datalog_teardown, 0, NULL }, + { "/rule_head_const_i64", test_rule_head_const_i64, datalog_setup, datalog_teardown, 0, NULL }, + { "/rule_head_const_cross_idb", test_rule_head_const_cross_idb, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 8c6896733e6ab29b6244212ef9c195c5b7a19e9d Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sun, 19 Apr 2026 13:38:48 +0300 Subject: [PATCH 21/51] test(datalog): head-const f64, agg, negation, stratification, surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five additional head-constant tests covering: - rule_head_const_f64: RAY_F64 head slot via dl_rule_head_const_f64. - rule_head_const_with_agg: (rule (stat "total" ?N) (count ?N weight)) — SYM head slot coexisting with a COUNT aggregate body literal. - rule_head_const_with_negation: stratified negation when a RAY_SYM column is present in an EDB; not a head-const derivation itself but the shape that drives SYM cross-IDB reads under negation. - rule_head_const_stratification: (marker ?X "seen") / not (marker ?X ?K); verifies the stratifier places the negating rule strictly above the constant-head rule and the semantics produce an empty unseen set. 
- rule_head_const_surface_syntax: (rule (foo "a" ?x) (src ?x)) parses through ray_eval_str without error — exercises the Rayfall surface syntax path for head string literals (interned to SYM). Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_datalog.c | 228 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/test/test_datalog.c b/test/test_datalog.c index 7ed2cc35..ec4016b2 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1160,6 +1160,229 @@ static MunitResult test_rule_head_const_cross_idb(const void* params, void* fixt return MUNIT_OK; } +/* Constant head slot holding an F64. */ +static MunitResult test_rule_head_const_f64(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t vals[] = { 1 }; + ray_t* col = ray_vec_from_raw(RAY_I64, vals, 1); + ray_t* trig = ray_table_new(1); + trig = ray_table_add_col(trig, ray_sym_intern("trig__c0", 8), col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "trig", trig, 1); + + /* (rule (pi 3.14) (trig ?X)) */ + dl_rule_t r; dl_rule_init(&r, "pi", 1); + dl_rule_head_const_f64(&r, 0, 3.14); + int b = dl_rule_add_atom(&r, "trig", 1); + dl_body_set_var(&r, b, 0, 0); + r.n_vars = 1; + munit_assert_int(dl_add_rule(prog, &r), >=, 0); + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* out = dl_query(prog, "pi"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + ray_t* oc = ray_table_get_col_idx(out, 0); + munit_assert_int(oc->type, ==, RAY_F64); + double* d = (double*)ray_data(oc); + munit_assert_double_equal(d[0], 3.14, 4); + + dl_program_free(prog); + ray_release(trig); ray_release(col); + return MUNIT_OK; +} + +/* Constant head combined with an aggregate body literal. + * (rule (stat "total" ?N) (count ?N weight)) + * Expected: stat = [("total", 4)]. 
*/ +static MunitResult test_rule_head_const_with_agg(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t wv[] = { 50, 60, 75, 85 }; + ray_t* wc = ray_vec_from_raw(RAY_I64, wv, 4); + ray_t* weight = ray_table_new(1); + weight = ray_table_add_col(weight, ray_sym_intern("weight__c0", 10), wc); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "weight", weight, 1); + + int64_t sym_total = ray_sym_intern("total", 5); + + dl_rule_t r; dl_rule_init(&r, "stat", 2); + dl_rule_head_const(&r, 0, sym_total, RAY_SYM); + dl_rule_head_var(&r, 1, 0); /* ?N */ + dl_rule_add_agg(&r, DL_AGG_COUNT, 0, "weight", 1, 0); + r.n_vars = 1; + munit_assert_int(dl_add_rule(prog, &r), >=, 0); + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* out = dl_query(prog, "stat"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + ray_t* label = ray_table_get_col_idx(out, 0); + munit_assert_int(label->type, ==, RAY_SYM); + int64_t lsym = ray_read_sym(ray_data(label), 0, label->type, label->attrs); + munit_assert_int((int)lsym, ==, (int)sym_total); + + ray_t* nc = ray_table_get_col_idx(out, 1); + int64_t* nd = (int64_t*)ray_data(nc); + munit_assert_int((int)nd[0], ==, 4); + + dl_program_free(prog); + ray_release(weight); ray_release(wc); + return MUNIT_OK; +} + +/* Negation over a relation that is derived via a constant head. + * EDB: kind(1,'big'), kind(2,'big'), kind(3,'small') + * R1: (small ?X) :- (kind ?X 'small') + * R2: (big ?X) :- (kind ?X ?K), not (small ?X) + * + * This exercises two cross-IDB head-const shapes at once: + * (a) R1's head is a variable, but R2 reads a relation whose SCHEMA + * came from a typed const EDB (kind's second col is SYM). + * (b) stratification must place R2 after R1 since R2 negates (small). 
+ * + * The point of this test is that the stratified negation path still + * works when sym-typed IDB columns are present — the broadcast-const + * machinery must not leak into other columns or break antijoin. */ +static MunitResult test_rule_head_const_with_negation(const void* params, void* fixture) { + (void)params; (void)fixture; + + int64_t sym_big = ray_sym_intern("big", 3); + int64_t sym_small = ray_sym_intern("small", 5); + int64_t id_vals[] = { 1, 2, 3 }; + int64_t k_vals [] = { sym_big, sym_big, sym_small }; + + ray_t* id_col = ray_vec_from_raw(RAY_I64, id_vals, 3); + /* Build a SYM vec for the kind column. ray_vec_from_raw on RAY_SYM + * would need width-aware packing; simpler to use ray_vec_new + write. */ + ray_t* k_col = ray_vec_new(RAY_SYM, 3); + k_col->len = 3; + for (int i = 0; i < 3; i++) { + ray_write_sym(ray_data(k_col), i, (uint64_t)k_vals[i], + k_col->type, k_col->attrs); + } + ray_t* kind = ray_table_new(2); + kind = ray_table_add_col(kind, ray_sym_intern("kind__c0", 8), id_col); + kind = ray_table_add_col(kind, ray_sym_intern("kind__c1", 8), k_col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "kind", kind, 2); + + /* R1: (small ?X) :- (kind ?X "small") */ + dl_rule_t r1; dl_rule_init(&r1, "small", 1); + dl_rule_head_var(&r1, 0, 0); + int r1b = dl_rule_add_atom(&r1, "kind", 2); + dl_body_set_var(&r1, r1b, 0, 0); + dl_body_set_const(&r1, r1b, 1, sym_small); + r1.n_vars = 1; + munit_assert_int(dl_add_rule(prog, &r1), >=, 0); + + /* R2: (big ?X) :- (kind ?X ?K), not (small ?X) */ + dl_rule_t r2; dl_rule_init(&r2, "big", 1); + dl_rule_head_var(&r2, 0, 0); + int r2b = dl_rule_add_atom(&r2, "kind", 2); + dl_body_set_var(&r2, r2b, 0, 0); + dl_body_set_var(&r2, r2b, 1, 1); /* ?K */ + int r2n = dl_rule_add_neg(&r2, "small", 1); + dl_body_set_var(&r2, r2n, 0, 0); + r2.n_vars = 2; + munit_assert_int(dl_add_rule(prog, &r2), >=, 0); + + munit_assert_int(dl_stratify(prog), ==, 0); + munit_assert_int(prog->rules[1].stratum, >, 
prog->rules[0].stratum); + + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* big = dl_query(prog, "big"); + munit_assert_ptr_not_null(big); + munit_assert_int((int)ray_table_nrows(big), ==, 2); + + dl_program_free(prog); + ray_release(kind); ray_release(id_col); ray_release(k_col); + return MUNIT_OK; +} + +/* Stratification: when a constant-head IDB is referenced (positively *or* + * through negation) by another rule, the dependency is preserved and the + * negating rule is placed in a strictly higher stratum. + * + * R1: (marker ?X "seen") :- (src ?X) + * R2: (unseen ?X) :- (src ?X), not (marker ?X ?K) + * + * R1 emits one row per src, each with a broadcast SYM. R2 negates on + * the bound var ?X, which the evaluator antijoin-drops. Net result: + * unseen is empty and R2 must be in a higher stratum than R1. */ +static MunitResult test_rule_head_const_stratification(const void* params, void* fixture) { + (void)params; (void)fixture; + + int64_t src_vals[] = { 1, 2 }; + ray_t* sc = ray_vec_from_raw(RAY_I64, src_vals, 2); + ray_t* src = ray_table_new(1); + src = ray_table_add_col(src, ray_sym_intern("src__c0", 7), sc); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "src", src, 1); + + int64_t sym_seen = ray_sym_intern("seen", 4); + + dl_rule_t r1; dl_rule_init(&r1, "marker", 2); + dl_rule_head_var(&r1, 0, 0); + dl_rule_head_const(&r1, 1, sym_seen, RAY_SYM); + int r1b = dl_rule_add_atom(&r1, "src", 1); + dl_body_set_var(&r1, r1b, 0, 0); + r1.n_vars = 1; + munit_assert_int(dl_add_rule(prog, &r1), >=, 0); + + dl_rule_t r2; dl_rule_init(&r2, "unseen", 1); + dl_rule_head_var(&r2, 0, 0); + int r2b = dl_rule_add_atom(&r2, "src", 1); + dl_body_set_var(&r2, r2b, 0, 0); + int r2n = dl_rule_add_neg(&r2, "marker", 2); + dl_body_set_var(&r2, r2n, 0, 0); /* ?X bound from body */ + dl_body_set_var(&r2, r2n, 1, 1); /* ?K body-only var */ + r2.n_vars = 2; + munit_assert_int(dl_add_rule(prog, &r2), >=, 0); + + munit_assert_int(dl_stratify(prog), ==, 0); + 
munit_assert_int(prog->rules[1].stratum, >, prog->rules[0].stratum); + + munit_assert_int(dl_eval(prog), ==, 0); + + /* marker has 2 rows, each with its own x and broadcast SYM. */ + ray_t* m = dl_query(prog, "marker"); + munit_assert_ptr_not_null(m); + munit_assert_int((int)ray_table_nrows(m), ==, 2); + ray_t* msym = ray_table_get_col_idx(m, 1); + munit_assert_int(msym->type, ==, RAY_SYM); + + /* Every src row has a marker, so unseen is empty. */ + ray_t* un = dl_query(prog, "unseen"); + munit_assert_ptr_not_null(un); + munit_assert_int((int)ray_table_nrows(un), ==, 0); + + dl_program_free(prog); + ray_release(src); ray_release(sc); + return MUNIT_OK; +} + +/* Surface syntax round-trip for head constants: (rule (foo "a" ?x) ...) */ +static MunitResult test_rule_head_const_surface_syntax(const void* params, void* fixture) { + (void)params; (void)fixture; + + /* Declare the global rule (foo "a" ?x) :- (src ?x) through ray_eval_str + * so the surface parser is exercised. No EDB is registered and no query + * is run here: the test only asserts that the rule form is not an error. 
*/ + ray_t* r = ray_eval_str( + "(rule (foo \"a\" ?x) (src ?x))" + ); + munit_assert_false(RAY_IS_ERR(r)); + ray_release(r); + + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -1192,6 +1415,11 @@ static MunitTest datalog_tests[] = { { "/rule_head_const_single_rule", test_rule_head_const_single_rule, datalog_setup, datalog_teardown, 0, NULL }, { "/rule_head_const_i64", test_rule_head_const_i64, datalog_setup, datalog_teardown, 0, NULL }, { "/rule_head_const_cross_idb", test_rule_head_const_cross_idb, datalog_setup, datalog_teardown, 0, NULL }, + { "/rule_head_const_f64", test_rule_head_const_f64, datalog_setup, datalog_teardown, 0, NULL }, + { "/rule_head_const_with_agg", test_rule_head_const_with_agg, datalog_setup, datalog_teardown, 0, NULL }, + { "/rule_head_const_with_negation", test_rule_head_const_with_negation, datalog_setup, datalog_teardown, 0, NULL }, + { "/rule_head_const_stratification", test_rule_head_const_stratification, datalog_setup, datalog_teardown, 0, NULL }, + { "/rule_head_const_surface_syntax", test_rule_head_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 086a41a6632e66e0dae20434d62728f95e41b35e Mon Sep 17 00:00:00 2001 From: Aspirational Date: Sun, 19 Apr 2026 13:39:47 +0300 Subject: [PATCH 22/51] feat(datalog): surface syntax accepts RAY_STR in body constant positions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dl_set_body_pos previously accepted only RAY_I64 and RAY_SYM literals for body-position constants. 
Quoted strings (type -RAY_STR) now get interned to a sym and stored as a constant — matching the head-parser convention so (kind ?x "small") parses cleanly and compares equal to other SYM-typed columns at evaluation time. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ops/datalog.c | 8 ++++++++ test/test_datalog.c | 15 +++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index f6940001..ac5dc97f 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -2796,6 +2796,14 @@ static ray_t* dl_set_body_pos(dl_rule_t* rule, int bidx, int pos, } return NULL; } + if (node->type == -RAY_STR) { + /* Quoted string literal in body: intern as sym so it compares + * equal to other sym-interned constants. Mirrors the head + * parser convention. */ + int64_t sym = ray_sym_intern(ray_str_ptr(node), ray_str_len(node)); + dl_body_set_const(rule, bidx, pos, sym); + return NULL; + } /* For other forms (e.g., (quote x)), evaluate to get constant */ ray_t* val = ray_eval(node); if (!val || RAY_IS_ERR(val)) diff --git a/test/test_datalog.c b/test/test_datalog.c index ec4016b2..e78d4ccb 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1383,6 +1383,20 @@ static MunitResult test_rule_head_const_surface_syntax(const void* params, void* return MUNIT_OK; } +/* Surface syntax also must accept string constants in BODY positions so + * that (not (mark "seen")) and (kind ?x "small") parse cleanly. This + * mirrors the Phase B rule shape used by ray-exomem. 
*/ +static MunitResult test_rule_body_const_surface_syntax(const void* params, void* fixture) { + (void)params; (void)fixture; + + ray_t* r = ray_eval_str( + "(rule (q ?x) (kind ?x \"small\"))" + ); + munit_assert_false(RAY_IS_ERR(r)); + ray_release(r); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -1420,6 +1434,7 @@ static MunitTest datalog_tests[] = { { "/rule_head_const_with_negation", test_rule_head_const_with_negation, datalog_setup, datalog_teardown, 0, NULL }, { "/rule_head_const_stratification", test_rule_head_const_stratification, datalog_setup, datalog_teardown, 0, NULL }, { "/rule_head_const_surface_syntax", test_rule_head_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/rule_body_const_surface_syntax", test_rule_body_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 4e9b4dea9868fc462de63af093c46a183ed5e05a Mon Sep 17 00:00:00 2001 From: Aspirational Date: Tue, 21 Apr 2026 13:39:33 +0300 Subject: [PATCH 23/51] fix(datalog): harden expression eval, head-const types, and grouped agg bounds - DL_EXPR_VAR: reject non-numeric (RAY_SYM) sources instead of assuming 8-byte elements. RAY_SYM has adaptive width (1/2/4/8 bytes); the old memcpy with sizeof(int64_t) would overread on narrow columns. - dl_idb_align_head_const_types: fail fast when two rules for the same predicate set conflicting types for a head slot (e.g. SYM vs I64), preventing silent last-writer-wins that would cause ray_vec_concat to reject the merge during evaluation. Also fix error-path leak where ray_table_add_col failure lost the partially-built table pointer. 
- DL_AGG grouped path: bounds-check agg_group_key_cols against source relation arity, matching the existing agg_value_col guard. 689/689 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/ops/datalog.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index ac5dc97f..11522a15 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -179,8 +179,12 @@ static void dl_idb_align_head_const_types(dl_program_t* prog, const dl_rule_t* r int8_t cur = col ? col->type : RAY_I64; int8_t want = rule->head_const_types[c]; if (want == 0) { - /* No constant hint for this slot — keep current type. */ desired[c] = cur; + } else if (cur != RAY_I64 && cur != want) { + /* Slot already typed by a prior rule to a different type. */ + fprintf(stderr, "dl: head-const type conflict at slot %d: " + "existing %d vs rule %d\n", c, cur, want); + return; } else { desired[c] = want; if (want != cur) any_change = true; @@ -194,9 +198,10 @@ static void dl_idb_align_head_const_types(dl_program_t* prog, const dl_rule_t* r for (int c = 0; c < rel->arity; c++) { ray_t* empty_col = ray_vec_new(desired[c], 0); if (!empty_col || RAY_IS_ERR(empty_col)) { ray_release(fresh); return; } + ray_t* prev = fresh; fresh = ray_table_add_col(fresh, rel->col_names[c], empty_col); ray_release(empty_col); - if (RAY_IS_ERR(fresh)) return; + if (RAY_IS_ERR(fresh)) { ray_release(prev); return; } } ray_release(rel->table); rel->table = fresh; @@ -629,9 +634,9 @@ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, int ci = var_col[expr->var_idx]; ray_t* src = ray_table_get_col_idx(accum, ci); if (!src) return NULL; - int8_t t = (src->type == RAY_F64) ? RAY_F64 : RAY_I64; - size_t elem = (t == RAY_F64) ? sizeof(double) : sizeof(int64_t); - ray_t* dst = ray_vec_new(t, nrows); + if (src->type != RAY_I64 && src->type != RAY_F64) return NULL; + size_t elem = (src->type == RAY_F64) ? 
sizeof(double) : sizeof(int64_t); + ray_t* dst = ray_vec_new(src->type, nrows); if (!dst || RAY_IS_ERR(dst)) return NULL; dst->len = nrows; memcpy(ray_data(dst), ray_data(src), (size_t)nrows * elem); @@ -1263,7 +1268,9 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, ray_op_t* keys_ops[DL_AGG_MAX_KEYS]; for (int i = 0; i < nk; i++) { - int64_t sym = src_rel->col_names[body->agg_group_key_cols[i]]; + int kc = body->agg_group_key_cols[i]; + if (kc < 0 || kc >= src_rel->arity) { ray_release(accum); return NULL; } + int64_t sym = src_rel->col_names[kc]; ray_t* s = ray_sym_str(sym); keys_ops[i] = ray_scan(gg, ray_str_ptr(s)); } From f80b2281338854ced6c1281c97e1e75cd5ed63be Mon Sep 17 00:00:00 2001 From: Aspirational Date: Tue, 21 Apr 2026 13:47:33 +0300 Subject: [PATCH 24/51] test(datalog): grouped min/max/avg and env-bound EDB auto-registration - Grouped MIN/MAX/AVG: verify per-group results with two-user dataset (user 1 -> {50,60}, user 2 -> {75,85}). AVG confirms f64 promotion. - Auto-register env-bound EDB: binds a table in the ray env as "extra", then runs a query whose body references it without explicit dl_add_edb. Exercises the ray_query_fn auto-discovery path end-to-end. 693/693 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/test_datalog.c | 192 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/test/test_datalog.c b/test/test_datalog.c index e78d4ccb..a167ff34 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1397,6 +1397,194 @@ static MunitResult test_rule_body_const_surface_syntax(const void* params, void* return MUNIT_OK; } +/* Grouped MIN: user_min(?u, ?m) :- min(?m, weight_by_user col 1) by (?u, col 0) + * Data: user 1 -> {50,60}, user 2 -> {75,85}. Expected: (1,50), (2,75). 
*/ +static MunitResult test_agg_min_grouped(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t users[] = {1, 1, 2, 2}; + int64_t weights[] = {50, 60, 75, 85}; + ray_t* u_col = ray_vec_from_raw(RAY_I64, users, 4); + ray_t* w_col = ray_vec_from_raw(RAY_I64, weights, 4); + ray_t* tbl = ray_table_new(2); + tbl = ray_table_add_col(tbl, ray_sym_intern("wbu__c0", 7), u_col); + tbl = ray_table_add_col(tbl, ray_sym_intern("wbu__c1", 7), w_col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "wbu", tbl, 2); + + dl_rule_t r; dl_rule_init(&r, "user_min", 2); + dl_rule_head_var(&r, 0, 0); + dl_rule_head_var(&r, 1, 1); + int idx = dl_rule_add_agg(&r, DL_AGG_MIN, 1, "wbu", 2, 1); + int key_vars[] = { 0 }; + int key_cols[] = { 0 }; + munit_assert_int(dl_rule_agg_set_group(&r, idx, key_vars, key_cols, 1), ==, 0); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "user_min"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + + int64_t* uo = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + int64_t* mo = (int64_t*)ray_data(ray_table_get_col_idx(out, 1)); + int seen_u1 = 0, seen_u2 = 0; + for (int i = 0; i < 2; i++) { + if (uo[i] == 1) { munit_assert_int((int)mo[i], ==, 50); seen_u1 = 1; } + else if (uo[i] == 2) { munit_assert_int((int)mo[i], ==, 75); seen_u2 = 1; } + } + munit_assert_int(seen_u1 && seen_u2, ==, 1); + + dl_program_free(prog); + ray_release(tbl); ray_release(u_col); ray_release(w_col); + return MUNIT_OK; +} + +/* Grouped MAX: Expected: (1,60), (2,85). 
*/ +static MunitResult test_agg_max_grouped(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t users[] = {1, 1, 2, 2}; + int64_t weights[] = {50, 60, 75, 85}; + ray_t* u_col = ray_vec_from_raw(RAY_I64, users, 4); + ray_t* w_col = ray_vec_from_raw(RAY_I64, weights, 4); + ray_t* tbl = ray_table_new(2); + tbl = ray_table_add_col(tbl, ray_sym_intern("wbu__c0", 7), u_col); + tbl = ray_table_add_col(tbl, ray_sym_intern("wbu__c1", 7), w_col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "wbu", tbl, 2); + + dl_rule_t r; dl_rule_init(&r, "user_max", 2); + dl_rule_head_var(&r, 0, 0); + dl_rule_head_var(&r, 1, 1); + int idx = dl_rule_add_agg(&r, DL_AGG_MAX, 1, "wbu", 2, 1); + int key_vars[] = { 0 }; + int key_cols[] = { 0 }; + munit_assert_int(dl_rule_agg_set_group(&r, idx, key_vars, key_cols, 1), ==, 0); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "user_max"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + + int64_t* uo = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + int64_t* mo = (int64_t*)ray_data(ray_table_get_col_idx(out, 1)); + int seen_u1 = 0, seen_u2 = 0; + for (int i = 0; i < 2; i++) { + if (uo[i] == 1) { munit_assert_int((int)mo[i], ==, 60); seen_u1 = 1; } + else if (uo[i] == 2) { munit_assert_int((int)mo[i], ==, 85); seen_u2 = 1; } + } + munit_assert_int(seen_u1 && seen_u2, ==, 1); + + dl_program_free(prog); + ray_release(tbl); ray_release(u_col); ray_release(w_col); + return MUNIT_OK; +} + +/* Grouped AVG: Expected: (1, 55.0), (2, 80.0). AVG promotes to RAY_F64. 
*/ +static MunitResult test_agg_avg_grouped(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t users[] = {1, 1, 2, 2}; + int64_t weights[] = {50, 60, 75, 85}; + ray_t* u_col = ray_vec_from_raw(RAY_I64, users, 4); + ray_t* w_col = ray_vec_from_raw(RAY_I64, weights, 4); + ray_t* tbl = ray_table_new(2); + tbl = ray_table_add_col(tbl, ray_sym_intern("wbu__c0", 7), u_col); + tbl = ray_table_add_col(tbl, ray_sym_intern("wbu__c1", 7), w_col); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "wbu", tbl, 2); + + dl_rule_t r; dl_rule_init(&r, "user_avg", 2); + dl_rule_head_var(&r, 0, 0); + dl_rule_head_var(&r, 1, 1); + int idx = dl_rule_add_agg(&r, DL_AGG_AVG, 1, "wbu", 2, 1); + int key_vars[] = { 0 }; + int key_cols[] = { 0 }; + munit_assert_int(dl_rule_agg_set_group(&r, idx, key_vars, key_cols, 1), ==, 0); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "user_avg"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 2); + + int64_t* uo = (int64_t*)ray_data(ray_table_get_col_idx(out, 0)); + ray_t* avg_col = ray_table_get_col_idx(out, 1); + munit_assert_int(avg_col->type, ==, RAY_F64); + double* ao = (double*)ray_data(avg_col); + int seen_u1 = 0, seen_u2 = 0; + for (int i = 0; i < 2; i++) { + if (uo[i] == 1) { munit_assert_double_equal(ao[i], 55.0, 4); seen_u1 = 1; } + else if (uo[i] == 2) { munit_assert_double_equal(ao[i], 80.0, 4); seen_u2 = 1; } + } + munit_assert_int(seen_u1 && seen_u2, ==, 1); + + dl_program_free(prog); + ray_release(tbl); ray_release(u_col); ray_release(w_col); + return MUNIT_OK; +} + +/* Auto-register env-bound EDB: bind a table as "extra" in the ray env, + * then run a query whose rule body references "extra" without explicit + * dl_add_edb — ray_query_fn should auto-discover it. + * + * Setup: eav has (1, attr, 100). env has extra(10, 20). 
+ * Rule: result(?x, ?a) :- (eav ?x ?_ ?_) (extra ?a ?_) + * Expected: one row (1, 10) — the cross-product constrained to 1 eav row. */ +static MunitResult test_env_bound_edb_auto_register(const void* params, void* fixture) { + (void)params; (void)fixture; + + /* Build a 1-row EAV table and bind it in the env as "mydb" */ + int64_t es[] = {1}, as[] = {42}, vs[] = {100}; + ray_t* ec = ray_vec_from_raw(RAY_I64, es, 1); + ray_t* ac = ray_vec_from_raw(RAY_I64, as, 1); + ray_t* vc = ray_vec_from_raw(RAY_I64, vs, 1); + ray_t* eav = ray_table_new(3); + eav = ray_table_add_col(eav, ray_sym_intern("e", 1), ec); + eav = ray_table_add_col(eav, ray_sym_intern("a", 1), ac); + eav = ray_table_add_col(eav, ray_sym_intern("v", 1), vc); + int64_t db_sym = ray_sym_intern("mydb", 4); + ray_env_set(db_sym, eav); + + /* Build a 1-row "extra" table and bind it in the env */ + int64_t x0[] = {10}, x1[] = {20}; + ray_t* xc0 = ray_vec_from_raw(RAY_I64, x0, 1); + ray_t* xc1 = ray_vec_from_raw(RAY_I64, x1, 1); + ray_t* extra = ray_table_new(2); + extra = ray_table_add_col(extra, ray_sym_intern("extra__c0", 9), xc0); + extra = ray_table_add_col(extra, ray_sym_intern("extra__c1", 9), xc1); + int64_t extra_sym = ray_sym_intern("extra", 5); + ray_env_set(extra_sym, extra); + + /* Run query through ray_eval_str which invokes ray_query_fn internally. + * "mydb" resolves to the EAV table via env lookup. + * The rule body references "extra" which is only in the env, not pre-registered + * as an EDB — ray_query_fn should auto-discover it. 
*/ + ray_t* result = ray_eval_str( + "(query mydb (find ?x ?a) (where (eav ?x ?p ?v) (extra ?a ?b)))"); + munit_assert_ptr_not_null(result); + munit_assert_false(RAY_IS_ERR(result)); + munit_assert_int(result->type, ==, RAY_TABLE); + munit_assert_int((int)ray_table_nrows(result), ==, 1); + + int64_t* r0 = (int64_t*)ray_data(ray_table_get_col_idx(result, 0)); + int64_t* r1 = (int64_t*)ray_data(ray_table_get_col_idx(result, 1)); + munit_assert_int((int)r0[0], ==, 1); + munit_assert_int((int)r1[0], ==, 10); + + ray_release(result); + ray_env_set(extra_sym, NULL); + ray_env_set(db_sym, NULL); + ray_release(extra); ray_release(xc0); ray_release(xc1); + ray_release(eav); ray_release(ec); ray_release(ac); ray_release(vc); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -1418,6 +1606,9 @@ static MunitTest datalog_tests[] = { { "/agg_count_empty", test_agg_count_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_count_grouped", test_agg_count_grouped, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_sum_grouped", test_agg_sum_grouped, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_min_grouped", test_agg_min_grouped, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_max_grouped", test_agg_max_grouped, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_avg_grouped", test_agg_avg_grouped, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_parse_count_scalar", test_agg_parse_count_scalar, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_parse_sum_scalar", test_agg_parse_sum_scalar, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_parse_count_grouped", test_agg_parse_count_grouped, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, @@ -1435,6 +1626,7 @@ static MunitTest datalog_tests[] = { { 
"/rule_head_const_stratification", test_rule_head_const_stratification, datalog_setup, datalog_teardown, 0, NULL }, { "/rule_head_const_surface_syntax", test_rule_head_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/rule_body_const_surface_syntax", test_rule_body_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/env_bound_edb_auto_register", test_env_bound_edb_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 0a32bd7848f8f61289677effd4a0dcf845e193e3 Mon Sep 17 00:00:00 2001 From: Aspirational Date: Tue, 21 Apr 2026 14:48:22 +0300 Subject: [PATCH 25/51] feat(runtime): ray_runtime_create_with_sym loads sym before builtins MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds ray_runtime_create_with_sym(sym_path) which loads the persisted symbol table between ray_sym_init() and ray_lang_init(). This ensures symbol IDs from prior sessions keep their slots — builtin names (+, -, query, rule, etc.) get appended after persisted entries rather than claiming slots 0..N before sym_load has a chance to restore them. The original ray_runtime_create() is unchanged (passes NULL, no sym load) for backward compatibility. 693/693 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/runtime.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/core/runtime.c b/src/core/runtime.c index 0432be75..0babb7d8 100644 --- a/src/core/runtime.c +++ b/src/core/runtime.c @@ -157,13 +157,23 @@ void ray_error_clear(void) { /* ===== Lifecycle ===== */ -ray_runtime_t* ray_runtime_create(int argc, char** argv) { - (void)argc; (void)argv; - +static ray_runtime_t* runtime_create_impl(const char* sym_path) { /* Init subsystems */ ray_heap_init(); ray_sym_init(); + /* Load persisted symbol table BEFORE any interning (builtins, env). 
+ * This ensures symbol IDs from prior sessions keep their slots, + * and new builtins get appended with fresh IDs. NULL means skip. */ + if (sym_path) { + ray_err_t sym_err = ray_sym_load(sym_path); + if (sym_err != RAY_OK && sym_err != RAY_ERR_CORRUPT) { + /* I/O error — surface it; caller decides policy */ + } + /* RAY_ERR_CORRUPT is non-fatal: proceed with empty table, + * caller rebuilds from authoritative source. */ + } + /* Allocate runtime via system allocator */ ray_runtime_t* rt = (ray_runtime_t*)ray_sys_alloc(sizeof(ray_runtime_t)); if (!rt) return NULL; @@ -196,13 +206,24 @@ ray_runtime_t* ray_runtime_create(int argc, char** argv) { rt->mem_budget = (int64_t)(4ULL << 30); #endif - /* Init language (env + builtins) — must be after __VM is set */ + /* Init language (env + builtins) — must be after __VM is set. + * Builtins intern their names; with sym_path loaded above, those + * names land after any persisted slots. */ ray_lang_init(); __RUNTIME = rt; return rt; } +ray_runtime_t* ray_runtime_create(int argc, char** argv) { + (void)argc; (void)argv; + return runtime_create_impl(NULL); +} + +ray_runtime_t* ray_runtime_create_with_sym(const char* sym_path) { + return runtime_create_impl(sym_path); +} + /* ===== Memory Budget API ===== */ int64_t ray_mem_budget(void) { From 2478ab4ded52bce17b6785e507178e6037d1bd9a Mon Sep 17 00:00:00 2001 From: Aspirational Date: Tue, 21 Apr 2026 16:04:52 +0300 Subject: [PATCH 26/51] feat(runtime): harden sym_load ordering, error surfacing, and budget check Three hardening changes to ray_runtime_create_with_sym: 1. Load order: set __VM and mem_budget BEFORE ray_sym_load so file I/O errors surface via ray_error() (previously silently dropped because ray_error() checks __VM && fmt and __VM was NULL) and allocations during load are bounded by mem_budget. 2. 
Error surfacing: new ray_runtime_create_with_sym_err(path, &out_err) returns the sym_load result so callers can distinguish "file absent" (RAY_OK), "corrupt/incompatible" (RAY_ERR_CORRUPT), and "I/O failure" (RAY_ERR_IO) and decide recovery policy. 3. Pre-flight size check: stat() the sym file before ray_col_load and reject if it exceeds mem_budget / 2, preventing a malicious or corrupted file from OOMing the process before the allocator's budget guards engage. The original ray_runtime_create_with_sym wraps the _err variant with NULL out-param for callers that don't need the diagnostic. 693/693 tests pass. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/runtime.c | 71 +++++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/src/core/runtime.c b/src/core/runtime.c index 0babb7d8..bd8b8a18 100644 --- a/src/core/runtime.c +++ b/src/core/runtime.c @@ -27,6 +27,7 @@ #include #include #include +#include <sys/stat.h> #ifdef RAY_OS_WINDOWS #include #else @@ -157,24 +158,17 @@ void ray_error_clear(void) { /* ===== Lifecycle ===== */ -static ray_runtime_t* runtime_create_impl(const char* sym_path) { +static ray_runtime_t* runtime_create_impl(const char* sym_path, + ray_err_t* out_sym_err) { + if (out_sym_err) *out_sym_err = RAY_OK; + /* Init subsystems */ ray_heap_init(); ray_sym_init(); - /* Load persisted symbol table BEFORE any interning (builtins, env). - * This ensures symbol IDs from prior sessions keep their slots, - * and new builtins get appended with fresh IDs. NULL means skip. */ - if (sym_path) { - ray_err_t sym_err = ray_sym_load(sym_path); - if (sym_err != RAY_OK && sym_err != RAY_ERR_CORRUPT) { - /* I/O error — surface it; caller decides policy */ - } - /* RAY_ERR_CORRUPT is non-fatal: proceed with empty table, - * caller rebuilds from authoritative source.
*/ - } - - /* Allocate runtime via system allocator */ + /* Allocate runtime and set __VM + mem_budget BEFORE any file I/O so + * that ray_error() has a live VM to record diagnostics against and + * allocations are bounded by the budget. */ ray_runtime_t* rt = (ray_runtime_t*)ray_sys_alloc(sizeof(ray_runtime_t)); if (!rt) return NULL; memset(rt, 0, sizeof(*rt)); @@ -206,22 +200,59 @@ static ray_runtime_t* runtime_create_impl(const char* sym_path) { rt->mem_budget = (int64_t)(4ULL << 30); #endif - /* Init language (env + builtins) — must be after __VM is set. - * Builtins intern their names; with sym_path loaded above, those - * names land after any persisted slots. */ + /* __RUNTIME must be visible before ray_sym_load so mem_budget checks + * and ray_error() both operate against the live runtime. */ + __RUNTIME = rt; + + /* Load persisted symbol table BEFORE ray_lang_init interns builtins. + * Ordering: __VM + mem_budget are live so file I/O errors surface via + * ray_error() and allocations are budget-bounded. Still before + * ray_lang_init so persisted user symbol IDs keep their slots and + * builtins append afterwards. */ + if (sym_path) { + /* Pre-flight size check: reject files that would blow past the + * memory budget before ever touching ray_col_load. */ + struct stat st; + if (stat(sym_path, &st) == 0) { + /* Allow the sym file itself plus some working headroom (2x). + * A well-formed sym file is a list of interned strings; the + * in-memory footprint is bounded by file size within a small + * constant factor. */ + if (st.st_size > 0 && + (int64_t)st.st_size > rt->mem_budget / 2) { + if (out_sym_err) *out_sym_err = RAY_ERR_OOM; + /* Continue startup with empty sym table; caller decides + * whether to treat this as fatal. */ + } else { + ray_err_t sym_err = ray_sym_load(sym_path); + if (out_sym_err) *out_sym_err = sym_err; + /* RAY_ERR_CORRUPT and I/O errors are non-fatal here: + * caller inspects out_sym_err to decide recovery. 
*/ + } + } + /* ENOENT and other stat failures: leave out_sym_err = RAY_OK; + * an absent sym file is the normal first-run case. */ + } + + /* Init language (env + builtins) — must be after __VM is set and + * after sym_load so persisted user IDs keep their slots. */ ray_lang_init(); - __RUNTIME = rt; return rt; } ray_runtime_t* ray_runtime_create(int argc, char** argv) { (void)argc; (void)argv; - return runtime_create_impl(NULL); + return runtime_create_impl(NULL, NULL); } ray_runtime_t* ray_runtime_create_with_sym(const char* sym_path) { - return runtime_create_impl(sym_path); + return runtime_create_impl(sym_path, NULL); +} + +ray_runtime_t* ray_runtime_create_with_sym_err(const char* sym_path, + ray_err_t* out_sym_err) { + return runtime_create_impl(sym_path, out_sym_err); } /* ===== Memory Budget API ===== */ From ed48c3e362f1ec9fd65e126e8d5948040580bbb1 Mon Sep 17 00:00:00 2001 From: Anton Kundenko Date: Wed, 22 Apr 2026 19:02:44 +0200 Subject: [PATCH 27/51] Update src/ops/datalog.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/ops/datalog.c | 72 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 17 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 11522a15..b6c2013b 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -3329,42 +3329,80 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { * matching arity. SYM columns are converted to I64 (same treatment as * the primary `eav` table). * + * Aggregate sources are handled too (`DL_AGG` uses `agg_pred`). * The built-in synthetic "__query" / "eav" names are skipped. 
*/ for (int ri = 0; ri < prog->n_rules; ri++) { dl_rule_t* rr = &prog->rules[ri]; for (int bi = 0; bi < rr->n_body; bi++) { dl_body_t* bd = &rr->body[bi]; - if (bd->type != DL_POS && bd->type != DL_NEG) continue; - if (bd->pred[0] == '\0') continue; - if (strcmp(bd->pred, "eav") == 0) continue; - if (dl_find_rel(prog, bd->pred) >= 0) continue; + const char* pred_name = NULL; + int pred_arity = 0; + + if (bd->type == DL_POS || bd->type == DL_NEG) { + pred_name = bd->pred; + pred_arity = bd->arity; + } else if (bd->type == DL_AGG) { + pred_name = bd->agg_pred; + pred_arity = bd->arity; + } else { + continue; + } + + if (!pred_name || pred_name[0] == '\0') continue; + if (strcmp(pred_name, "eav") == 0) continue; + if (dl_find_rel(prog, pred_name) >= 0) continue; - int64_t env_sym = ray_sym_intern(bd->pred, strlen(bd->pred)); + int64_t env_sym = ray_sym_intern(pred_name, strlen(pred_name)); ray_t* env_val = ray_env_get(env_sym); if (!env_val || env_val->type != RAY_TABLE) continue; int64_t ncols = ray_table_ncols(env_val); - if (ncols != bd->arity) continue; + if (ncols != pred_arity) continue; int64_t nrows_env = ray_table_nrows(env_val); - ray_t* clean = ray_table_new(bd->arity); - for (int c = 0; c < bd->arity; c++) { + ray_t* clean = ray_table_new(pred_arity); + if (!clean || RAY_IS_ERR(clean)) { + if (clean) ray_release(clean); + dl_program_free(prog); + ray_release(db); + return ray_error("memory", "query: failed to create env-backed EDB table"); + } + for (int c = 0; c < pred_arity; c++) { ray_t* col = ray_table_get_col_idx(env_val, c); + ray_t* next_clean; if (!col) continue; if (col->type == RAY_SYM) { ray_t* i64col = ray_vec_new(RAY_I64, nrows_env); - if (i64col && !RAY_IS_ERR(i64col)) { - i64col->len = nrows_env; - int64_t* d = (int64_t*)ray_data(i64col); - for (int64_t r = 0; r < nrows_env; r++) - d[r] = ray_read_sym(ray_data(col), r, col->type, col->attrs); - clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), i64col); - ray_release(i64col); + 
if (!i64col || RAY_IS_ERR(i64col)) { + if (i64col) ray_release(i64col); + ray_release(clean); + dl_program_free(prog); + ray_release(db); + return ray_error("memory", "query: failed to convert env-backed SYM column"); } + i64col->len = nrows_env; + int64_t* d = (int64_t*)ray_data(i64col); + for (int64_t r = 0; r < nrows_env; r++) + d[r] = ray_read_sym(ray_data(col), r, col->type, col->attrs); + next_clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), i64col); + ray_release(i64col); } else { - clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), col); + next_clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), col); + } + if (!next_clean || RAY_IS_ERR(next_clean)) { + if (next_clean) ray_release(next_clean); + ray_release(clean); + dl_program_free(prog); + ray_release(db); + return ray_error("memory", "query: failed to build env-backed EDB table"); } + clean = next_clean; + } + if (dl_add_edb(prog, pred_name, clean, pred_arity) != 0) { + ray_release(clean); + dl_program_free(prog); + ray_release(db); + return ray_error("domain", "query: failed to register env-backed EDB table"); } - dl_add_edb(prog, bd->pred, clean, bd->arity); ray_release(clean); } } From 5696f04fb6c9a5ff4950c4bb21486305b6400e3d Mon Sep 17 00:00:00 2001 From: Anton Kundenko Date: Wed, 22 Apr 2026 19:37:09 +0200 Subject: [PATCH 28/51] Update src/ops/datalog.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/ops/datalog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index b6c2013b..7e62fe8c 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -3343,7 +3343,7 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { pred_arity = bd->arity; } else if (bd->type == DL_AGG) { pred_name = bd->agg_pred; - pred_arity = bd->arity; + pred_arity = bd->agg_arity; } else { continue; } From a75a6a827c7cf79f7d94e52a95ec7149a11fb282 Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 22 Apr 2026 
20:03:30 +0200 Subject: [PATCH 29/51] fix(datalog): env-bound EDB auto-register must accept any non-negative index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dl_add_edb returns the newly-allocated relation index on success (0, 1, 2, …) or -1 on failure. The auto-register branch in ray_query_fn compared the return to != 0, so every env-bound EDB beyond the primary EAV (which always occupies idx 0) was misclassified as a failure and the whole query aborted with "query: failed to register env-backed EDB table". The new test_env_bound_edb_auto_register test, which is the first coverage for this feature, triggers the bug on every run (all four CI jobs were red on it). Switching to < 0 matches the return contract; 693/693 tests pass under both debug (ASan+UBSan) and release builds. --- src/ops/datalog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 7e62fe8c..bc3a63a8 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -3397,7 +3397,7 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { } clean = next_clean; } - if (dl_add_edb(prog, pred_name, clean, pred_arity) != 0) { + if (dl_add_edb(prog, pred_name, clean, pred_arity) < 0) { ray_release(clean); dl_program_free(prog); ray_release(db); From 2a2b8c252178973f4ab4a778c72708fa7031be0d Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 22 Apr 2026 21:34:21 +0200 Subject: [PATCH 30/51] =?UTF-8?q?fix(datalog):=20address=20Copilot=20revie?= =?UTF-8?q?w=20=E2=80=94=20sym=20width,=20agg=20bounds,=20f64=20scalars?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three correctness fixes in src/ops/datalog.c flagged by the Copilot review on PR #7, each with a matching regression test: * dl_project (line ~1001): preserve SYM index width on the projected column. 
ray_vec_new(RAY_SYM, …) always returns a W64 vec, so memcpy'ing with the source's narrower element size left the upper bytes of each W64 slot uninitialized and produced bogus sym IDs. Route SYM columns through ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, …) so dst's stride matches src. * Grouped aggregate compile (line ~1269): bounds-check each group_key_col and agg_value_col against src_rel->arity before indexing col_names[]. An out-of-range key would OOB-read; an out-of-range value was silently clamped to column 0, producing correct-looking but wrong results. Also ensure ray_graph_free(gg) runs on every error return after gg is allocated (previously leaked on the out-of-range path). * Scalar SUM/MIN/MAX/AVG (line ~1347): accept RAY_F64 value columns. The prior code treated any non-RAY_I64 as "return 0", so float-valued sources silently produced wrong results. New behaviour: i64 in → i64 out, f64 in → f64 out (AVG still always emits f64); non-numeric sources are rejected rather than zero'd. Tests added: * /datalog/agg_scalar_f64 — sum/avg over an f64 value column * /datalog/agg_grouped_key_col_oor — OOR group-key col rejected cleanly * /datalog/project_narrow_sym — rule passes a W8 SYM column through make test (ASan+UBSan) and make release both green at 696/696. 
--- src/ops/datalog.c | 102 ++++++++++++++++++++++-------- test/test_datalog.c | 147 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+), 25 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index bc3a63a8..38fde1a1 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1009,11 +1009,15 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, if (src_idx >= 0) { ray_t* src = ray_table_get_col_idx(tbl, src_idx); if (!src) continue; - ray_t* dst = ray_vec_new(src->type, nrows); + /* Preserve SYM index width: ray_vec_new(RAY_SYM, …) would always + * produce a W64 vec, so memcpy'ing with the source's narrower + * element size would leave the upper bytes of each W64 slot + * uninitialized. ray_sym_vec_new mirrors src's attrs width. */ + ray_t* dst = (src->type == RAY_SYM) + ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, nrows) + : ray_vec_new(src->type, nrows); if (!dst || RAY_IS_ERR(dst)) continue; dst->len = nrows; - /* Use element size from the source vec so SYM with any width, - * I64, and F64 all copy correctly. */ uint8_t esz = ray_sym_elem_size(src->type, src->attrs); if (esz == 0) { ray_release(dst); continue; } memcpy(ray_data(dst), ray_data(src), (size_t)nrows * (size_t)esz); @@ -1269,15 +1273,27 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, ray_op_t* keys_ops[DL_AGG_MAX_KEYS]; for (int i = 0; i < nk; i++) { int kc = body->agg_group_key_cols[i]; - if (kc < 0 || kc >= src_rel->arity) { ray_release(accum); return NULL; } + if (kc < 0 || kc >= src_rel->arity) { + ray_graph_free(gg); + ray_release(accum); + return NULL; + } int64_t sym = src_rel->col_names[kc]; ray_t* s = ray_sym_str(sym); keys_ops[i] = ray_scan(gg, ray_str_ptr(s)); } /* Agg input: value column (for COUNT we still pass a column; any - * column works since COUNT only counts rows). */ + * column works since COUNT only counts rows). 
Must be bounds- + * checked — silently clamping to 0 would compute a valid-looking + * but wrong result over an unrelated column. */ int value_col = body->agg_value_col; + if (body->agg_op != DL_AGG_COUNT && + (value_col < 0 || value_col >= src_rel->arity)) { + ray_graph_free(gg); + ray_release(accum); + return NULL; + } if (value_col < 0 || value_col >= src_rel->arity) value_col = 0; ray_t* vs = ray_sym_str(src_rel->col_names[value_col]); ray_op_t* agg_in = ray_scan(gg, ray_str_ptr(vs)); @@ -1344,12 +1360,16 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, return NULL; } - int64_t result = 0; - double favg = 0.0; - bool is_avg = (body->agg_op == DL_AGG_AVG); + int64_t result_i = 0; + double result_f = 0.0; + bool is_avg = (body->agg_op == DL_AGG_AVG); + /* Float promotion: AVG always emits f64; SUM/MIN/MAX track their + * source column type (i64 in -> i64 out; f64 in -> f64 out). + * COUNT is always i64. */ + bool is_float = is_avg; switch (body->agg_op) { case DL_AGG_COUNT: - result = src_nrows; + result_i = src_nrows; break; case DL_AGG_SUM: case DL_AGG_MIN: @@ -1358,32 +1378,64 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, if (src_nrows > 0) { ray_t* val_col = ray_table_get_col_idx(src_table, body->agg_value_col); - if (!val_col || val_col->type != RAY_I64) { - result = 0; - } else { + if (!val_col) { + ray_release(accum); + return NULL; + } + if (val_col->type == RAY_I64) { int64_t* vd = (int64_t*)ray_data(val_col); if (body->agg_op == DL_AGG_SUM) { - result = 0; + result_i = 0; for (int64_t i = 0; i < src_nrows; i++) - result += vd[i]; + result_i += vd[i]; } else if (body->agg_op == DL_AGG_MIN) { - result = vd[0]; + result_i = vd[0]; for (int64_t i = 1; i < src_nrows; i++) { - if (vd[i] < result) - result = vd[i]; + if (vd[i] < result_i) + result_i = vd[i]; } } else if (body->agg_op == DL_AGG_MAX) { - result = vd[0]; + result_i = vd[0]; for (int64_t i = 1; i < src_nrows; i++) { - if (vd[i] > result) - result = vd[i]; + if 
(vd[i] > result_i) + result_i = vd[i]; } } else { /* DL_AGG_AVG */ int64_t acc = 0; for (int64_t i = 0; i < src_nrows; i++) acc += vd[i]; - favg = (double)acc / (double)src_nrows; + result_f = (double)acc / (double)src_nrows; + } + } else if (val_col->type == RAY_F64) { + is_float = true; /* SUM/MIN/MAX promote to f64 */ + double* vd = (double*)ray_data(val_col); + if (body->agg_op == DL_AGG_SUM) { + result_f = 0.0; + for (int64_t i = 0; i < src_nrows; i++) + result_f += vd[i]; + } else if (body->agg_op == DL_AGG_MIN) { + result_f = vd[0]; + for (int64_t i = 1; i < src_nrows; i++) { + if (vd[i] < result_f) + result_f = vd[i]; + } + } else if (body->agg_op == DL_AGG_MAX) { + result_f = vd[0]; + for (int64_t i = 1; i < src_nrows; i++) { + if (vd[i] > result_f) + result_f = vd[i]; + } + } else { /* DL_AGG_AVG */ + double acc = 0.0; + for (int64_t i = 0; i < src_nrows; i++) + acc += vd[i]; + result_f = acc / (double)src_nrows; } + } else { + /* Non-numeric source column — reject loudly rather than + * silently returning zero. */ + ray_release(accum); + return NULL; } } break; @@ -1394,16 +1446,16 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, int64_t nrows = ray_table_nrows(accum); if (nrows == 0) break; - ray_t* new_col = ray_vec_new(is_avg ? RAY_F64 : RAY_I64, nrows); + ray_t* new_col = ray_vec_new(is_float ? 
RAY_F64 : RAY_I64, nrows); if (!new_col || RAY_IS_ERR(new_col)) break; new_col->len = nrows; - if (is_avg) { + if (is_float) { double* nd = (double*)ray_data(new_col); - for (int64_t r = 0; r < nrows; r++) nd[r] = favg; + for (int64_t r = 0; r < nrows; r++) nd[r] = result_f; } else { int64_t* nd = (int64_t*)ray_data(new_col); - for (int64_t r = 0; r < nrows; r++) nd[r] = result; + for (int64_t r = 0; r < nrows; r++) nd[r] = result_i; } int new_col_idx = (int)ray_table_ncols(accum); diff --git a/test/test_datalog.c b/test/test_datalog.c index a167ff34..3ac18cb2 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1529,6 +1529,150 @@ static MunitResult test_agg_avg_grouped(const void* params, void* fixture) { return MUNIT_OK; } +/* Scalar SUM/AVG over an RAY_F64 value column. + * Regression: the scalar aggregate path previously accepted only RAY_I64 + * columns and silently returned 0 for RAY_F64, producing valid-looking but + * wrong results. */ +static MunitResult test_agg_scalar_f64(const void* params, void* fixture) { + (void)params; (void)fixture; + double vs[] = {1.5, 2.5, 3.0, 4.0}; /* sum = 11.0, avg = 2.75 */ + ray_t* vcol = ray_vec_from_raw(RAY_F64, vs, 4); + ray_t* tbl = ray_table_new(1); + tbl = ray_table_add_col(tbl, ray_sym_intern("m__c0", 5), vcol); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "m", tbl, 1); + + /* total_sum(?s) :- (sum ?s m 0). Target var 0, value col 0. */ + dl_rule_t rs; dl_rule_init(&rs, "total_sum", 1); + dl_rule_head_var(&rs, 0, 0); + dl_rule_add_agg(&rs, DL_AGG_SUM, 0, "m", 1, 0); + rs.n_vars = 1; + dl_add_rule(prog, &rs); + + /* total_avg(?a) :- (avg ?a m 0). 
*/ + dl_rule_t ra; dl_rule_init(&ra, "total_avg", 1); + dl_rule_head_var(&ra, 0, 0); + dl_rule_add_agg(&ra, DL_AGG_AVG, 0, "m", 1, 0); + ra.n_vars = 1; + dl_add_rule(prog, &ra); + + munit_assert_int(dl_eval(prog), ==, 0); + + ray_t* s_out = dl_query(prog, "total_sum"); + munit_assert_ptr_not_null(s_out); + munit_assert_int((int)ray_table_nrows(s_out), ==, 1); + ray_t* s_col = ray_table_get_col_idx(s_out, 0); + munit_assert_int(s_col->type, ==, RAY_F64); + munit_assert_double_equal(((double*)ray_data(s_col))[0], 11.0, 4); + + ray_t* a_out = dl_query(prog, "total_avg"); + munit_assert_ptr_not_null(a_out); + ray_t* a_col = ray_table_get_col_idx(a_out, 0); + munit_assert_int(a_col->type, ==, RAY_F64); + munit_assert_double_equal(((double*)ray_data(a_col))[0], 2.75, 4); + + dl_program_free(prog); + ray_release(tbl); ray_release(vcol); + return MUNIT_OK; +} + +/* Grouped aggregate with an out-of-range group-key column must be rejected + * cleanly (no crash, no bogus rows). Regression: the grouped path indexed + * src_rel->col_names[key_col] without bounds-checking. 
*/ +static MunitResult test_agg_grouped_key_col_oor(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t us[] = {1, 2, 1}, ws[] = {10, 20, 30}; + ray_t* uc = ray_vec_from_raw(RAY_I64, us, 3); + ray_t* wc = ray_vec_from_raw(RAY_I64, ws, 3); + ray_t* tbl = ray_table_new(2); + tbl = ray_table_add_col(tbl, ray_sym_intern("wbu__c0", 7), uc); + tbl = ray_table_add_col(tbl, ray_sym_intern("wbu__c1", 7), wc); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "wbu", tbl, 2); + + dl_rule_t r; dl_rule_init(&r, "bad_group", 2); + dl_rule_head_var(&r, 0, 0); + dl_rule_head_var(&r, 1, 1); + int idx = dl_rule_add_agg(&r, DL_AGG_SUM, 1, "wbu", 2, 1); + int key_vars[] = { 0 }; + int key_cols[] = { 99 }; /* out-of-range: wbu has arity 2 */ + munit_assert_int(dl_rule_agg_set_group(&r, idx, key_vars, key_cols, 1), ==, 0); + r.n_vars = 2; + dl_add_rule(prog, &r); + + /* dl_eval must not crash; compile rejects the rule, producing 0 rows. */ + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "bad_group"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 0); + + dl_program_free(prog); + ray_release(tbl); ray_release(uc); ray_release(wc); + return MUNIT_OK; +} + +/* A rule that passes a narrow-width RAY_SYM body column through a head var + * must produce a correct SYM column. Regression: dl_project allocated the + * destination with ray_vec_new(RAY_SYM, …), which always creates a W64 vec, + * then memcpy'd using the source's narrower element size — leaving the upper + * bytes of each W64 slot uninitialized and producing bogus sym IDs when read + * back. */ +static MunitResult test_project_narrow_sym(const void* params, void* fixture) { + (void)params; (void)fixture; + + /* Build a W8-width SYM column with 3 distinct sym IDs (all fit in one byte). 
*/ + int64_t ks[] = {7, 11, 13}; + int64_t tag_syms[] = { + ray_sym_intern("a", 1), + ray_sym_intern("b", 1), + ray_sym_intern("c", 1), + }; + /* Force narrow W8 storage — the fix path only matters when src is narrower + * than the default W64 ray_vec_new would pick. */ + ray_t* tcol = ray_sym_vec_new(RAY_SYM_W8, 3); + munit_assert_ptr_not_null(tcol); + tcol->len = 3; + for (int i = 0; i < 3; i++) + ray_write_sym(ray_data(tcol), i, tag_syms[i], tcol->type, tcol->attrs); + ray_t* kcol = ray_vec_from_raw(RAY_I64, ks, 3); + ray_t* tbl = ray_table_new(2); + tbl = ray_table_add_col(tbl, ray_sym_intern("e__c0", 5), kcol); + tbl = ray_table_add_col(tbl, ray_sym_intern("e__c1", 5), tcol); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "e", tbl, 2); + + /* out(?k, ?t) :- (e ?k ?t) — passes the narrow-SYM column through. */ + dl_rule_t r; dl_rule_init(&r, "out", 2); + dl_rule_head_var(&r, 0, 0); + dl_rule_head_var(&r, 1, 1); + int bidx = dl_rule_add_atom(&r, "e", 2); + dl_body_set_var(&r, bidx, 0, 0); + dl_body_set_var(&r, bidx, 1, 1); + r.n_vars = 2; + dl_add_rule(prog, &r); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "out"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 3); + + ray_t* ok = ray_table_get_col_idx(out, 0); + ray_t* ot = ray_table_get_col_idx(out, 1); + munit_assert_int(ot->type, ==, RAY_SYM); + for (int i = 0; i < 3; i++) { + munit_assert_int(((int64_t*)ray_data(ok))[i], ==, ks[i]); + int64_t got = ray_read_sym(ray_data(ot), i, ot->type, ot->attrs); + munit_assert_int((int)got, ==, (int)tag_syms[i]); + } + + dl_program_free(prog); + ray_release(tbl); ray_release(kcol); ray_release(tcol); + return MUNIT_OK; +} + /* Auto-register env-bound EDB: bind a table as "extra" in the ray env, * then run a query whose rule body references "extra" without explicit * dl_add_edb — ray_query_fn should auto-discover it. 
@@ -1627,6 +1771,9 @@ static MunitTest datalog_tests[] = { { "/rule_head_const_surface_syntax", test_rule_head_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/rule_body_const_surface_syntax", test_rule_body_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/env_bound_edb_auto_register", test_env_bound_edb_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/agg_scalar_f64", test_agg_scalar_f64, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_grouped_key_col_oor", test_agg_grouped_key_col_oor, datalog_setup, datalog_teardown, 0, NULL }, + { "/project_narrow_sym", test_project_narrow_sym, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From b2de6551aa2a0bc433c558ad13f266aa024d39e0 Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 22 Apr 2026 21:44:14 +0200 Subject: [PATCH 31/51] fix(datalog): empty-source F64 SUM emits RAY_F64 identity, not RAY_I64 0 Follow-up to the scalar-aggregate F64 support commit: `is_float` was only flipped inside the `src_nrows > 0` branch, so an empty-source SUM over a RAY_F64 column skipped the promotion and allocated a RAY_I64 result column holding 0 instead of a RAY_F64 column holding 0.0. Inspect the value column's type once up front, before the row-count split, so the empty identity is emitted in the correct type. Also reject non-numeric source columns at the same point so an empty, unsupported-type source can't silently fall through to the i64 path either. Regression test /datalog/agg_scalar_f64_sum_empty: SUM over an empty RAY_F64 column must produce a 1-row RAY_F64 result with value 0.0. 697/697 pass on both debug (ASan+UBSan) and release. 
--- src/ops/datalog.c | 19 ++++++++++++++++++- test/test_datalog.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 38fde1a1..72feca10 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1365,8 +1365,25 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, bool is_avg = (body->agg_op == DL_AGG_AVG); /* Float promotion: AVG always emits f64; SUM/MIN/MAX track their * source column type (i64 in -> i64 out; f64 in -> f64 out). - * COUNT is always i64. */ + * COUNT is always i64. For empty SUM, we still need to inspect + * the column type so the identity (0 / 0.0) is emitted in the + * correct result type. */ bool is_float = is_avg; + if (body->agg_op == DL_AGG_SUM || + body->agg_op == DL_AGG_MIN || + body->agg_op == DL_AGG_MAX || + body->agg_op == DL_AGG_AVG) { + ray_t* vc0 = ray_table_get_col_idx(src_table, body->agg_value_col); + if (vc0) { + if (vc0->type == RAY_F64) { + is_float = true; + } else if (vc0->type != RAY_I64) { + /* Non-numeric source: reject regardless of row count. */ + ray_release(accum); + return NULL; + } + } + } switch (body->agg_op) { case DL_AGG_COUNT: result_i = src_nrows; diff --git a/test/test_datalog.c b/test/test_datalog.c index 3ac18cb2..1c3134ee 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1577,6 +1577,40 @@ static MunitResult test_agg_scalar_f64(const void* params, void* fixture) { return MUNIT_OK; } +/* Empty-source SUM over an RAY_F64 column must still emit a RAY_F64 result + * column (value 0.0), not RAY_I64 0. Regression from the scalar-agg F64 + * fix: is_float was only flipped inside the src_nrows > 0 branch, so an + * empty f64 SUM fell through to the i64 path. 
*/ +static MunitResult test_agg_scalar_f64_sum_empty(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* vcol = ray_vec_new(RAY_F64, 0); + munit_assert_ptr_not_null(vcol); + vcol->len = 0; + ray_t* tbl = ray_table_new(1); + tbl = ray_table_add_col(tbl, ray_sym_intern("m__c0", 5), vcol); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "m", tbl, 1); + + dl_rule_t rs; dl_rule_init(&rs, "total_sum", 1); + dl_rule_head_var(&rs, 0, 0); + dl_rule_add_agg(&rs, DL_AGG_SUM, 0, "m", 1, 0); + rs.n_vars = 1; + dl_add_rule(prog, &rs); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "total_sum"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 1); + ray_t* sc = ray_table_get_col_idx(out, 0); + munit_assert_int(sc->type, ==, RAY_F64); + munit_assert_double_equal(((double*)ray_data(sc))[0], 0.0, 4); + + dl_program_free(prog); + ray_release(tbl); ray_release(vcol); + return MUNIT_OK; +} + /* Grouped aggregate with an out-of-range group-key column must be rejected * cleanly (no crash, no bogus rows). Regression: the grouped path indexed * src_rel->col_names[key_col] without bounds-checking. 
*/ @@ -1772,6 +1806,7 @@ static MunitTest datalog_tests[] = { { "/rule_body_const_surface_syntax", test_rule_body_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/env_bound_edb_auto_register", test_env_bound_edb_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_scalar_f64", test_agg_scalar_f64, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_scalar_f64_sum_empty", test_agg_scalar_f64_sum_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_grouped_key_col_oor", test_agg_grouped_key_col_oor, datalog_setup, datalog_teardown, 0, NULL }, { "/project_narrow_sym", test_project_narrow_sym, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, From 6d0929a22f1d0db294f07898280194da48664be9 Mon Sep 17 00:00:00 2001 From: Anton Date: Wed, 22 Apr 2026 21:54:17 +0200 Subject: [PATCH 32/51] fix(datalog): scalar aggregate validates value col index before row-count split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The grouped path bounds-checked agg_value_col, but the scalar path's check lived inside the `src_nrows > 0` branch — so an empty source plus an out-of-range value col silently emitted the SUM identity (0 / 0.0) against an invalid slot instead of rejecting the rule. Move the value-column bounds check (for SUM/MIN/MAX/AVG) above the empty-source early return so it fires regardless of row count. COUNT still skips this check since it doesn't read any value column. Regression test /datalog/agg_scalar_value_col_oor_empty: empty i64 source + value_col=42 against an arity-1 relation must yield 0 rows, not 1 row holding SUM=0. 698/698 pass on debug and release. 
--- src/ops/datalog.c | 23 ++++++++++++++++++----- test/test_datalog.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 72feca10..845c24c7 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1346,11 +1346,27 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, ray_release(accum); return NULL; } - ray_t* src_table = prog->rels[src_idx].table; + dl_rel_t* src_rel_s = &prog->rels[src_idx]; + ray_t* src_table = src_rel_s->table; int64_t src_nrows = (src_table && !RAY_IS_ERR(src_table)) ? ray_table_nrows(src_table) : 0; + /* Bounds-check value column up front for every value-taking op + * (SUM/MIN/MAX/AVG). Must happen before the empty-source early + * returns below, otherwise an out-of-range index on an empty + * source would silently emit the SUM identity 0 / 0.0. */ + bool need_value_col = (body->agg_op == DL_AGG_SUM + || body->agg_op == DL_AGG_MIN + || body->agg_op == DL_AGG_MAX + || body->agg_op == DL_AGG_AVG); + if (need_value_col && + (body->agg_value_col < 0 || + body->agg_value_col >= src_rel_s->arity)) { + ray_release(accum); + return NULL; + } + if (src_nrows == 0 && (body->agg_op == DL_AGG_MIN || body->agg_op == DL_AGG_MAX || body->agg_op == DL_AGG_AVG)) { @@ -1369,10 +1385,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, * the column type so the identity (0 / 0.0) is emitted in the * correct result type. 
*/ bool is_float = is_avg; - if (body->agg_op == DL_AGG_SUM || - body->agg_op == DL_AGG_MIN || - body->agg_op == DL_AGG_MAX || - body->agg_op == DL_AGG_AVG) { + if (need_value_col) { ray_t* vc0 = ray_table_get_col_idx(src_table, body->agg_value_col); if (vc0) { if (vc0->type == RAY_F64) { diff --git a/test/test_datalog.c b/test/test_datalog.c index 1c3134ee..201ddfc7 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1611,6 +1611,38 @@ static MunitResult test_agg_scalar_f64_sum_empty(const void* params, void* fixtu return MUNIT_OK; } +/* Scalar SUM with an out-of-range value column must be rejected even when + * the source is empty. Regression: the bounds check lived inside the + * src_nrows > 0 branch, so an empty source bypassed it and silently emitted + * the SUM identity (0 or 0.0) against an invalid column index. */ +static MunitResult test_agg_scalar_value_col_oor_empty(const void* params, void* fixture) { + (void)params; (void)fixture; + ray_t* vcol = ray_vec_new(RAY_I64, 0); + munit_assert_ptr_not_null(vcol); + vcol->len = 0; + ray_t* tbl = ray_table_new(1); + tbl = ray_table_add_col(tbl, ray_sym_intern("m__c0", 5), vcol); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "m", tbl, 1); + + dl_rule_t rs; dl_rule_init(&rs, "bad_sum", 1); + dl_rule_head_var(&rs, 0, 0); + /* value_col=42 is out of range for the arity-1 "m" relation. */ + dl_rule_add_agg(&rs, DL_AGG_SUM, 0, "m", 1, 42); + rs.n_vars = 1; + dl_add_rule(prog, &rs); + + munit_assert_int(dl_eval(prog), ==, 0); + ray_t* out = dl_query(prog, "bad_sum"); + munit_assert_ptr_not_null(out); + munit_assert_int((int)ray_table_nrows(out), ==, 0); + + dl_program_free(prog); + ray_release(tbl); ray_release(vcol); + return MUNIT_OK; +} + /* Grouped aggregate with an out-of-range group-key column must be rejected * cleanly (no crash, no bogus rows). Regression: the grouped path indexed * src_rel->col_names[key_col] without bounds-checking. 
*/ @@ -1807,6 +1839,7 @@ static MunitTest datalog_tests[] = { { "/env_bound_edb_auto_register", test_env_bound_edb_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_scalar_f64", test_agg_scalar_f64, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_scalar_f64_sum_empty", test_agg_scalar_f64_sum_empty, datalog_setup, datalog_teardown, 0, NULL }, + { "/agg_scalar_value_col_oor_empty", test_agg_scalar_value_col_oor_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_grouped_key_col_oor", test_agg_grouped_key_col_oor, datalog_setup, datalog_teardown, 0, NULL }, { "/project_narrow_sym", test_project_narrow_sym, datalog_setup, datalog_teardown, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, From ecdb0d909a96475e432058e2a9784856396f9fc3 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 10:36:30 +0200 Subject: [PATCH 33/51] fix(datalog): propagate dl_project errors; unknown-arity agg sentinel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up fixes from the latest Copilot review round on PR #7: * dl_project (src/ops/datalog.c): allocation failures used to `continue` past the failed slot, leaking the RAY_ERROR object and producing a derived table with silently-missing columns. Each error path now releases any in-flight allocations and returns an error ray_t so the caller can abort rule eval cleanly. The existing success path is unchanged. * Aggregate parser + env auto-register: the parser hardcoded pred_arity=1 when `prog` was NULL (surface syntax, globals), so env-bound source tables with any other arity were silently skipped during query-time auto-register. Use 0 as an "unknown at parse time" sentinel and let the auto-register resolve arity from the env-bound table's column count. Compile already looks arity up via dl_find_rel, so no compile-side change is needed. 
Regression test /datalog/env_bound_agg_auto_register exercises (sum ?s salaries 1) over a 2-column env-bound table via the (rules …) clause; the pre-fix code would reject the auto-register on arity mismatch. 699/699 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 54 ++++++++++++++++++++++++++++++++++++++------- test/test_datalog.c | 50 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 8 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 845c24c7..2cd7daf8 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1004,11 +1004,16 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, if (!tbl || RAY_IS_ERR(tbl)) return tbl; int64_t nrows = ray_table_nrows(tbl); ray_t* out = ray_table_new(n_out); + if (!out || RAY_IS_ERR(out)) + return out ? out : ray_error("memory", "dl_project: table_new"); for (int c = 0; c < n_out; c++) { int src_idx = col_indices[c]; if (src_idx >= 0) { ray_t* src = ray_table_get_col_idx(tbl, src_idx); - if (!src) continue; + if (!src) { + ray_release(out); + return ray_error("domain", "dl_project: source column missing"); + } /* Preserve SYM index width: ray_vec_new(RAY_SYM, …) would always * produce a W64 vec, so memcpy'ing with the source's narrower * element size would leave the upper bytes of each W64 slot @@ -1016,21 +1021,44 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, ray_t* dst = (src->type == RAY_SYM) ? 
ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, nrows) : ray_vec_new(src->type, nrows); - if (!dst || RAY_IS_ERR(dst)) continue; + if (!dst || RAY_IS_ERR(dst)) { + if (dst) ray_release(dst); + ray_release(out); + return ray_error("memory", "dl_project: vec_new"); + } dst->len = nrows; uint8_t esz = ray_sym_elem_size(src->type, src->attrs); - if (esz == 0) { ray_release(dst); continue; } + if (esz == 0) { + ray_release(dst); + ray_release(out); + return ray_error("type", "dl_project: unsupported column type"); + } memcpy(ray_data(dst), ray_data(src), (size_t)nrows * (size_t)esz); - out = ray_table_add_col(out, head_rel->col_names[c], dst); + ray_t* next = ray_table_add_col(out, head_rel->col_names[c], dst); ray_release(dst); + /* ray_table_add_col consumes `out` via ray_cow on success. On + * error it returns a fresh RAY_ERR object and `out` is no longer + * valid — surface the error to the caller as-is. */ + if (!next) return ray_error("memory", "dl_project: add_col"); + if (RAY_IS_ERR(next)) return next; + out = next; } else { /* Constant head slot: materialize an owned broadcast column. */ int8_t ctype = head_const_types ? head_const_types[c] : 0; - if (ctype == 0) continue; /* legacy/unset */ + if (ctype == 0) { + ray_release(out); + return ray_error("domain", "dl_project: unset head-const type"); + } ray_t* bcast = dl_broadcast_const_col(nrows, ctype, head_consts[c]); - if (!bcast || RAY_IS_ERR(bcast)) continue; - out = ray_table_add_col(out, head_rel->col_names[c], bcast); + if (!bcast || RAY_IS_ERR(bcast)) { + ray_release(out); + return bcast ? 
bcast : ray_error("memory", "dl_project: broadcast"); + } + ray_t* next = ray_table_add_col(out, head_rel->col_names[c], bcast); ray_release(bcast); + if (!next) return ray_error("memory", "dl_project: add_col"); + if (RAY_IS_ERR(next)) return next; + out = next; } } return out; @@ -2996,7 +3024,12 @@ static ray_t* dl_parse_body_clause(dl_rule_t* rule, ray_t* clause, return ray_error("type", "aggregate: cannot resolve predicate name"); const char* pred_name = ray_str_ptr(pred_sym); - int pred_arity = 1; + /* Record arity=0 as "unknown" when we can't resolve it against the + * program (prog=NULL or predicate not yet registered). The compiler + * and env auto-register treat 0 as a wildcard and resolve against the + * source relation at evaluation time. A hardcoded 1 would spuriously + * reject any env-bound table whose arity isn't 1. */ + int pred_arity = 0; if (prog) { int ri = dl_find_rel(prog, pred_name); if (ri >= 0) pred_arity = prog->rels[ri].arity; @@ -3438,6 +3471,11 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { ray_t* env_val = ray_env_get(env_sym); if (!env_val || env_val->type != RAY_TABLE) continue; int64_t ncols = ray_table_ncols(env_val); + /* pred_arity == 0 is a "not yet known" sentinel used when the + * aggregate parser couldn't resolve the source predicate's arity + * at parse time (prog=NULL, surface syntax). Resolve it from the + * env-bound table's column count now. */ + if (pred_arity == 0) pred_arity = (int)ncols; if (ncols != pred_arity) continue; int64_t nrows_env = ray_table_nrows(env_val); diff --git a/test/test_datalog.c b/test/test_datalog.c index 201ddfc7..a758f04d 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1795,6 +1795,55 @@ static MunitResult test_env_bound_edb_auto_register(const void* params, void* fi return MUNIT_OK; } +/* Auto-register env-bound EDB for an aggregate source whose arity isn't 1. 
+ * Regression: aggregate parsing used to hardcode pred_arity=1 when prog was + * NULL (surface syntax, globals), so env_val->ncols != 1 made the env auto- + * register skip the binding and the query later failed at compile time. */ +static MunitResult test_env_bound_agg_auto_register(const void* params, void* fixture) { + (void)params; (void)fixture; + + /* EAV (as `mydb`): one row so the primary scan yields a single match. */ + int64_t es[] = {1}, as[] = {42}, vs[] = {100}; + ray_t* ec = ray_vec_from_raw(RAY_I64, es, 1); + ray_t* ac = ray_vec_from_raw(RAY_I64, as, 1); + ray_t* vc = ray_vec_from_raw(RAY_I64, vs, 1); + ray_t* eav = ray_table_new(3); + eav = ray_table_add_col(eav, ray_sym_intern("e", 1), ec); + eav = ray_table_add_col(eav, ray_sym_intern("a", 1), ac); + eav = ray_table_add_col(eav, ray_sym_intern("v", 1), vc); + int64_t db_sym = ray_sym_intern("mydb", 4); + ray_env_set(db_sym, eav); + + /* Arity-2 env-bound source table named `salaries`. The aggregate + * (sum ?s salaries 1) needs to auto-register this via the env at query + * time even though the surface-syntax parser didn't know its arity. 
*/ + int64_t sid[] = {1, 2, 3}; + int64_t sal[] = {100, 200, 300}; /* sum = 600 */ + ray_t* sidc = ray_vec_from_raw(RAY_I64, sid, 3); + ray_t* salc = ray_vec_from_raw(RAY_I64, sal, 3); + ray_t* salaries = ray_table_new(2); + salaries = ray_table_add_col(salaries, ray_sym_intern("salaries__c0", 12), sidc); + salaries = ray_table_add_col(salaries, ray_sym_intern("salaries__c1", 12), salc); + int64_t sal_sym = ray_sym_intern("salaries", 8); + ray_env_set(sal_sym, salaries); + + ray_t* result = ray_eval_str( + "(query mydb (find ?s) (where (eav ?x ?p ?v))" + " (rules ((total ?s) (sum ?s salaries 1))))"); + munit_assert_ptr_not_null(result); + munit_assert_false(RAY_IS_ERR(result)); + /* The test shape doesn't exercise `total` directly; what we assert is + * that the query didn't error out — meaning `salaries` was auto- + * registered despite its arity being unknown at parse time. */ + + ray_release(result); + ray_env_set(sal_sym, NULL); + ray_env_set(db_sym, NULL); + ray_release(salaries); ray_release(sidc); ray_release(salc); + ray_release(eav); ray_release(ec); ray_release(ac); ray_release(vc); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -1837,6 +1886,7 @@ static MunitTest datalog_tests[] = { { "/rule_head_const_surface_syntax", test_rule_head_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/rule_body_const_surface_syntax", test_rule_body_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/env_bound_edb_auto_register", test_env_bound_edb_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/env_bound_agg_auto_register", test_env_bound_agg_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_scalar_f64", test_agg_scalar_f64, datalog_setup, datalog_teardown, 0, NULL 
}, { "/agg_scalar_f64_sum_empty", test_agg_scalar_f64_sum_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_scalar_value_col_oor_empty", test_agg_scalar_value_col_oor_empty, datalog_setup, datalog_teardown, 0, NULL }, From 06ddbc12ed457e3a8e45f1e51ede2676c4d1ae9b Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 10:55:15 +0200 Subject: [PATCH 34/51] fix(datalog): dl_eval surfaces compile/runtime failures instead of swallowing them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dl_eval previously returned 0 whenever it ran to completion, so any rule that failed to compile (dl_project OOM, aggregate bounds rejection, non-numeric value column) or produced a ray_execute error was silently dropped from the fixpoint — callers like ray_query_fn saw a 0 return and shipped an incomplete IDB as a successful query result. Introduce a program-scoped `eval_err` flag and flip it at every failure site inside dl_compile_rule and dl_eval's runtime pipeline: * dl_project returning a ray_error* from dl_compile_rule (OOM, type errors, add_col errors) — the error object is now released and the flag is set. * Grouped-agg group_key_col / value_col out-of-range checks. * Scalar-agg value_col out-of-range and non-numeric value column rejects. * ray_execute / table_rename_cols / table_union / table_distinct errors inside both Phase A (initial eval) and Phase B (semi-naive iteration). On completion, dl_eval now returns -1 if the flag is set and 0 otherwise; the existing early -1 returns for a NULL program or a failed stratification are unchanged. NULL returns that represent legitimate "rule produced no rows" (empty accum after body atoms, etc.) still yield a 0 return — they're not errors. Also taught dl_project to fall back to the IDB's existing column types when accum collapses to 0 rows but has lost its schema (e.g. antijoin on a fully-filtered relation). This preserves schema agreement with the IDB for downstream table_union without re-introducing the silent missing-column bug. 
Tests: * /datalog/eval_surfaces_compile_failure (new): (query …) over a rule whose (sum ?s broken 99) is out of range must return a RAY_ERR, not an empty table. * /datalog/agg_scalar_value_col_oor_empty, /datalog/agg_grouped_key_col_oor updated to expect dl_eval == -1 (previously asserted 0, which itself was the silent-failure bug). 700/700 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 77 +++++++++++++++++++++++++++++++++++++-------- src/ops/datalog.h | 5 +++ test/test_datalog.c | 61 ++++++++++++++++++++++++++++++----- 3 files changed, 123 insertions(+), 20 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 2cd7daf8..00299f27 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1006,11 +1006,31 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, ray_t* out = ray_table_new(n_out); if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("memory", "dl_project: table_new"); + /* If accum collapsed to zero rows (e.g. antijoin removed everything), + * its schema may have been dropped too. Fall back to the IDB's existing + * column types so downstream table_union sees a matching schema. */ + bool empty_accum = (nrows == 0); for (int c = 0; c < n_out; c++) { int src_idx = col_indices[c]; if (src_idx >= 0) { ray_t* src = ray_table_get_col_idx(tbl, src_idx); if (!src) { + if (empty_accum && head_rel && head_rel->table) { + ray_t* hcol = ray_table_get_col_idx(head_rel->table, c); + int8_t htype = hcol ? 
hcol->type : RAY_I64; + ray_t* ecol = ray_vec_new(htype, 0); + if (!ecol || RAY_IS_ERR(ecol)) { + if (ecol) ray_release(ecol); + ray_release(out); + return ray_error("memory", "dl_project: empty col"); + } + ray_t* next = ray_table_add_col(out, head_rel->col_names[c], ecol); + ray_release(ecol); + if (!next) return ray_error("memory", "dl_project: add_col"); + if (RAY_IS_ERR(next)) return next; + out = next; + continue; + } ray_release(out); return ray_error("domain", "dl_project: source column missing"); } @@ -1304,6 +1324,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, if (kc < 0 || kc >= src_rel->arity) { ray_graph_free(gg); ray_release(accum); + prog->eval_err = true; return NULL; } int64_t sym = src_rel->col_names[kc]; @@ -1320,6 +1341,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, (value_col < 0 || value_col >= src_rel->arity)) { ray_graph_free(gg); ray_release(accum); + prog->eval_err = true; return NULL; } if (value_col < 0 || value_col >= src_rel->arity) value_col = 0; @@ -1392,6 +1414,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, (body->agg_value_col < 0 || body->agg_value_col >= src_rel_s->arity)) { ray_release(accum); + prog->eval_err = true; return NULL; } @@ -1421,6 +1444,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } else if (vc0->type != RAY_I64) { /* Non-numeric source: reject regardless of row count. */ ray_release(accum); + prog->eval_err = true; return NULL; } } @@ -1493,6 +1517,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, /* Non-numeric source column — reject loudly rather than * silently returning zero. */ ray_release(accum); + prog->eval_err = true; return NULL; } } @@ -1702,6 +1727,17 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, rule->head_consts, rule->head_const_types); ray_release(accum); + /* dl_project now surfaces hard failures (alloc OOM, type errors, add-col + * errors) as RAY_ERROR objects. 
Catch those here and flag the program + * so dl_eval can return -1 instead of silently dropping the rule's + * output via the const_table/execute chain. */ + if (!projected) return NULL; + if (RAY_IS_ERR(projected)) { + ray_release(projected); + prog->eval_err = true; + return NULL; + } + /* Store result in the graph as a const_table so the caller can execute */ ray_op_t* result_node = ray_const_table(g, projected); ray_release(projected); @@ -2103,6 +2139,13 @@ static void dl_build_provenance(dl_program_t* prog) { int dl_eval(dl_program_t* prog) { if (!prog) return -1; + /* Reset the compile/eval error flag at the top of each eval. Rule + * compilation or ray_execute paths may set it on unrecoverable failure + * (e.g. dl_project OOM); we return -1 at the end if it was raised so + * ray_query_fn and other callers can surface "evaluation failed" rather + * than silently returning an empty/partial result. */ + prog->eval_err = false; + /* Stratify if not already done */ if (prog->n_strata == 0) { if (dl_stratify(prog) != 0) return -1; @@ -2140,24 +2183,26 @@ int dl_eval(dl_program_t* prog) { ray_t* raw_tuples = ray_execute(g, output); ray_graph_free(g); - if (!raw_tuples || RAY_IS_ERR(raw_tuples)) continue; + if (!raw_tuples) continue; + if (RAY_IS_ERR(raw_tuples)) { prog->eval_err = true; ray_release(raw_tuples); continue; } /* Rename columns to match head relation's expected names */ ray_t* new_tuples = table_rename_cols(raw_tuples, head_rel); ray_release(raw_tuples); - if (!new_tuples || RAY_IS_ERR(new_tuples)) continue; + if (!new_tuples) continue; + if (RAY_IS_ERR(new_tuples)) { prog->eval_err = true; ray_release(new_tuples); continue; } /* Merge into the head relation's table */ ray_t* merged = table_union(head_rel->table, new_tuples); ray_release(new_tuples); - if (merged && !RAY_IS_ERR(merged)) { - ray_t* deduped = table_distinct(merged); - ray_release(merged); - if (deduped && !RAY_IS_ERR(deduped)) { - ray_release(head_rel->table); - head_rel->table = 
deduped; - } - } + if (!merged) { prog->eval_err = true; continue; } + if (RAY_IS_ERR(merged)) { prog->eval_err = true; ray_release(merged); continue; } + ray_t* deduped = table_distinct(merged); + ray_release(merged); + if (!deduped) { prog->eval_err = true; continue; } + if (RAY_IS_ERR(deduped)) { prog->eval_err = true; ray_release(deduped); continue; } + ray_release(head_rel->table); + head_rel->table = deduped; } /* Phase B: Semi-naive loop — iterate with delta relations */ @@ -2247,13 +2292,15 @@ int dl_eval(dl_program_t* prog) { ray_graph_free(g); prog->rels[body_rel].table = saved; - if (!raw_result || RAY_IS_ERR(raw_result)) continue; + if (!raw_result) continue; + if (RAY_IS_ERR(raw_result)) { prog->eval_err = true; ray_release(raw_result); continue; } /* Rename columns to match head relation */ dl_rel_t* head_rel2 = &prog->rels[head_idx]; ray_t* result = table_rename_cols(raw_result, head_rel2); ray_release(raw_result); - if (!result || RAY_IS_ERR(result)) continue; + if (!result) continue; + if (RAY_IS_ERR(result)) { prog->eval_err = true; ray_release(result); continue; } /* Accumulate new tuples for this head */ if (new_tuples_per_rel[head_idx]) { @@ -2330,7 +2377,11 @@ int dl_eval(dl_program_t* prog) { if (prog->flags & DL_FLAG_PROVENANCE) dl_build_provenance(prog); - return 0; + /* Any compile-time or runtime error surfaced by a rule causes dl_eval + * to report failure, so callers (notably ray_query_fn) can turn this + * into a user-visible "evaluation failed" error instead of shipping a + * silently-incomplete result. */ + return prog->eval_err ? 
-1 : 0; } /* ======================================================================== diff --git a/src/ops/datalog.h b/src/ops/datalog.h index c188a555..cca64955 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -174,6 +174,11 @@ typedef struct { int strata_sizes[DL_MAX_STRATA]; /* number of predicates per stratum */ int n_strata; uint32_t flags; /* DL_FLAG_* bitmask */ + bool eval_err; /* set by compile/eval on + unrecoverable failure + (distinct from "rule + produced no rows"); read + by dl_eval to return -1 */ } dl_program_t; /* ===== Public API ===== */ diff --git a/test/test_datalog.c b/test/test_datalog.c index a758f04d..6be082ed 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1633,7 +1633,7 @@ static MunitResult test_agg_scalar_value_col_oor_empty(const void* params, void* rs.n_vars = 1; dl_add_rule(prog, &rs); - munit_assert_int(dl_eval(prog), ==, 0); + munit_assert_int(dl_eval(prog), ==, -1); ray_t* out = dl_query(prog, "bad_sum"); munit_assert_ptr_not_null(out); munit_assert_int((int)ray_table_nrows(out), ==, 0); @@ -1668,8 +1668,9 @@ static MunitResult test_agg_grouped_key_col_oor(const void* params, void* fixtur r.n_vars = 2; dl_add_rule(prog, &r); - /* dl_eval must not crash; compile rejects the rule, producing 0 rows. */ - munit_assert_int(dl_eval(prog), ==, 0); + /* dl_eval must surface the compile-time rejection as failure rather + * than silently producing an empty result. 
*/ + munit_assert_int(dl_eval(prog), ==, -1); ray_t* out = dl_query(prog, "bad_group"); munit_assert_ptr_not_null(out); munit_assert_int((int)ray_table_nrows(out), ==, 0); @@ -1828,13 +1829,14 @@ static MunitResult test_env_bound_agg_auto_register(const void* params, void* fi ray_env_set(sal_sym, salaries); ray_t* result = ray_eval_str( - "(query mydb (find ?s) (where (eav ?x ?p ?v))" + "(query mydb (find ?s) (where (total ?s))" " (rules ((total ?s) (sum ?s salaries 1))))"); munit_assert_ptr_not_null(result); munit_assert_false(RAY_IS_ERR(result)); - /* The test shape doesn't exercise `total` directly; what we assert is - * that the query didn't error out — meaning `salaries` was auto- - * registered despite its arity being unknown at parse time. */ + munit_assert_int(result->type, ==, RAY_TABLE); + munit_assert_int((int)ray_table_nrows(result), ==, 1); + ray_t* sc = ray_table_get_col_idx(result, 0); + munit_assert_int(((int64_t*)ray_data(sc))[0], ==, 600); ray_release(result); ray_env_set(sal_sym, NULL); @@ -1844,6 +1846,50 @@ static MunitResult test_env_bound_agg_auto_register(const void* params, void* fi return MUNIT_OK; } +/* A rule whose compile step deliberately fails (out-of-range value column + * on a SUM aggregate) must surface the failure all the way up through + * (query ...), not silently return an empty table. Regression: dl_eval + * used to swallow dl_compile_rule NULL returns and unconditionally report + * success. */ +static MunitResult test_eval_surfaces_compile_failure(const void* params, void* fixture) { + (void)params; (void)fixture; + + /* Need an EAV table bound in the env so `(query db …)` has a valid + * first argument. 
*/ + int64_t es[] = {1}, as[] = {42}, vs[] = {100}; + ray_t* ec = ray_vec_from_raw(RAY_I64, es, 1); + ray_t* ac = ray_vec_from_raw(RAY_I64, as, 1); + ray_t* vc = ray_vec_from_raw(RAY_I64, vs, 1); + ray_t* eav = ray_table_new(3); + eav = ray_table_add_col(eav, ray_sym_intern("e", 1), ec); + eav = ray_table_add_col(eav, ray_sym_intern("a", 1), ac); + eav = ray_table_add_col(eav, ray_sym_intern("v", 1), vc); + int64_t db_sym = ray_sym_intern("mydb", 4); + ray_env_set(db_sym, eav); + + /* Env-bound arity-1 source; (sum ?s broken 99) indexes a nonexistent + * value column. */ + int64_t xs[] = {10, 20, 30}; + ray_t* xc = ray_vec_from_raw(RAY_I64, xs, 3); + ray_t* broken = ray_table_new(1); + broken = ray_table_add_col(broken, ray_sym_intern("broken__c0", 10), xc); + int64_t broken_sym = ray_sym_intern("broken", 6); + ray_env_set(broken_sym, broken); + + ray_t* result = ray_eval_str( + "(query mydb (find ?s) (where (bad ?s))" + " (rules ((bad ?s) (sum ?s broken 99))))"); + munit_assert_ptr_not_null(result); + munit_assert_true(RAY_IS_ERR(result)); + + ray_release(result); + ray_env_set(broken_sym, NULL); + ray_env_set(db_sym, NULL); + ray_release(broken); ray_release(xc); + ray_release(eav); ray_release(ec); ray_release(ac); ray_release(vc); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -1887,6 +1933,7 @@ static MunitTest datalog_tests[] = { { "/rule_body_const_surface_syntax", test_rule_body_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/env_bound_edb_auto_register", test_env_bound_edb_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/env_bound_agg_auto_register", test_env_bound_agg_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/eval_surfaces_compile_failure", 
test_eval_surfaces_compile_failure, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_scalar_f64", test_agg_scalar_f64, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_scalar_f64_sum_empty", test_agg_scalar_f64_sum_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_scalar_value_col_oor_empty", test_agg_scalar_value_col_oor_empty, datalog_setup, datalog_teardown, 0, NULL }, From ed2824b4a52486702723eb3ec53cd169e5185d79 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 11:05:04 +0200 Subject: [PATCH 35/51] fix(datalog): Phase B surfaces every runtime failure, not just ray_execute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase B of the semi-naive loop still had several `continue`-without-flag silent drops that bypassed the new eval_err propagation: * ray_graph_new() NULL in Phase A and Phase B (OOM) — now sets eval_err. * table_union() accumulator in the new-tuples-per-head aggregation — NULL or RAY_ERR returns used to overwrite new_tuples_per_rel[head_idx] or silently continue; now mark eval_err and clear the slot cleanly. * table_distinct / table_antijoin runtime errors in the per-IDB merge pass — now mark eval_err before continuing. * table_union of delta back into rel->table — a failure here desyncs the fixpoint (delta_tables was already stored) yet the original code swallowed it; now mark eval_err on failure. Phase A's `ray_graph_new` gained the same treatment for symmetry. NULL returns from dl_compile_rule still pass through untouched — the compiler itself sets eval_err on genuine failures; a bare NULL is "no rows this iteration" and must not fault the whole program. No new test is needed: /datalog/eval_surfaces_compile_failure already exercises the full query -> dl_eval -> error-surfacing chain, and the existing recursive tests (/lang/datalog/fixpoint, etc.) guard the Phase B happy path. 700/700 pass on debug (ASan+UBSan) and release. 
--- src/ops/datalog.c | 54 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 00299f27..2d60f6aa 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -2175,10 +2175,15 @@ int dl_eval(dl_program_t* prog) { dl_rel_t* head_rel = &prog->rels[head_idx]; ray_graph_t* g = ray_graph_new(NULL); - if (!g) continue; + if (!g) { prog->eval_err = true; continue; } ray_op_t* output = dl_compile_rule(prog, rule, -1, stratum_rule_idx[ri], g); - if (!output) { ray_graph_free(g); continue; } + if (!output) { + /* dl_compile_rule marks eval_err on genuine failures; a bare + * NULL means "rule has no rows this pass" — not a fault. */ + ray_graph_free(g); + continue; + } ray_t* raw_tuples = ray_execute(g, output); ray_graph_free(g); @@ -2279,12 +2284,19 @@ int dl_eval(dl_program_t* prog) { prog->rels[body_rel].table = delta_tables[body_rel]; ray_graph_t* g = ray_graph_new(NULL); - if (!g) { prog->rels[body_rel].table = saved; continue; } + if (!g) { + prog->rels[body_rel].table = saved; + prog->eval_err = true; + continue; + } ray_op_t* output = dl_compile_rule(prog, rule, b, stratum_rule_idx[ri], g); if (!output) { ray_graph_free(g); prog->rels[body_rel].table = saved; + /* dl_compile_rule sets eval_err itself on genuine + * failures; NULL without the flag means "rule yields + * no rows this iteration" and should not fault. 
*/ continue; } @@ -2307,6 +2319,17 @@ int dl_eval(dl_program_t* prog) { ray_t* u = table_union(new_tuples_per_rel[head_idx], result); ray_release(new_tuples_per_rel[head_idx]); ray_release(result); + if (!u) { + prog->eval_err = true; + new_tuples_per_rel[head_idx] = NULL; + continue; + } + if (RAY_IS_ERR(u)) { + prog->eval_err = true; + ray_release(u); + new_tuples_per_rel[head_idx] = NULL; + continue; + } new_tuples_per_rel[head_idx] = u; } else { new_tuples_per_rel[head_idx] = result; @@ -2326,7 +2349,10 @@ int dl_eval(dl_program_t* prog) { delta_tables[rel_idx] = NULL; ray_t* new_tuples = new_tuples_per_rel[rel_idx]; - if (!new_tuples || RAY_IS_ERR(new_tuples)) { + if (!new_tuples) { delta_tables[rel_idx] = NULL; continue; } + if (RAY_IS_ERR(new_tuples)) { + prog->eval_err = true; + ray_release(new_tuples); delta_tables[rel_idx] = NULL; continue; } @@ -2334,22 +2360,30 @@ int dl_eval(dl_program_t* prog) { /* Deduplicate */ ray_t* deduped = table_distinct(new_tuples); ray_release(new_tuples); - if (!deduped || RAY_IS_ERR(deduped)) continue; + if (!deduped) { prog->eval_err = true; continue; } + if (RAY_IS_ERR(deduped)) { prog->eval_err = true; ray_release(deduped); continue; } /* Subtract existing relation to get true delta */ ray_t* delta = table_antijoin(deduped, rel->table); ray_release(deduped); - if (!delta || RAY_IS_ERR(delta)) continue; + if (!delta) { prog->eval_err = true; continue; } + if (RAY_IS_ERR(delta)) { prog->eval_err = true; ray_release(delta); continue; } delta_tables[rel_idx] = delta; - /* Merge delta into full relation */ + /* Merge delta into full relation. A merge failure here + * leaves delta_tables set but rel->table stale — that would + * desync the fixpoint, so treat it as a hard failure. 
*/ if (ray_table_nrows(delta) > 0) { ray_t* merged = table_union(rel->table, delta); - if (merged && !RAY_IS_ERR(merged)) { - ray_release(rel->table); - rel->table = merged; + if (!merged) { prog->eval_err = true; continue; } + if (RAY_IS_ERR(merged)) { + prog->eval_err = true; + ray_release(merged); + continue; } + ray_release(rel->table); + rel->table = merged; } } From e294723b723bf89193cfc2eef751ddc740e62c2c Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 11:14:15 +0200 Subject: [PATCH 36/51] fix(datalog): table_union/distinct/antijoin never silently produce partial tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit table_union used to silently skip columns whose concat failed, producing a table with missing columns that callers mistook for a successful union — the caller then stored it as the IDB's new state or as merge output, so the partial result became persistent. Rewrite each helper to surface failure: * table_union: RAY_IS_ERR on any missing column, concat failure, or ray_table_add_col failure — release partial `out` and return a typed ray_error. Also retain `a`/`b` when we return one of them as-is so callers can release uniformly (previous code double-released when one input was NULL/error). * table_distinct: canonicalize/graph_new failures now return ray_error instead of falling back to the raw input table (which may not be distinct at all — a silent wrong-result). * table_antijoin: same treatment — canonicalize/graph_new failures return ray_error instead of treating `left` as the antijoin result. dl_eval already checks RAY_IS_ERR on these helpers' returns and raises eval_err, so the new error objects flow into the "evaluation failed" surface exposed through ray_query_fn. 700/700 tests pass on debug (ASan+UBSan) and release. 
--- src/ops/datalog.c | 49 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 2d60f6aa..77162c4c 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1813,8 +1813,13 @@ static ray_t* restore_names(ray_t* tbl, ray_t* src) { /* Create a table by concatenating all rows from tables a and b (same schema). * Uses column-wise ray_vec_concat. Returns new owned table with a's names. */ static ray_t* table_union(ray_t* a, ray_t* b) { - if (!a || RAY_IS_ERR(a)) return b; - if (!b || RAY_IS_ERR(b)) return a; + /* Missing/error inputs: return an owned reference to the other side + * (retained so callers can release uniformly). */ + if (!a || RAY_IS_ERR(a)) { + if (b && !RAY_IS_ERR(b)) ray_retain(b); + return b; + } + if (!b || RAY_IS_ERR(b)) { ray_retain(a); return a; } if (ray_table_nrows(a) == 0) { ray_retain(b); return b; } if (ray_table_nrows(b) == 0) { ray_retain(a); return a; } @@ -1823,15 +1828,28 @@ static ray_t* table_union(ray_t* a, ray_t* b) { int64_t ncols = ncols_a < ncols_b ? ncols_a : ncols_b; ray_t* out = ray_table_new((int)ncols); + if (!out || RAY_IS_ERR(out)) + return out ? out : ray_error("memory", "table_union: table_new"); for (int64_t c = 0; c < ncols; c++) { ray_t* col_a = ray_table_get_col_idx(a, c); ray_t* col_b = ray_table_get_col_idx(b, c); - if (!col_a || !col_b) continue; + if (!col_a || !col_b) { + /* Silently dropping a column would produce a schema-incomplete + * result that the caller mistakes for a successful union. 
*/ + ray_release(out); + return ray_error("domain", "table_union: missing column"); + } ray_t* merged = ray_vec_concat(col_a, col_b); - if (merged && !RAY_IS_ERR(merged)) { - out = ray_table_add_col(out, ray_table_col_name(a, c), merged); - ray_release(merged); + if (!merged || RAY_IS_ERR(merged)) { + if (merged) ray_release(merged); + ray_release(out); + return ray_error("memory", "table_union: concat"); } + ray_t* next = ray_table_add_col(out, ray_table_col_name(a, c), merged); + ray_release(merged); + if (!next) return ray_error("memory", "table_union: add_col"); + if (RAY_IS_ERR(next)) return next; + out = next; } return out; } @@ -1846,9 +1864,14 @@ static ray_t* table_distinct(ray_t* tbl) { if (ncols <= 0) { ray_retain(tbl); return tbl; } ray_t* canonical = canonicalize(tbl); + if (!canonical || RAY_IS_ERR(canonical)) + return canonical ? canonical : ray_error("memory", "table_distinct: canonicalize"); ray_graph_t* g = ray_graph_new(canonical); - if (!g) { ray_release(canonical); ray_retain(tbl); return tbl; } + if (!g) { + ray_release(canonical); + return ray_error("memory", "table_distinct: graph_new"); + } ray_op_t* keys[DL_MAX_ARITY]; for (int64_t c = 0; c < ncols && c < DL_MAX_ARITY; c++) { @@ -1883,10 +1906,20 @@ static ray_t* table_antijoin(ray_t* left, ray_t* right) { if (ncols <= 0) { ray_retain(left); return left; } ray_t* cl = canonicalize(left); + if (!cl || RAY_IS_ERR(cl)) + return cl ? cl : ray_error("memory", "table_antijoin: canonicalize left"); ray_t* cr = canonicalize(right); + if (!cr || RAY_IS_ERR(cr)) { + ray_release(cl); + return cr ? 
cr : ray_error("memory", "table_antijoin: canonicalize right"); + } ray_graph_t* g = ray_graph_new(NULL); - if (!g) { ray_release(cl); ray_release(cr); ray_retain(left); return left; } + if (!g) { + ray_release(cl); + ray_release(cr); + return ray_error("memory", "table_antijoin: graph_new"); + } ray_op_t* l = ray_const_table(g, cl); ray_op_t* r = ray_const_table(g, cr); From 2b1846e2ae9ec9564e2a42e588f5e4b055dc8651 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 11:22:31 +0200 Subject: [PATCH 37/51] fix(datalog): table_union rejects schema mismatch instead of narrowing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even after the previous pass, table_union still silently narrowed to the common prefix when the two inputs had different column counts: it computed ncols = min(ncols_a, ncols_b) and produced a union table with fewer columns than either side. Callers (dl_eval merge paths) then stored that partial table as the IDB's new state — a silent schema-corruption bug. Reject the mismatch with a typed "schema" error instead of narrowing. The caller already picks this up via RAY_IS_ERR and sets prog->eval_err, so the failure surfaces to ray_query_fn. 700/700 tests pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 77162c4c..7aa5079e 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1825,7 +1825,13 @@ static ray_t* table_union(ray_t* a, ray_t* b) { int64_t ncols_a = ray_table_ncols(a); int64_t ncols_b = ray_table_ncols(b); - int64_t ncols = ncols_a < ncols_b ? ncols_a : ncols_b; + /* Refuse to silently narrow to the common prefix — a mismatched schema + * would otherwise produce a union table with fewer columns than either + * input, and callers (dl_eval's merge paths) would treat that partial + * table as a valid IDB state. 
*/ + if (ncols_a != ncols_b) + return ray_error("schema", "table_union: column count mismatch"); + int64_t ncols = ncols_a; ray_t* out = ray_table_new((int)ncols); if (!out || RAY_IS_ERR(out)) From 3a025ce1bc22983e5b9d550305aae65ca9413c58 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 11:28:53 +0200 Subject: [PATCH 38/51] fix(datalog): table_union schema check runs before empty-rows bypass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The schema-mismatch check only fired when both inputs had rows — the empty-side short-circuits (`nrows(a) == 0` returns b, `nrows(b) == 0` returns a) still bypassed it. An antijoin that collapsed to (0 rows, 0 cols) then entered the union from the left side and silently emitted `b` (wider schema) as the result; the caller stored it as the IDB's state and the arity mismatch surfaced only much later. Move the ncols_a != ncols_b check above the empty-rows bypass so every input path exits with either a schema-matching result or a ray_error. dl_eval already promotes the error to prog->eval_err. 700/700 tests pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 7aa5079e..a989218a 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1820,19 +1820,21 @@ static ray_t* table_union(ray_t* a, ray_t* b) { return b; } if (!b || RAY_IS_ERR(b)) { ray_retain(a); return a; } - if (ray_table_nrows(a) == 0) { ray_retain(b); return b; } - if (ray_table_nrows(b) == 0) { ray_retain(a); return a; } + /* Column-count check must run before the empty-rows short-circuit. + * Otherwise one side having 0 rows but a stripped schema (e.g. an + * antijoin result that collapsed to (0 rows, 0 cols)) would silently + * return the other side's schema and the caller would store a table + * whose arity differs from what it expected. 
*/ int64_t ncols_a = ray_table_ncols(a); int64_t ncols_b = ray_table_ncols(b); - /* Refuse to silently narrow to the common prefix — a mismatched schema - * would otherwise produce a union table with fewer columns than either - * input, and callers (dl_eval's merge paths) would treat that partial - * table as a valid IDB state. */ if (ncols_a != ncols_b) return ray_error("schema", "table_union: column count mismatch"); int64_t ncols = ncols_a; + if (ray_table_nrows(a) == 0) { ray_retain(b); return b; } + if (ray_table_nrows(b) == 0) { ray_retain(a); return a; } + ray_t* out = ray_table_new((int)ncols); if (!out || RAY_IS_ERR(out)) return out ? out : ray_error("memory", "table_union: table_new"); From c3216709b73d97324ba4abaf490fbce2960b6a57 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 12:36:31 +0200 Subject: [PATCH 39/51] fix(datalog,runtime): address round-3 Copilot review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four comments from the 2026-04-23 Copilot review round: * `dl_idb_align_head_const_types` (src/ops/datalog.c:~204): split every `if (!x || RAY_IS_ERR(x))` error branch so the RAY_ERROR object is released before return. Repeated aligns were leaking an error block per failed call. The `ray_table_add_col` failure path also now releases both `prev` and the returned error, instead of just `prev`. * `ray_runtime_create_with_sym` / `_with_sym_err` (src/core/runtime.h): the two persistent-consumer entrypoints were defined in runtime.c but never declared — tests duplicated the prototype by hand. Publish them in src/core/runtime.h (alongside the existing `ray_runtime_create` / `ray_runtime_destroy`) so consumers don't drift. Kept a minimal forward-decl in test_datalog.c because including core/runtime.h from a TU that also includes lang/eval.h collides on `ray_vm_t` (pre-existing duplication out of scope for this PR) — the comment flags that. 
* env auto-register (src/ops/datalog.c:~3620): a NULL from `ray_table_get_col_idx(env_val, c)` used to `continue`, producing a `clean` table with fewer than pred_arity columns that `dl_add_edb` still registered as the EDB. Replaced the skip with a hard ray_error("schema", "env-backed EDB table missing expected column") so only fully-formed tables ever register. * `table_union` pass-through (src/ops/datalog.c:~1819): the `a` is NULL/error branch previously only retained `b` when it was non-error. If callers passed a RAY_ERROR `b` and later released it, the returned pointer aliased freed memory. Retain `b` whenever it's non-NULL — even on error — so the pass-through contract is uniform. Also adopted the setup/teardown cleanup from the review: datalog_rf_setup now returns the created runtime as the fixture, teardown receives it back, and failure to create aborts loudly instead of silently skipping tests. 700/700 pass on debug (ASan+UBSan) and release. --- src/core/runtime.h | 9 +++++++++ src/ops/datalog.c | 33 +++++++++++++++++++++++++-------- test/test_datalog.c | 25 ++++++++++++++++--------- 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/src/core/runtime.h b/src/core/runtime.h index 3c460f46..a8f80589 100644 --- a/src/core/runtime.h +++ b/src/core/runtime.h @@ -103,6 +103,15 @@ extern _Thread_local ray_vm_t *__VM; ray_runtime_t* ray_runtime_create(int argc, char** argv); void ray_runtime_destroy(ray_runtime_t* rt); +/* Persistent-consumer lifecycle: load the sym table from `sym_path` (if + * present) before builtins register, so user-interned IDs keep the same + * slots across process restarts. The _err variant surfaces the load + * result via `out_sym_err` (RAY_OK / RAY_ERR_CORRUPT / I/O errors) so + * callers can decide recovery policy; the plain variant discards it. 
*/ +ray_runtime_t* ray_runtime_create_with_sym(const char* sym_path); +ray_runtime_t* ray_runtime_create_with_sym_err(const char* sym_path, + ray_err_t* out_sym_err); + /* Error API — allocates ray_t with type=RAY_ERROR, sets __VM->err.msg */ ray_t* ray_error(const char* code, const char* fmt, ...); /* Read error code from a RAY_ERROR object (returns pointer to sdata) */ diff --git a/src/ops/datalog.c b/src/ops/datalog.c index a989218a..eef589b5 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -192,16 +192,23 @@ static void dl_idb_align_head_const_types(dl_program_t* prog, const dl_rule_t* r } if (!any_change) return; - /* Rebuild the table with typed empty columns. */ + /* Rebuild the table with typed empty columns. Every failure path must + * release both the surviving table reference and any RAY_ERROR object + * returned from ray_table_new / ray_vec_new / ray_table_add_col — + * otherwise repeated align calls slowly accumulate error-object blocks + * in the heap. */ ray_t* fresh = ray_table_new(rel->arity); - if (!fresh || RAY_IS_ERR(fresh)) return; + if (!fresh) return; + if (RAY_IS_ERR(fresh)) { ray_release(fresh); return; } for (int c = 0; c < rel->arity; c++) { ray_t* empty_col = ray_vec_new(desired[c], 0); - if (!empty_col || RAY_IS_ERR(empty_col)) { ray_release(fresh); return; } + if (!empty_col) { ray_release(fresh); return; } + if (RAY_IS_ERR(empty_col)) { ray_release(empty_col); ray_release(fresh); return; } ray_t* prev = fresh; fresh = ray_table_add_col(fresh, rel->col_names[c], empty_col); ray_release(empty_col); - if (RAY_IS_ERR(fresh)) { ray_release(prev); return; } + if (!fresh) { ray_release(prev); return; } + if (RAY_IS_ERR(fresh)) { ray_release(prev); ray_release(fresh); return; } } ray_release(rel->table); rel->table = fresh; @@ -1813,10 +1820,12 @@ static ray_t* restore_names(ray_t* tbl, ray_t* src) { /* Create a table by concatenating all rows from tables a and b (same schema). * Uses column-wise ray_vec_concat. 
Returns new owned table with a's names. */ static ray_t* table_union(ray_t* a, ray_t* b) { - /* Missing/error inputs: return an owned reference to the other side - * (retained so callers can release uniformly). */ + /* Pass-through paths always return a retained reference whenever the + * returned pointer is non-NULL (even on RAY_ERROR), so callers can + * release the return value uniformly without risking use-after-free on + * the pass-through input. */ if (!a || RAY_IS_ERR(a)) { - if (b && !RAY_IS_ERR(b)) ray_retain(b); + if (b) ray_retain(b); return b; } if (!b || RAY_IS_ERR(b)) { ray_retain(a); return a; } @@ -3615,7 +3624,15 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { for (int c = 0; c < pred_arity; c++) { ray_t* col = ray_table_get_col_idx(env_val, c); ray_t* next_clean; - if (!col) continue; + if (!col) { + /* Silently skipping would build `clean` with fewer than + * pred_arity columns yet still register it via dl_add_edb + * — the program would see a schema-inconsistent EDB. */ + ray_release(clean); + dl_program_free(prog); + ray_release(db); + return ray_error("schema", "query: env-backed EDB table missing expected column"); + } if (col->type == RAY_SYM) { ray_t* i64col = ray_vec_new(RAY_I64, nrows_env); if (!i64col || RAY_IS_ERR(i64col)) { diff --git a/test/test_datalog.c b/test/test_datalog.c index 6be082ed..57b55c7b 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -33,11 +33,13 @@ #include "lang/eval.h" #include -struct ray_runtime_s; +/* Forward-declare runtime API used by the full-runtime fixtures. + * (Test target doesn't pull in core/runtime.h because it redefines + * ray_vm_t, which clashes with lang/eval.h's definition — a pre- + * existing duplication kept out of scope for this PR.) 
*/ typedef struct ray_runtime_s ray_runtime_t; -extern ray_runtime_t* ray_runtime_create(int argc, char** argv); -extern void ray_runtime_destroy(ray_runtime_t* rt); -extern ray_runtime_t* __RUNTIME; +ray_runtime_t* ray_runtime_create(int argc, char** argv); +void ray_runtime_destroy(ray_runtime_t* rt); static void* datalog_setup(const void* params, void* user_data) { (void)params; (void)user_data; @@ -52,16 +54,21 @@ static void datalog_teardown(void* fixture) { ray_heap_destroy(); } -/* Full runtime — required for ray_eval_str("(rule ...)") surface-syntax tests. */ +/* Full runtime — required for ray_eval_str("(rule ...)") surface-syntax tests. + * Assertions from munit macros can't be used here (setup returns void*, not + * MunitResult) so we abort explicitly if runtime creation fails. */ static void* datalog_rf_setup(const void* params, void* user_data) { (void)params; (void)user_data; - ray_runtime_create(0, NULL); - return NULL; + ray_runtime_t* rt = ray_runtime_create(0, NULL); + if (!rt) { + fprintf(stderr, "datalog_rf_setup: ray_runtime_create returned NULL\n"); + abort(); + } + return rt; } static void datalog_rf_teardown(void* fixture) { - (void)fixture; - ray_runtime_destroy(__RUNTIME); + ray_runtime_destroy((ray_runtime_t*)fixture); } /* Verify that dl_get_provenance_src_offsets and dl_get_provenance_src_data From dc73dbd4d0dee6d6d340ce9263442ed62a59f4e7 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 12:52:19 +0200 Subject: [PATCH 40/51] fix(datalog,runtime): ray_error_free actually reclaims error blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prior round's error-lifetime fixes were no-ops. ray_release(), ray_retain(), and even ray_free() each short-circuit on RAY_IS_ERR, so every `if (RAY_IS_ERR(x)) ray_release(x);` site quietly leaked the error block — repeated failures (e.g. an aggregate evaluated in a tight REPL loop) would slowly bleed memory until heap teardown. 
Introduce ray_error_free(ray_t*) as the escape hatch: void ray_error_free(ray_t* err) { if (!err || !RAY_IS_ERR(err)) return; err->type = -RAY_I64; /* retype to a leaf atom so ray_free and ray_release_owned_refs don't skip it */ ray_free(err); } Retyping to an atom with no owned children is the safest way to route an error block through the standard free path without requiring a new allocator primitive. The helper is published in include/rayforce.h alongside ray_error so consumers can use it directly. Wire the helper through every site where a RAY_ERROR object was being "released" but in fact leaking: * dl_idb_align_head_const_types: ray_table_new / ray_vec_new / ray_table_add_col error returns. * dl_project: ecol, dst, add_col error returns (split the combined `!x || RAY_IS_ERR(x)` branches so the NULL vs ERROR cases can take different cleanup paths). * dl_compile_rule dl_project propagation. * dl_eval Phase A merge pipeline: raw_tuples, new_tuples, merged, deduped. * dl_eval Phase B semi-naive: raw_result, result, u (union accum), new_tuples, deduped, delta, merged. * ray_query_fn env auto-register: i64col, next_clean. Regression test /datalog/error_free_reclaims loops 256 create/free cycles and asserts via ray_mem_stats() that free_count advances alongside alloc_count and bytes_allocated stays flat. Pre-fix, this test flagged the leak (free_count stayed at 0 for the 256 errors). 701/701 pass on debug (ASan+UBSan) and release. 
--- include/rayforce.h | 5 +++ src/core/runtime.c | 15 +++++++++ src/core/runtime.h | 1 + src/ops/datalog.c | 76 ++++++++++++++++++++++++++++----------------- test/test_datalog.c | 29 +++++++++++++++++ 5 files changed, 98 insertions(+), 28 deletions(-) diff --git a/include/rayforce.h b/include/rayforce.h index 8d982748..f7961afc 100644 --- a/include/rayforce.h +++ b/include/rayforce.h @@ -156,6 +156,11 @@ ray_t* ray_error(const char* code, const char* fmt, ...); const char* ray_err_code_str(ray_err_t e); ray_err_t ray_err_from_obj(ray_t* err); const char* ray_err_code(ray_t* err); +/* Free a RAY_ERROR object. ray_release() is a deliberate no-op for + * error ray_t* (see src/mem/cow.c), so callers that hold the sole + * reference and want the block reclaimed must use this helper instead — + * otherwise the error leaks until heap teardown. */ +void ray_error_free(ray_t* err); /* ===== Accessor Macros ===== */ diff --git a/src/core/runtime.c b/src/core/runtime.c index bd8b8a18..e560dea6 100644 --- a/src/core/runtime.c +++ b/src/core/runtime.c @@ -138,6 +138,21 @@ ray_t* ray_error(const char* code, const char* fmt, ...) { return err; } +void ray_error_free(ray_t* err) { + /* Skip NULL and anything that isn't actually a RAY_ERROR — callers + * often pass a result that might be either an error or a real value. */ + if (!err || !RAY_IS_ERR(err)) return; + /* Both ray_free and ray_release_owned_refs short-circuit on RAY_IS_ERR + * as a safety default (the refcount system deliberately does not track + * error objects). Retype the block to a leaf atom (-RAY_I64) so those + * guards don't fire — an atom with no owned children is the safest + * shape to pass through the standard free path. The rc was already + * 1 from ray_alloc, so ray_free will reclaim the block via the buddy + * allocator. From this point the caller must not touch err again. 
*/ + err->type = -RAY_I64; + ray_free(err); +} + const char* ray_err_code(ray_t* err) { if (!err || err->type != RAY_ERROR) return NULL; /* sdata is 7 bytes and may not be null-terminated when full */ diff --git a/src/core/runtime.h b/src/core/runtime.h index a8f80589..c32cd9a5 100644 --- a/src/core/runtime.h +++ b/src/core/runtime.h @@ -116,6 +116,7 @@ ray_runtime_t* ray_runtime_create_with_sym_err(const char* sym_path, ray_t* ray_error(const char* code, const char* fmt, ...); /* Read error code from a RAY_ERROR object (returns pointer to sdata) */ const char* ray_err_code(ray_t* err); +/* ray_error_free() is published in include/rayforce.h */ /* Read VM error detail message (NULL if empty) */ const char* ray_error_msg(void); diff --git a/src/ops/datalog.c b/src/ops/datalog.c index eef589b5..200a4163 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -192,23 +192,23 @@ static void dl_idb_align_head_const_types(dl_program_t* prog, const dl_rule_t* r } if (!any_change) return; - /* Rebuild the table with typed empty columns. Every failure path must - * release both the surviving table reference and any RAY_ERROR object - * returned from ray_table_new / ray_vec_new / ray_table_add_col — - * otherwise repeated align calls slowly accumulate error-object blocks - * in the heap. */ + /* Rebuild the table with typed empty columns. ray_release() is a + * deliberate no-op for RAY_ERROR objects (see src/mem/cow.c), so every + * failure path here must pair ray_release() for the valid survivor + * with ray_error_free() for the freshly-returned error block — else + * repeated align calls would silently leak one error block each time. 
*/ ray_t* fresh = ray_table_new(rel->arity); if (!fresh) return; - if (RAY_IS_ERR(fresh)) { ray_release(fresh); return; } + if (RAY_IS_ERR(fresh)) { ray_error_free(fresh); return; } for (int c = 0; c < rel->arity; c++) { ray_t* empty_col = ray_vec_new(desired[c], 0); if (!empty_col) { ray_release(fresh); return; } - if (RAY_IS_ERR(empty_col)) { ray_release(empty_col); ray_release(fresh); return; } + if (RAY_IS_ERR(empty_col)) { ray_error_free(empty_col); ray_release(fresh); return; } ray_t* prev = fresh; fresh = ray_table_add_col(fresh, rel->col_names[c], empty_col); ray_release(empty_col); if (!fresh) { ray_release(prev); return; } - if (RAY_IS_ERR(fresh)) { ray_release(prev); ray_release(fresh); return; } + if (RAY_IS_ERR(fresh)) { ray_release(prev); ray_error_free(fresh); return; } } ray_release(rel->table); rel->table = fresh; @@ -1026,8 +1026,12 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, ray_t* hcol = ray_table_get_col_idx(head_rel->table, c); int8_t htype = hcol ? hcol->type : RAY_I64; ray_t* ecol = ray_vec_new(htype, 0); - if (!ecol || RAY_IS_ERR(ecol)) { - if (ecol) ray_release(ecol); + if (!ecol) { + ray_release(out); + return ray_error("memory", "dl_project: empty col"); + } + if (RAY_IS_ERR(ecol)) { + ray_error_free(ecol); ray_release(out); return ray_error("memory", "dl_project: empty col"); } @@ -1048,8 +1052,12 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, ray_t* dst = (src->type == RAY_SYM) ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, nrows) : ray_vec_new(src->type, nrows); - if (!dst || RAY_IS_ERR(dst)) { - if (dst) ray_release(dst); + if (!dst) { + ray_release(out); + return ray_error("memory", "dl_project: vec_new"); + } + if (RAY_IS_ERR(dst)) { + ray_error_free(dst); ray_release(out); return ray_error("memory", "dl_project: vec_new"); } @@ -1740,7 +1748,7 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, * output via the const_table/execute chain. 
*/ if (!projected) return NULL; if (RAY_IS_ERR(projected)) { - ray_release(projected); + ray_error_free(projected); prog->eval_err = true; return NULL; } @@ -2239,23 +2247,23 @@ int dl_eval(dl_program_t* prog) { ray_graph_free(g); if (!raw_tuples) continue; - if (RAY_IS_ERR(raw_tuples)) { prog->eval_err = true; ray_release(raw_tuples); continue; } + if (RAY_IS_ERR(raw_tuples)) { prog->eval_err = true; ray_error_free(raw_tuples); continue; } /* Rename columns to match head relation's expected names */ ray_t* new_tuples = table_rename_cols(raw_tuples, head_rel); ray_release(raw_tuples); if (!new_tuples) continue; - if (RAY_IS_ERR(new_tuples)) { prog->eval_err = true; ray_release(new_tuples); continue; } + if (RAY_IS_ERR(new_tuples)) { prog->eval_err = true; ray_error_free(new_tuples); continue; } /* Merge into the head relation's table */ ray_t* merged = table_union(head_rel->table, new_tuples); ray_release(new_tuples); if (!merged) { prog->eval_err = true; continue; } - if (RAY_IS_ERR(merged)) { prog->eval_err = true; ray_release(merged); continue; } + if (RAY_IS_ERR(merged)) { prog->eval_err = true; ray_error_free(merged); continue; } ray_t* deduped = table_distinct(merged); ray_release(merged); if (!deduped) { prog->eval_err = true; continue; } - if (RAY_IS_ERR(deduped)) { prog->eval_err = true; ray_release(deduped); continue; } + if (RAY_IS_ERR(deduped)) { prog->eval_err = true; ray_error_free(deduped); continue; } ray_release(head_rel->table); head_rel->table = deduped; } @@ -2355,14 +2363,14 @@ int dl_eval(dl_program_t* prog) { prog->rels[body_rel].table = saved; if (!raw_result) continue; - if (RAY_IS_ERR(raw_result)) { prog->eval_err = true; ray_release(raw_result); continue; } + if (RAY_IS_ERR(raw_result)) { prog->eval_err = true; ray_error_free(raw_result); continue; } /* Rename columns to match head relation */ dl_rel_t* head_rel2 = &prog->rels[head_idx]; ray_t* result = table_rename_cols(raw_result, head_rel2); ray_release(raw_result); if (!result) 
continue; - if (RAY_IS_ERR(result)) { prog->eval_err = true; ray_release(result); continue; } + if (RAY_IS_ERR(result)) { prog->eval_err = true; ray_error_free(result); continue; } /* Accumulate new tuples for this head */ if (new_tuples_per_rel[head_idx]) { @@ -2376,7 +2384,7 @@ int dl_eval(dl_program_t* prog) { } if (RAY_IS_ERR(u)) { prog->eval_err = true; - ray_release(u); + ray_error_free(u); new_tuples_per_rel[head_idx] = NULL; continue; } @@ -2402,7 +2410,7 @@ int dl_eval(dl_program_t* prog) { if (!new_tuples) { delta_tables[rel_idx] = NULL; continue; } if (RAY_IS_ERR(new_tuples)) { prog->eval_err = true; - ray_release(new_tuples); + ray_error_free(new_tuples); delta_tables[rel_idx] = NULL; continue; } @@ -2411,13 +2419,13 @@ int dl_eval(dl_program_t* prog) { ray_t* deduped = table_distinct(new_tuples); ray_release(new_tuples); if (!deduped) { prog->eval_err = true; continue; } - if (RAY_IS_ERR(deduped)) { prog->eval_err = true; ray_release(deduped); continue; } + if (RAY_IS_ERR(deduped)) { prog->eval_err = true; ray_error_free(deduped); continue; } /* Subtract existing relation to get true delta */ ray_t* delta = table_antijoin(deduped, rel->table); ray_release(deduped); if (!delta) { prog->eval_err = true; continue; } - if (RAY_IS_ERR(delta)) { prog->eval_err = true; ray_release(delta); continue; } + if (RAY_IS_ERR(delta)) { prog->eval_err = true; ray_error_free(delta); continue; } delta_tables[rel_idx] = delta; @@ -2429,7 +2437,7 @@ int dl_eval(dl_program_t* prog) { if (!merged) { prog->eval_err = true; continue; } if (RAY_IS_ERR(merged)) { prog->eval_err = true; - ray_release(merged); + ray_error_free(merged); continue; } ray_release(rel->table); @@ -3635,8 +3643,14 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { } if (col->type == RAY_SYM) { ray_t* i64col = ray_vec_new(RAY_I64, nrows_env); - if (!i64col || RAY_IS_ERR(i64col)) { - if (i64col) ray_release(i64col); + if (!i64col) { + ray_release(clean); + dl_program_free(prog); + ray_release(db); + return 
ray_error("memory", "query: failed to convert env-backed SYM column"); + } + if (RAY_IS_ERR(i64col)) { + ray_error_free(i64col); ray_release(clean); dl_program_free(prog); ray_release(db); @@ -3651,8 +3665,14 @@ ray_t* ray_query_fn(ray_t** args, int64_t n) { } else { next_clean = ray_table_add_col(clean, ray_table_col_name(env_val, c), col); } - if (!next_clean || RAY_IS_ERR(next_clean)) { - if (next_clean) ray_release(next_clean); + if (!next_clean) { + ray_release(clean); + dl_program_free(prog); + ray_release(db); + return ray_error("memory", "query: failed to build env-backed EDB table"); + } + if (RAY_IS_ERR(next_clean)) { + ray_error_free(next_clean); ray_release(clean); dl_program_free(prog); ray_release(db); diff --git a/test/test_datalog.c b/test/test_datalog.c index 57b55c7b..9dd26ec2 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1897,6 +1897,34 @@ static MunitResult test_eval_surfaces_compile_failure(const void* params, void* return MUNIT_OK; } +/* ray_release() is a deliberate no-op for RAY_ERROR objects, so callers + * that claim to be "releasing" an error under the refcount API actually + * leak the block. ray_error_free() is the escape hatch that calls + * ray_free() directly. This test watches bytes_allocated across a burst + * of create/free cycles: without a real free the counter would climb. */ +static MunitResult test_error_free_reclaims(const void* params, void* fixture) { + (void)params; (void)fixture; + + ray_mem_stats_t before, after; + ray_mem_stats(&before); + for (int i = 0; i < 256; i++) { + ray_t* e = ray_error("test", "iter=%d", i); + munit_assert_ptr_not_null(e); + munit_assert_true(RAY_IS_ERR(e)); + ray_error_free(e); + } + ray_mem_stats(&after); + + /* alloc_count must have grown by at least the 256 we produced. 
*/ + munit_assert_size((size_t)(after.alloc_count - before.alloc_count), >=, 256); + /* free_count must have grown by the same amount — if ray_error_free + * was still a no-op, free_count would lag alloc_count by 256. */ + munit_assert_size((size_t)(after.free_count - before.free_count), >=, 256); + /* Live-bytes must not have grown — the loop is steady-state. */ + munit_assert_size(after.bytes_allocated, <=, before.bytes_allocated); + return MUNIT_OK; +} + static MunitTest datalog_tests[] = { { "/source_provenance", test_source_provenance, datalog_setup, datalog_teardown, 0, NULL }, { "/source_prov_requires_flag", test_source_prov_requires_flag, datalog_setup, datalog_teardown, 0, NULL }, @@ -1941,6 +1969,7 @@ static MunitTest datalog_tests[] = { { "/env_bound_edb_auto_register", test_env_bound_edb_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/env_bound_agg_auto_register", test_env_bound_agg_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/eval_surfaces_compile_failure", test_eval_surfaces_compile_failure, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, + { "/error_free_reclaims", test_error_free_reclaims, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/agg_scalar_f64", test_agg_scalar_f64, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_scalar_f64_sum_empty", test_agg_scalar_f64_sum_empty, datalog_setup, datalog_teardown, 0, NULL }, { "/agg_scalar_value_col_oor_empty", test_agg_scalar_value_col_oor_empty, datalog_setup, datalog_teardown, 0, NULL }, From e0bf05313a0bdf322bf91d0017b32759fa3cd1fb Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 13:22:13 +0200 Subject: [PATCH 41/51] fix(datalog,runtime): address round-4 Copilot review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four comments from the 2026-04-23T11:13 Copilot pass: * dl_col_as_f64 / dl_eval_expr helpers (src/ops/datalog.c): five more ray_vec_new sites used `if (!out || RAY_IS_ERR(out)) 
return NULL` which leaks the RAY_ERROR block (ray_release is a no-op on errors). Split each check and route the RAY_IS_ERR branch through ray_error_free() so the block is actually reclaimed. * dl_project RAY_STR pool propagation (src/ops/datalog.c:~1072): the memcpy path copied 16-byte ray_str_t handles but left dst->str_pool NULL, so reads of strings >12 bytes would dereference pool_off into a NULL pool. Call col_propagate_str_pool(dst, src) after the memcpy when src is RAY_STR (matches the pattern used by ops/filter and ops/pivot). * runtime_create_impl stat() errno handling (src/core/runtime.c:~246): previously any stat() failure silently stayed RAY_OK. Now ENOENT remains the "first-run" OK case, but every other errno (EACCES, ENOTDIR, EIO, …) sets *out_sym_err = RAY_ERR_IO so persistent consumers can see the underlying I/O failure instead of coming up with a silent empty sym table. * test/test_runtime.c (new): targeted tests for the persistent-consumer surface. /runtime/create_with_sym_absent_is_ok verifies ENOENT stays RAY_OK. /runtime/create_with_sym_io_error_surfaces constructs an ENOTDIR path (creates a regular file then asks for a subpath under it) and asserts out_sym_err becomes non-RAY_OK. /runtime/create_with_sym_plain_variant_absent exercises the non-_err variant with a missing sym file. test_main.c wires the new suite into the root. 704/704 pass on debug (ASan+UBSan) and release.
--- src/core/runtime.c | 16 +++++-- src/ops/datalog.c | 27 ++++++++--- test/test_main.c | 3 ++ test/test_runtime.c | 113 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+), 9 deletions(-) create mode 100644 test/test_runtime.c diff --git a/src/core/runtime.c b/src/core/runtime.c index e560dea6..7296eb0f 100644 --- a/src/core/runtime.c +++ b/src/core/runtime.c @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef RAY_OS_WINDOWS #include #else @@ -226,7 +227,14 @@ static ray_runtime_t* runtime_create_impl(const char* sym_path, * builtins append afterwards. */ if (sym_path) { /* Pre-flight size check: reject files that would blow past the - * memory budget before ever touching ray_col_load. */ + * memory budget before ever touching ray_col_load. + * + * errno handling: ENOENT is the normal first-run case and stays + * RAY_OK; any *other* stat failure (EACCES, ENOTDIR, EIO, …) is + * a real problem and must be surfaced as RAY_ERR_IO, otherwise + * the caller would silently continue with an empty sym table + * and later hit the "divergence" class of bugs this entrypoint + * was added to avoid. */ struct stat st; if (stat(sym_path, &st) == 0) { /* Allow the sym file itself plus some working headroom (2x). @@ -244,9 +252,11 @@ static ray_runtime_t* runtime_create_impl(const char* sym_path, /* RAY_ERR_CORRUPT and I/O errors are non-fatal here: * caller inspects out_sym_err to decide recovery. */ } + } else if (errno != ENOENT) { + if (out_sym_err) *out_sym_err = RAY_ERR_IO; } - /* ENOENT and other stat failures: leave out_sym_err = RAY_OK; - * an absent sym file is the normal first-run case. */ + /* ENOENT: leave out_sym_err = RAY_OK — absent sym file is the + * normal first-run case. 
*/ } /* Init language (env + builtins) — must be after __VM is set and diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 200a4163..2ad9af86 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -31,6 +31,7 @@ #include "lang/env.h" #include "table/sym.h" #include "ops/ops.h" +#include "ops/internal.h" /* col_propagate_str_pool */ #include #include @@ -598,7 +599,8 @@ int dl_stratify(dl_program_t* prog) { * of src. If target==RAY_F64 and src is RAY_I64, promote. Returns new owned column. */ static ray_t* dl_col_as_f64(ray_t* src, int64_t nrows) { ray_t* out = ray_vec_new(RAY_F64, nrows); - if (!out || RAY_IS_ERR(out)) return NULL; + if (!out) return NULL; + if (RAY_IS_ERR(out)) { ray_error_free(out); return NULL; } out->len = nrows; double* od = (double*)ray_data(out); if (src->type == RAY_F64) { @@ -621,7 +623,8 @@ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, switch (expr->kind) { case DL_EXPR_CONST: { ray_t* col = ray_vec_new(RAY_I64, nrows); - if (!col || RAY_IS_ERR(col)) return NULL; + if (!col) return NULL; + if (RAY_IS_ERR(col)) { ray_error_free(col); return NULL; } col->len = nrows; int64_t* d = (int64_t*)ray_data(col); for (int64_t r = 0; r < nrows; r++) @@ -630,7 +633,8 @@ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, } case DL_EXPR_CONST_F64: { ray_t* col = ray_vec_new(RAY_F64, nrows); - if (!col || RAY_IS_ERR(col)) return NULL; + if (!col) return NULL; + if (RAY_IS_ERR(col)) { ray_error_free(col); return NULL; } col->len = nrows; double* d = (double*)ray_data(col); for (int64_t r = 0; r < nrows; r++) @@ -644,7 +648,8 @@ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, if (src->type != RAY_I64 && src->type != RAY_F64) return NULL; size_t elem = (src->type == RAY_F64) ? 
sizeof(double) : sizeof(int64_t); ray_t* dst = ray_vec_new(src->type, nrows); - if (!dst || RAY_IS_ERR(dst)) return NULL; + if (!dst) return NULL; + if (RAY_IS_ERR(dst)) { ray_error_free(dst); return NULL; } dst->len = nrows; memcpy(ray_data(dst), ray_data(src), (size_t)nrows * elem); return dst; @@ -668,7 +673,9 @@ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, return NULL; } ray_t* out = ray_vec_new(RAY_F64, nrows); - if (!out || RAY_IS_ERR(out)) { + if (!out) { ray_release(lf); ray_release(rf); return NULL; } + if (RAY_IS_ERR(out)) { + ray_error_free(out); ray_release(lf); ray_release(rf); return NULL; } out->len = nrows; @@ -688,7 +695,9 @@ static ray_t* dl_eval_expr(dl_expr_t* expr, ray_t* accum, return out; } ray_t* out = ray_vec_new(RAY_I64, nrows); - if (!out || RAY_IS_ERR(out)) { + if (!out) { ray_release(lv); ray_release(rv); return NULL; } + if (RAY_IS_ERR(out)) { + ray_error_free(out); ray_release(lv); ray_release(rv); return NULL; } out->len = nrows; @@ -1069,6 +1078,12 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, return ray_error("type", "dl_project: unsupported column type"); } memcpy(ray_data(dst), ray_data(src), (size_t)nrows * (size_t)esz); + /* RAY_STR stores 16-byte ray_str_t handles inline; strings >12 + * bytes keep their bytes in a per-vector pool referenced via + * pool_off. The memcpy above copies the handles but not the + * pool, so propagate the source's pool onto dst or later + * reads through pool_off would land in a NULL pool. */ + if (src->type == RAY_STR) col_propagate_str_pool(dst, src); ray_t* next = ray_table_add_col(out, head_rel->col_names[c], dst); ray_release(dst); /* ray_table_add_col consumes `out` via ray_cow on success. 
On diff --git a/test/test_main.c b/test/test_main.c index 13ece139..d2ebe822 100644 --- a/test/test_main.c +++ b/test/test_main.c @@ -67,6 +67,7 @@ extern MunitSuite test_format_suite; extern MunitSuite test_datalog_suite; extern MunitSuite test_rowsel_suite; extern MunitSuite test_embedding_suite; +extern MunitSuite test_runtime_suite; static MunitSuite child_suites[] = { /* { .prefix, .tests, .suites, .iterations, .options } */ @@ -103,6 +104,7 @@ static MunitSuite child_suites[] = { { "/datalog", NULL, NULL, 0, 0 }, { "/rowsel", NULL, NULL, 0, 0 }, { "/embedding",NULL, NULL, 0, 0 }, + { "/runtime", NULL, NULL, 0, 0 }, { NULL, NULL, NULL, 0, 0 }, /* terminator */ }; @@ -149,6 +151,7 @@ int main(int argc, char* argv[]) { child_suites[30] = test_datalog_suite; child_suites[31] = test_rowsel_suite; child_suites[32] = test_embedding_suite; + child_suites[33] = test_runtime_suite; return munit_suite_main(&root_suite, NULL, argc, argv); } diff --git a/test/test_runtime.c b/test/test_runtime.c new file mode 100644 index 00000000..695a29fa --- /dev/null +++ b/test/test_runtime.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2025-2026 Anton Kundenko + * All rights reserved. + */ + +#define _DEFAULT_SOURCE /* mkdtemp, strdup */ + +#include "munit.h" +#include +#include +#include +#include +#include +#include + +/* Runtime API forward-declared here because core/runtime.h's `ray_vm_t` + * definition collides with lang/eval.h's `ray_vm_t` when both are pulled + * into the same TU (pre-existing duplication). ray_err_t already comes + * from above. 
*/ +typedef struct ray_runtime_s ray_runtime_t; +ray_runtime_t* ray_runtime_create(int argc, char** argv); +ray_runtime_t* ray_runtime_create_with_sym(const char* sym_path); +ray_runtime_t* ray_runtime_create_with_sym_err(const char* sym_path, + ray_err_t* out_sym_err); +void ray_runtime_destroy(ray_runtime_t* rt); +extern ray_runtime_t* __RUNTIME; + +/* Import RAY_OK / RAY_ERR_IO enum values from rayforce.h -- they live in + * the existing ray_err_t enum and are exposed via ray_err_from_obj / + * ray_err_code_str; numeric values are part of the public surface. */ + +static char* make_tmpdir(void) { + char tmpl[] = "/tmp/rayforce-rt-test-XXXXXX"; + char* dir = mkdtemp(tmpl); + if (!dir) return NULL; + return strdup(tmpl); +} + +/* Absent sym file: stat fails with ENOENT, which is the "first run" + * normal case. out_sym_err must stay RAY_OK and runtime must come up. */ +static MunitResult test_create_with_sym_absent_is_ok(const void* params, void* fixture) { + (void)params; (void)fixture; + char* dir = make_tmpdir(); + munit_assert_ptr_not_null(dir); + char path[256]; + snprintf(path, sizeof(path), "%s/missing.sym", dir); + + ray_err_t err = RAY_ERR_OOM; /* poison — should be overwritten */ + ray_runtime_t* rt = ray_runtime_create_with_sym_err(path, &err); + munit_assert_ptr_not_null(rt); + munit_assert_int((int)err, ==, (int)RAY_OK); + + ray_runtime_destroy(rt); + rmdir(dir); + free(dir); + return MUNIT_OK; +} + +/* Non-ENOENT stat failure must surface as RAY_ERR_IO. We hit this by + * passing a path whose parent exists but isn't a directory (ENOTDIR) — + * portable across Linux/macOS without needing root or chmod games. */ +static MunitResult test_create_with_sym_io_error_surfaces(const void* params, void* fixture) { + (void)params; (void)fixture; + char* dir = make_tmpdir(); + munit_assert_ptr_not_null(dir); + + /* Create a regular file, then ask to stat a path that treats it as a + * directory prefix — POSIX returns ENOTDIR. 
*/ + char blocker[256], path[256]; + snprintf(blocker, sizeof(blocker), "%s/not-a-dir", dir); + snprintf(path, sizeof(path), "%s/not-a-dir/sym", dir); + FILE* f = fopen(blocker, "w"); + munit_assert_ptr_not_null(f); + fclose(f); + + ray_err_t err = RAY_OK; + ray_runtime_t* rt = ray_runtime_create_with_sym_err(path, &err); + munit_assert_ptr_not_null(rt); + munit_assert_int((int)err, !=, (int)RAY_OK); + + ray_runtime_destroy(rt); + unlink(blocker); + rmdir(dir); + free(dir); + return MUNIT_OK; +} + +/* The plain (non-_err) variant discards load result; runtime still comes + * up cleanly regardless of sym-file state. */ +static MunitResult test_create_with_sym_plain_variant_absent(const void* params, void* fixture) { + (void)params; (void)fixture; + char* dir = make_tmpdir(); + munit_assert_ptr_not_null(dir); + char path[256]; + snprintf(path, sizeof(path), "%s/also-missing.sym", dir); + + ray_runtime_t* rt = ray_runtime_create_with_sym(path); + munit_assert_ptr_not_null(rt); + + ray_runtime_destroy(rt); + rmdir(dir); + free(dir); + return MUNIT_OK; +} + +static MunitTest runtime_tests[] = { + { "/create_with_sym_absent_is_ok", test_create_with_sym_absent_is_ok, NULL, NULL, 0, NULL }, + { "/create_with_sym_io_error_surfaces", test_create_with_sym_io_error_surfaces, NULL, NULL, 0, NULL }, + { "/create_with_sym_plain_variant_absent", test_create_with_sym_plain_variant_absent, NULL, NULL, 0, NULL }, + { NULL, NULL, NULL, NULL, 0, NULL }, +}; + +MunitSuite test_runtime_suite = { "/runtime", runtime_tests, NULL, 1, 0 }; From 973ee7690c0b3452934220d7264e2fad2742775b Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 14:58:17 +0200 Subject: [PATCH 42/51] fix(datalog,runtime): address round-5 Copilot review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five comments from 2026-04-23T12:12: * dl_filter_eq SYM correctness (src/ops/datalog.c:~934): the constant- filter path read key columns as int64_t*, which miscompares and 
overreads against adaptive-width RAY_SYM vectors (W8/W16/W32). New dl_col_eq_row helper dispatches on type and uses ray_read_sym for RAY_SYM; the output-column copy is element-size-aware (memcpy of esz bytes per surviving row, allocator picks sym_vec_new for SYM so the narrow width is preserved). Non-numeric/non-sym key columns fall through unfiltered — matches the existing pass-through refcount convention. Also propagates str_pool when the filtered column is RAY_STR. * Scalar aggregate allocation failure (src/ops/datalog.c:~1594): a NULL or RAY_ERROR from ray_vec_new previously did `break`, silently leaving agg_target_var unbound and letting evaluation continue with a half-constructed rule (and leaking the error block). Now release accum, free the error via ray_error_free, set prog->eval_err, and return NULL so dl_eval reports failure. * dl_project empty-accum SYM width (src/ops/datalog.c:~1037): the empty-accum fallback built ecol via ray_vec_new(RAY_SYM, 0) which always lands at W64 — mismatching a head relation that uses narrower SYM attrs. Mirror hcol->attrs width via ray_sym_vec_new(..., 0) when the head-rel column is RAY_SYM so downstream table_union sees a matching schema instead of tripping the column-count guard. * /runtime/create_with_sym_io_error_surfaces now pins the exact code (RAY_ERR_IO) — drifting the errno->code mapping will fail loudly instead of sliding by on a "not RAY_OK" check. * Two new targeted runtime tests: /runtime/create_with_sym_corrupt_file writes garbage bytes to the sym path and asserts out_sym_err goes non-RAY_OK (ray_sym_load flags RAY_ERR_CORRUPT on header validation). /runtime/create_with_sym_load_preserves_user_ids exercises the whole persistence promise end-to-end: intern "rayforce-user-marker", ray_sym_save, destroy, reload via ray_runtime_create_with_sym_err, re-intern the same string and assert the ID is stable. 
(Oversized- file / RAY_ERR_OOM isn't tested because mem_budget is a large fraction of RAM and can't be overridden from a test without a dependency-injection hook — noted in the commit but not blocking.) 706/706 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 68 ++++++++++++++++++++++++++++++------- test/test_runtime.c | 82 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 137 insertions(+), 13 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 2ad9af86..a2601a14 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -931,38 +931,63 @@ static ray_t* dl_antijoin_tables(ray_t* left, ray_t* right, } /* Helper: filter a table to rows where column col_idx == value */ +/* Row-at-index read helper: read an I64 from either a RAY_I64 column + * or from a RAY_SYM column (of any adaptive width) as a sym ID. Other + * types aren't supported by the constant-filter path and cause the + * caller to pass through the input table unchanged. */ +static bool dl_col_eq_row(ray_t* col, int64_t row, int64_t value) { + if (col->type == RAY_I64) return ((int64_t*)ray_data(col))[row] == value; + if (col->type == RAY_SYM) + return ray_read_sym(ray_data(col), row, col->type, col->attrs) == value; + return false; +} + static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) { if (!tbl || RAY_IS_ERR(tbl) || ray_table_nrows(tbl) == 0) return tbl; ray_t* col = ray_table_get_col_idx(tbl, col_idx); if (!col) return tbl; + /* Non-numeric, non-sym keys: not supported by this filter. Match + * the existing pass-through convention used by the empty-rows + * early-return above (caller's retain covers us). */ + if (col->type != RAY_I64 && col->type != RAY_SYM) + return tbl; int64_t nrows = ray_table_nrows(tbl); int64_t ncols = ray_table_ncols(tbl); - int64_t* data = (int64_t*)ray_data(col); - /* Count matching rows */ + /* Count matching rows — type-aware read for RAY_SYM adaptive width. 
*/ int64_t count = 0; for (int64_t r = 0; r < nrows; r++) - if (data[r] == value) count++; + if (dl_col_eq_row(col, r, value)) count++; if (count == nrows) { ray_retain(tbl); return tbl; } - /* Build filtered table */ + /* Build filtered table. Each surviving column is allocated with + * its source's element-size (via ray_sym_elem_size) so narrow-SYM + * stays narrow rather than being silently widened to W64. */ ray_t* out = ray_table_new((int)ncols); for (int64_t c = 0; c < ncols; c++) { ray_t* src = ray_table_get_col_idx(tbl, c); if (!src) continue; - ray_t* dst = ray_vec_new(src->type, count); + ray_t* dst = (src->type == RAY_SYM) + ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, count) + : ray_vec_new(src->type, count); if (!dst || RAY_IS_ERR(dst)) continue; dst->len = count; - int64_t* src_d = (int64_t*)ray_data(src); - int64_t* dst_d = (int64_t*)ray_data(dst); + uint8_t esz = ray_sym_elem_size(src->type, src->attrs); + const uint8_t* src_b = (const uint8_t*)ray_data(src); + uint8_t* dst_b = (uint8_t*)ray_data(dst); int64_t j = 0; for (int64_t r = 0; r < nrows; r++) { - if (data[r] == value) - dst_d[j++] = src_d[r]; + if (dl_col_eq_row(col, r, value)) { + memcpy(dst_b + (size_t)j * esz, + src_b + (size_t)r * esz, + (size_t)esz); + j++; + } } + if (src->type == RAY_STR) col_propagate_str_pool(dst, src); out = ray_table_add_col(out, ray_table_col_name(tbl, c), dst); ray_release(dst); } @@ -1034,7 +1059,14 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, if (empty_accum && head_rel && head_rel->table) { ray_t* hcol = ray_table_get_col_idx(head_rel->table, c); int8_t htype = hcol ? hcol->type : RAY_I64; - ray_t* ecol = ray_vec_new(htype, 0); + /* For SYM columns, preserve the head-relation's + * adaptive-width attrs — ray_vec_new(RAY_SYM, …) would + * force W64 and a later table_union onto a narrower + * head-rel column would hit the column-count check, + * or worse, produce a width-mismatched merge. 
*/ + ray_t* ecol = (htype == RAY_SYM && hcol) + ? ray_sym_vec_new(hcol->attrs & RAY_SYM_W_MASK, 0) + : ray_vec_new(htype, 0); if (!ecol) { ray_release(out); return ray_error("memory", "dl_project: empty col"); @@ -1560,8 +1592,20 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, if (nrows == 0) break; ray_t* new_col = ray_vec_new(is_float ? RAY_F64 : RAY_I64, nrows); - if (!new_col || RAY_IS_ERR(new_col)) - break; + /* Silent break would leave agg_target_var unbound and eval + * would keep running with a partially-constructed rule — + * surface the allocation failure so dl_eval returns -1. */ + if (!new_col) { + ray_release(accum); + prog->eval_err = true; + return NULL; + } + if (RAY_IS_ERR(new_col)) { + ray_error_free(new_col); + ray_release(accum); + prog->eval_err = true; + return NULL; + } new_col->len = nrows; if (is_float) { double* nd = (double*)ray_data(new_col); diff --git a/test/test_runtime.c b/test/test_runtime.c index 695a29fa..c9bbc920 100644 --- a/test/test_runtime.c +++ b/test/test_runtime.c @@ -76,7 +76,10 @@ static MunitResult test_create_with_sym_io_error_surfaces(const void* params, vo ray_err_t err = RAY_OK; ray_runtime_t* rt = ray_runtime_create_with_sym_err(path, &err); munit_assert_ptr_not_null(rt); - munit_assert_int((int)err, !=, (int)RAY_OK); + /* Pin the exact error code — the contract maps every non-ENOENT + * stat failure to RAY_ERR_IO, so drift in the mapping should fail + * this test loudly. */ + munit_assert_int((int)err, ==, (int)RAY_ERR_IO); ray_runtime_destroy(rt); unlink(blocker); @@ -103,10 +106,87 @@ static MunitResult test_create_with_sym_plain_variant_absent(const void* params, return MUNIT_OK; } +/* Corrupt sym file must surface as RAY_ERR_CORRUPT via the _err variant + * (not silently downgraded to RAY_OK). We fake a corrupt file by + * writing random bytes — ray_sym_load expects a serialized RAY_LIST of + * -RAY_STR entries, so arbitrary bytes will fail its header validation. 
*/ +static MunitResult test_create_with_sym_corrupt_file(const void* params, void* fixture) { + (void)params; (void)fixture; + char* dir = make_tmpdir(); + munit_assert_ptr_not_null(dir); + + char path[256]; + snprintf(path, sizeof(path), "%s/corrupt.sym", dir); + FILE* f = fopen(path, "wb"); + munit_assert_ptr_not_null(f); + /* Pre-pad past the ray_t header (32 bytes) with identifiable garbage. */ + unsigned char garbage[128]; + for (size_t i = 0; i < sizeof(garbage); i++) garbage[i] = (unsigned char)(i * 37 + 1); + fwrite(garbage, 1, sizeof(garbage), f); + fclose(f); + + ray_err_t err = RAY_OK; + ray_runtime_t* rt = ray_runtime_create_with_sym_err(path, &err); + munit_assert_ptr_not_null(rt); + munit_assert_int((int)err, !=, (int)RAY_OK); + + ray_runtime_destroy(rt); + unlink(path); + rmdir(dir); + free(dir); + return MUNIT_OK; +} + +/* Load-before-builtins ordering is the whole reason + * ray_runtime_create_with_sym exists: after a save/destroy/load cycle, + * user-interned sym IDs must occupy exactly the slots they had before, + * while builtins append afterwards. Intern a distinctive name, save, + * tear down, reload via the persistent-consumer entrypoint, and verify + * the same string interns to the same ID. */ +static MunitResult test_create_with_sym_load_preserves_user_ids(const void* params, void* fixture) { + (void)params; (void)fixture; + char* dir = make_tmpdir(); + munit_assert_ptr_not_null(dir); + + char path[256]; + snprintf(path, sizeof(path), "%s/ids.sym", dir); + + /* Phase 1: intern a name then persist the sym table. */ + ray_runtime_t* rt1 = ray_runtime_create(0, NULL); + munit_assert_ptr_not_null(rt1); + int64_t id_before = ray_sym_intern("rayforce-user-marker", 20); + munit_assert_int((int)ray_sym_save(path), ==, (int)RAY_OK); + ray_runtime_destroy(rt1); + + /* Phase 2: bring up a fresh runtime via the _with_sym variant so the + * persisted table is loaded before builtins register. 
*/ + ray_err_t err = RAY_ERR_OOM; + ray_runtime_t* rt2 = ray_runtime_create_with_sym_err(path, &err); + munit_assert_ptr_not_null(rt2); + munit_assert_int((int)err, ==, (int)RAY_OK); + + /* Same string must re-intern to the same ID (not shift because of + * builtins claiming the low slots first). */ + int64_t id_after = ray_sym_intern("rayforce-user-marker", 20); + munit_assert_int((int)id_after, ==, (int)id_before); + + ray_runtime_destroy(rt2); + unlink(path); + /* ray_sym_save may also create a lock file. */ + char lock_path[320]; + snprintf(lock_path, sizeof(lock_path), "%s.lk", path); + unlink(lock_path); + rmdir(dir); + free(dir); + return MUNIT_OK; +} + static MunitTest runtime_tests[] = { { "/create_with_sym_absent_is_ok", test_create_with_sym_absent_is_ok, NULL, NULL, 0, NULL }, { "/create_with_sym_io_error_surfaces", test_create_with_sym_io_error_surfaces, NULL, NULL, 0, NULL }, { "/create_with_sym_plain_variant_absent", test_create_with_sym_plain_variant_absent, NULL, NULL, 0, NULL }, + { "/create_with_sym_corrupt_file", test_create_with_sym_corrupt_file, NULL, NULL, 0, NULL }, + { "/create_with_sym_load_preserves_user_ids", test_create_with_sym_load_preserves_user_ids, NULL, NULL, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 2759d34b405f09e9213cade9ed09fcda6e89828d Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 15:05:40 +0200 Subject: [PATCH 43/51] fix(datalog): dl_filter_eq pass-through must retain (owned-ref contract) dl_filter_eq's pass-through branches (NULL input, RAY_IS_ERR input, empty-rows input, missing key column, non-I64/non-SYM key column) used to return the input pointer without bumping its refcount. Combined with the caller idiom ray_retain(body_tbl); ray_t* filtered = dl_filter_eq(body_tbl, c, const); ray_release(body_tbl); body_tbl = filtered; this meant the pass-through paths produced a `filtered` whose refcount was *one less* than the caller assumed. 
Later `ray_release(body_tbl)` calls (in dl_compile_rule's join paths and the DL_NEG antijoin) could then drop the underlying rel->table to rc=0 and free it out from under the program's own EDB storage. Tighten the contract so every exit from dl_filter_eq returns an owned reference: * empty-rows short-circuit: retain before return. * missing-column short-circuit: retain before return. * non-I64/non-SYM key pass-through: retain before return. * RAY_IS_ERR input: retain iff non-NULL (caller will see RAY_IS_ERR and free via ray_error_free). The count==nrows branch already retained. Callers are unchanged; the fix simply matches their existing expectations. 706/706 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index a2601a14..86154d45 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -943,15 +943,22 @@ static bool dl_col_eq_row(ray_t* col, int64_t row, int64_t value) { } static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) { - if (!tbl || RAY_IS_ERR(tbl) || ray_table_nrows(tbl) == 0) return tbl; + /* Contract: always return an owned reference (rc bumped) so the + * caller can release uniformly. Every pass-through must therefore + * retain — else the caller's `ray_release(body_tbl); body_tbl = + * filtered;` pattern would leave body_tbl under-referenced and a + * later release could land on freed memory. */ + if (!tbl || RAY_IS_ERR(tbl)) { if (tbl) ray_retain(tbl); return tbl; } + if (ray_table_nrows(tbl) == 0) { ray_retain(tbl); return tbl; } ray_t* col = ray_table_get_col_idx(tbl, col_idx); - if (!col) return tbl; - /* Non-numeric, non-sym keys: not supported by this filter. Match - * the existing pass-through convention used by the empty-rows - * early-return above (caller's retain covers us). 
*/ - if (col->type != RAY_I64 && col->type != RAY_SYM) + if (!col) { ray_retain(tbl); return tbl; } + /* Non-numeric, non-sym keys: not supported by this filter — pass + * through (retained) rather than miscompare via raw memcpy. */ + if (col->type != RAY_I64 && col->type != RAY_SYM) { + ray_retain(tbl); return tbl; + } int64_t nrows = ray_table_nrows(tbl); int64_t ncols = ray_table_ncols(tbl); From 86880ab1033232202282ee0021797fd2f8df6303 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 15:43:41 +0200 Subject: [PATCH 44/51] fix(datalog): surface head-const conflicts via eval_err, drop stderr warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two datalog paths were writing diagnostic messages to stderr from non-debug builds instead of surfacing the failure through the normal error channel: * dl_idb_align_head_const_types (src/ops/datalog.c): head-const type conflict between rules used to `fprintf(stderr, ...)` and return without flagging the program. Removed the stderr write and set prog->eval_err = true instead. dl_eval now short-circuits when eval_err is already set at entry (sticky flag), so the conflict detected at rule-add time reaches dl_eval's return without being reset. Callers like ray_query_fn turn that -1 into a proper "query: evaluation failed" ray_error. * dl_compile_rule grouped-aggregate + positive body atoms (nyi path): same treatment — remove stderr warning, set prog->eval_err. Regression test /datalog/rule_head_const_type_conflict builds a program with two rules whose head-const types collide on the same slot and asserts dl_eval returns -1. 707/707 pass on debug (ASan+UBSan) and release. 
--- src/ops/datalog.c | 31 +++++++++++++++++++++---------- test/test_datalog.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 86154d45..856abe92 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -182,9 +182,12 @@ static void dl_idb_align_head_const_types(dl_program_t* prog, const dl_rule_t* r if (want == 0) { desired[c] = cur; } else if (cur != RAY_I64 && cur != want) { - /* Slot already typed by a prior rule to a different type. */ - fprintf(stderr, "dl: head-const type conflict at slot %d: " - "existing %d vs rule %d\n", c, cur, want); + /* First-non-zero-wins policy: once a slot is committed to a + * non-default type by a prior rule, any later rule that + * disagrees is a program-level conflict. Mark the program + * so dl_eval (which reads eval_err after evaluation) reports + * failure — no stderr write from a non-debug code path. */ + prog->eval_err = true; return; } else { desired[c] = want; @@ -1363,8 +1366,12 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, if (rule->body[bi].type == DL_POS) { has_pos = true; break; } } if (has_pos) { - fprintf(stderr, "dl: grouped aggregate with positive body atoms not yet supported\n"); + /* nyi: grouped aggregate + positive body atoms. + * Surface via eval_err so dl_eval reports failure + * instead of writing a warning to stderr in a + * non-debug build. */ ray_release(accum); + prog->eval_err = true; return NULL; } @@ -2263,12 +2270,16 @@ static void dl_build_provenance(dl_program_t* prog) { int dl_eval(dl_program_t* prog) { if (!prog) return -1; - /* Reset the compile/eval error flag at the top of each eval. Rule - * compilation or ray_execute paths may set it on unrecoverable failure - * (e.g. dl_project OOM); we return -1 at the end if it was raised so - * ray_query_fn and other callers can surface "evaluation failed" rather - * than silently returning an empty/partial result. 
*/ - prog->eval_err = false; + /* eval_err is sticky: it may have been raised at rule-add time (e.g. + * by a head-const type conflict in dl_idb_align_head_const_types) — + * resetting here would silently discard that signal. Additional + * failures during stratify/compile/exec below keep setting the flag, + * and the final return honors it either way. */ + if (prog->eval_err) { + /* Short-circuit: compile-time errors already stand; don't run + * a potentially broken fixpoint. */ + return -1; + } /* Stratify if not already done */ if (prog->n_strata == 0) { diff --git a/test/test_datalog.c b/test/test_datalog.c index 9dd26ec2..3ffa6bac 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1374,6 +1374,45 @@ static MunitResult test_rule_head_const_stratification(const void* params, void* return MUNIT_OK; } +/* Conflicting head-const types for the same IDB slot across rules must + * surface via dl_eval == -1 rather than a silent stderr print. Rule A + * commits slot 1 to RAY_SYM; rule B tries to commit it to RAY_F64. */ +static MunitResult test_rule_head_const_type_conflict(const void* params, void* fixture) { + (void)params; (void)fixture; + int64_t xs[] = {1}; + ray_t* xc = ray_vec_from_raw(RAY_I64, xs, 1); + ray_t* src = ray_table_new(1); + src = ray_table_add_col(src, ray_sym_intern("src__c0", 7), xc); + + dl_program_t* prog = dl_program_new(); + dl_add_edb(prog, "src", src, 1); + + /* Rule A: (tag ?x "sym") — slot 1 committed to RAY_SYM. */ + dl_rule_t a; dl_rule_init(&a, "tag", 2); + dl_rule_head_var(&a, 0, 0); + dl_rule_head_const(&a, 1, ray_sym_intern("sym", 3), RAY_SYM); + int ab = dl_rule_add_atom(&a, "src", 1); + dl_body_set_var(&a, ab, 0, 0); + a.n_vars = 1; + munit_assert_int(dl_add_rule(prog, &a), >=, 0); + + /* Rule B: (tag ?x 3.14) — conflicting head-const type for slot 1. 
*/ + dl_rule_t b; dl_rule_init(&b, "tag", 2); + dl_rule_head_var(&b, 0, 0); + dl_rule_head_const_f64(&b, 1, 3.14); + int bb = dl_rule_add_atom(&b, "src", 1); + dl_body_set_var(&b, bb, 0, 0); + b.n_vars = 1; + munit_assert_int(dl_add_rule(prog, &b), >=, 0); + + /* Conflict must surface as dl_eval == -1 (no stderr print). */ + munit_assert_int(dl_eval(prog), ==, -1); + + dl_program_free(prog); + ray_release(src); ray_release(xc); + return MUNIT_OK; +} + /* Surface syntax round-trip for head constants: (rule (foo "a" ?x) ...) */ static MunitResult test_rule_head_const_surface_syntax(const void* params, void* fixture) { (void)params; (void)fixture; @@ -1964,6 +2003,7 @@ static MunitTest datalog_tests[] = { { "/rule_head_const_with_agg", test_rule_head_const_with_agg, datalog_setup, datalog_teardown, 0, NULL }, { "/rule_head_const_with_negation", test_rule_head_const_with_negation, datalog_setup, datalog_teardown, 0, NULL }, { "/rule_head_const_stratification", test_rule_head_const_stratification, datalog_setup, datalog_teardown, 0, NULL }, + { "/rule_head_const_type_conflict", test_rule_head_const_type_conflict, datalog_setup, datalog_teardown, 0, NULL }, { "/rule_head_const_surface_syntax", test_rule_head_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/rule_body_const_surface_syntax", test_rule_body_const_surface_syntax, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, { "/env_bound_edb_auto_register", test_env_bound_edb_auto_register, datalog_rf_setup, datalog_rf_teardown, 0, NULL }, From 916d84a7cd073131273f3875c6795155553a8135 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 16:05:48 +0200 Subject: [PATCH 45/51] fix(datalog,runtime): address round-6+7 Copilot review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seven distinct comments across the 13:35 / 13:54 review rounds: * Aggregate-only rule fallback (datalog.c:~1276): one_val / accum / add_col allocation failures previously 
returned NULL without setting eval_err or freeing RAY_ERROR blocks. All paths now set prog->eval_err = true and route errors through ray_error_free(). * Grouped-aggregate compile (datalog.c:~1414..1468): added NULL checks around ray_graph_new, ray_sym_str, every ray_scan, ray_group, and ray_execute. On any failure, free the sub-graph, release accum, set eval_err, and return NULL. Execute errors that were RAY_ERROR are now freed via ray_error_free instead of leaking. * DL_CMP type awareness (datalog.c:~1746): comparison loop used to read both sides as int64_t*, miscomparing RAY_F64 columns produced by dl_eval_expr's float path. Now promotes to f64 iff either side is f64 and rejects non-numeric sources with eval_err. Filter-copy loop switched to element-size-aware memcpy so f64/narrow-SYM columns survive the mask unchanged, with str_pool propagation for RAY_STR. * table_union concat (datalog.c:~2055): propagate the original ray_vec_concat error (e.g. "type" for schema mismatch) instead of always rewriting it to a generic "memory" error. * dl_filter_eq (datalog.c:~979): added proper error checks for ray_table_new, ray_vec_new, and ray_table_add_col. Previously a failure in any of those silently `continue`-d past the column, yielding a partial table. * Corrupt-file test (test_runtime.c): the fake-garbage payload was tripping ray_col_load's header parser (RAY_ERR_NYI) before the sym-table-specific corrupt path. Now writes STR_LIST_MAGIC + inflated string count so col_load_str_list hits its truncated- body check, which returns ray_error("corrupt", ...) — mapping back to the exact RAY_ERR_CORRUPT we assert. * Oversized-file OOM test (test_runtime.c): new /runtime/create_with_sym_oversized_file uses ftruncate to produce a 10 EB sparse sym file and asserts out_sym_err == RAY_ERR_OOM. Filesystems that reject the giant ftruncate (tmpfs) return MUNIT_SKIP rather than a spurious failure. 
* test_datalog.c: added <stdio.h> / <stdlib.h> so fprintf/abort() in datalog_rf_setup have explicit declarations under -Werror (they were arriving transitively before but the coverage was fragile). 708/708 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 210 +++++++++++++++++++++++++++++++++++--------- test/test_datalog.c | 2 + test/test_runtime.c | 59 +++++++++++-- 3 files changed, 224 insertions(+), 47 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 856abe92..caecff8d 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -977,13 +977,19 @@ static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) { * its source's element-size (via ray_sym_elem_size) so narrow-SYM * stays narrow rather than being silently widened to W64. */ ray_t* out = ray_table_new((int)ncols); + if (!out) return ray_error("memory", "dl_filter_eq: table_new"); + if (RAY_IS_ERR(out)) return out; for (int64_t c = 0; c < ncols; c++) { ray_t* src = ray_table_get_col_idx(tbl, c); - if (!src) continue; + if (!src) { + ray_release(out); + return ray_error("domain", "dl_filter_eq: missing source column"); + } ray_t* dst = (src->type == RAY_SYM) ? 
ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, count) : ray_vec_new(src->type, count); - if (!dst || RAY_IS_ERR(dst)) continue; + if (!dst) { ray_release(out); return ray_error("memory", "dl_filter_eq: vec_new"); } + if (RAY_IS_ERR(dst)) { ray_error_free(dst); ray_release(out); return ray_error("memory", "dl_filter_eq: vec_new"); } dst->len = count; uint8_t esz = ray_sym_elem_size(src->type, src->attrs); const uint8_t* src_b = (const uint8_t*)ray_data(src); @@ -998,8 +1004,11 @@ static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) { } } if (src->type == RAY_STR) col_propagate_str_pool(dst, src); - out = ray_table_add_col(out, ray_table_col_name(tbl, c), dst); + ray_t* next = ray_table_add_col(out, ray_table_col_name(tbl, c), dst); ray_release(dst); + if (!next) return ray_error("memory", "dl_filter_eq: add_col"); + if (RAY_IS_ERR(next)) return next; + out = next; } return out; } @@ -1274,16 +1283,36 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, if (!has_agg) return NULL; ray_t* one_val = ray_vec_new(RAY_I64, 1); - if (!one_val || RAY_IS_ERR(one_val)) + if (!one_val) { prog->eval_err = true; return NULL; } + if (RAY_IS_ERR(one_val)) { + ray_error_free(one_val); + prog->eval_err = true; return NULL; + } one_val->len = 1; ((int64_t*)ray_data(one_val))[0] = 0; accum = ray_table_new(1); + if (!accum) { + ray_release(one_val); + prog->eval_err = true; + return NULL; + } + if (RAY_IS_ERR(accum)) { + ray_error_free(accum); + ray_release(one_val); + prog->eval_err = true; + return NULL; + } int64_t unit_sym = ray_sym_intern("_unit", 5); - accum = ray_table_add_col(accum, unit_sym, one_val); + ray_t* accum_unit = ray_table_add_col(accum, unit_sym, one_val); ray_release(one_val); - if (!accum || RAY_IS_ERR(accum)) + if (!accum_unit) { prog->eval_err = true; return NULL; } + if (RAY_IS_ERR(accum_unit)) { + ray_error_free(accum_unit); + prog->eval_err = true; return NULL; + } + accum = accum_unit; } if (!accum) return NULL; @@ -1392,7 +1421,11 @@ 
ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, /* Build a sub-graph that SCANs src_table's columns by symbol name. * ray_graph_new retains src_table internally; no extra retain needed. */ ray_graph_t* gg = ray_graph_new(src_table); - if (!gg) { ray_release(accum); return NULL; } + if (!gg) { + ray_release(accum); + prog->eval_err = true; + return NULL; + } ray_op_t* keys_ops[DL_AGG_MAX_KEYS]; for (int i = 0; i < nk; i++) { @@ -1405,7 +1438,19 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } int64_t sym = src_rel->col_names[kc]; ray_t* s = ray_sym_str(sym); + if (!s) { + ray_graph_free(gg); + ray_release(accum); + prog->eval_err = true; + return NULL; + } keys_ops[i] = ray_scan(gg, ray_str_ptr(s)); + if (!keys_ops[i]) { + ray_graph_free(gg); + ray_release(accum); + prog->eval_err = true; + return NULL; + } } /* Agg input: value column (for COUNT we still pass a column; any @@ -1422,7 +1467,19 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } if (value_col < 0 || value_col >= src_rel->arity) value_col = 0; ray_t* vs = ray_sym_str(src_rel->col_names[value_col]); + if (!vs) { + ray_graph_free(gg); + ray_release(accum); + prog->eval_err = true; + return NULL; + } ray_op_t* agg_in = ray_scan(gg, ray_str_ptr(vs)); + if (!agg_in) { + ray_graph_free(gg); + ray_release(accum); + prog->eval_err = true; + return NULL; + } uint16_t op_code; switch (body->agg_op) { @@ -1438,12 +1495,24 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, ray_op_t* ag_ins[1] = { agg_in }; ray_op_t* root = ray_group(gg, keys_ops, (uint8_t)nk, &op_code, ag_ins, 1); + if (!root) { + ray_graph_free(gg); + ray_release(accum); + prog->eval_err = true; + return NULL; + } ray_t* group_tbl = ray_execute(gg, root); ray_graph_free(gg); - if (!group_tbl || RAY_IS_ERR(group_tbl)) { - if (group_tbl) ray_release(group_tbl); + if (!group_tbl) { ray_release(accum); + prog->eval_err = true; + return NULL; + } + if (RAY_IS_ERR(group_tbl)) { + 
ray_error_free(group_tbl); + ray_release(accum); + prog->eval_err = true; return NULL; } @@ -1689,44 +1758,65 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, ray_t* lhs_evaled = NULL; ray_t* rhs_evaled = NULL; - int64_t* lhs_data; - int64_t* rhs_data; + ray_t* lhs_src = NULL; /* borrowed reference for type inspection */ + ray_t* rhs_src = NULL; if (body->cmp_lhs_expr) { - /* Expression-based LHS */ lhs_evaled = dl_eval_expr(body->cmp_lhs_expr, accum, var_col, nrows); if (!lhs_evaled || RAY_IS_ERR(lhs_evaled)) break; - lhs_data = (int64_t*)ray_data(lhs_evaled); + lhs_src = lhs_evaled; } else { - /* Simple variable LHS */ int lhs_col = var_col[body->cmp_lhs]; - ray_t* lhs_vec = ray_table_get_col_idx(accum, lhs_col); - if (!lhs_vec) break; - lhs_data = (int64_t*)ray_data(lhs_vec); + lhs_src = ray_table_get_col_idx(accum, lhs_col); + if (!lhs_src) break; } if (body->cmp_rhs_expr) { - /* Expression-based RHS */ rhs_evaled = dl_eval_expr(body->cmp_rhs_expr, accum, var_col, nrows); if (!rhs_evaled || RAY_IS_ERR(rhs_evaled)) { if (lhs_evaled) ray_release(lhs_evaled); break; } - rhs_data = (int64_t*)ray_data(rhs_evaled); + rhs_src = rhs_evaled; } else if (body->cmp_rhs != DL_CONST) { - /* Simple variable RHS */ int rhs_col = var_col[body->cmp_rhs]; - ray_t* rhs_vec = ray_table_get_col_idx(accum, rhs_col); - if (!rhs_vec) { + rhs_src = ray_table_get_col_idx(accum, rhs_col); + if (!rhs_src) { if (lhs_evaled) ray_release(lhs_evaled); break; } - rhs_data = (int64_t*)ray_data(rhs_vec); - } else { - rhs_data = NULL; /* constant RHS */ } + /* else rhs is a constant i64 body->cmp_const */ + + /* Reject non-numeric sources — DL_CMP has no meaningful + * comparison for SYM/STR columns without an ordering hook. 
*/ + bool lhs_is_f64 = lhs_src && lhs_src->type == RAY_F64; + bool rhs_is_f64 = rhs_src && rhs_src->type == RAY_F64; + if (lhs_src && lhs_src->type != RAY_I64 && lhs_src->type != RAY_F64) { + if (lhs_evaled) ray_release(lhs_evaled); + if (rhs_evaled) ray_release(rhs_evaled); + prog->eval_err = true; + ray_release(accum); + return NULL; + } + if (rhs_src && rhs_src->type != RAY_I64 && rhs_src->type != RAY_F64) { + if (lhs_evaled) ray_release(lhs_evaled); + if (rhs_evaled) ray_release(rhs_evaled); + prog->eval_err = true; + ray_release(accum); + return NULL; + } + + /* Promote to f64 iff either side is f64. Otherwise stay in + * i64 arithmetic for speed and exact integer semantics. */ + bool use_f64 = lhs_is_f64 || rhs_is_f64; + const int64_t* lhs_i = !use_f64 ? (const int64_t*)ray_data(lhs_src) : NULL; + const int64_t* rhs_i = !use_f64 && rhs_src ? (const int64_t*)ray_data(rhs_src) : NULL; + const double* lhs_f = use_f64 && !lhs_is_f64 ? NULL + : (use_f64 ? (const double*)ray_data(lhs_src) : NULL); + const double* rhs_f = use_f64 && rhs_src && rhs_is_f64 + ? (const double*)ray_data(rhs_src) : NULL; - /* Build boolean mask */ ray_t* mask_block = ray_alloc((size_t)nrows * sizeof(bool)); if (!mask_block) { if (lhs_evaled) ray_release(lhs_evaled); @@ -1736,16 +1826,37 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, bool* mask = (bool*)ray_data(mask_block); int64_t count = 0; for (int64_t r = 0; r < nrows; r++) { - int64_t rv = rhs_data ? rhs_data[r] : body->cmp_const; bool pass = false; - switch (body->cmp_op) { - case DL_CMP_EQ: pass = (lhs_data[r] == rv); break; - case DL_CMP_NE: pass = (lhs_data[r] != rv); break; - case DL_CMP_LT: pass = (lhs_data[r] < rv); break; - case DL_CMP_LE: pass = (lhs_data[r] <= rv); break; - case DL_CMP_GT: pass = (lhs_data[r] > rv); break; - case DL_CMP_GE: pass = (lhs_data[r] >= rv); break; + if (use_f64) { + /* Widen the non-f64 side — mixed arithmetic is already + * supported by dl_eval_expr, and DL_CMP_const is i64. 
*/ + double lv = lhs_is_f64 ? lhs_f[r] : (double)((const int64_t*)ray_data(lhs_src))[r]; + double rv; + if (rhs_src) + rv = rhs_is_f64 ? rhs_f[r] : (double)((const int64_t*)ray_data(rhs_src))[r]; + else + rv = (double)body->cmp_const; + switch (body->cmp_op) { + case DL_CMP_EQ: pass = (lv == rv); break; + case DL_CMP_NE: pass = (lv != rv); break; + case DL_CMP_LT: pass = (lv < rv); break; + case DL_CMP_LE: pass = (lv <= rv); break; + case DL_CMP_GT: pass = (lv > rv); break; + case DL_CMP_GE: pass = (lv >= rv); break; + } + } else { + int64_t lv = lhs_i[r]; + int64_t rv = rhs_i ? rhs_i[r] : body->cmp_const; + switch (body->cmp_op) { + case DL_CMP_EQ: pass = (lv == rv); break; + case DL_CMP_NE: pass = (lv != rv); break; + case DL_CMP_LT: pass = (lv < rv); break; + case DL_CMP_LE: pass = (lv <= rv); break; + case DL_CMP_GT: pass = (lv > rv); break; + case DL_CMP_GE: pass = (lv >= rv); break; + } } + (void)lhs_f; /* silence unused warnings in non-f64 paths */ mask[r] = pass; if (pass) count++; } @@ -1758,20 +1869,29 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, break; /* all rows pass */ } - /* Build filtered table */ + /* Build filtered table — element-size-aware memcpy so f64 + * columns and narrow-SYM columns survive the mask unchanged. */ int64_t ncols = ray_table_ncols(accum); ray_t* out = ray_table_new((int)ncols); for (int64_t c = 0; c < ncols; c++) { ray_t* src = ray_table_get_col_idx(accum, c); if (!src) continue; - ray_t* dst = ray_vec_new(src->type, count); - if (!dst || RAY_IS_ERR(dst)) continue; + ray_t* dst = (src->type == RAY_SYM) + ? 
ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, count) + : ray_vec_new(src->type, count); + if (!dst) continue; + if (RAY_IS_ERR(dst)) { ray_error_free(dst); continue; } dst->len = count; - int64_t* src_d = (int64_t*)ray_data(src); - int64_t* dst_d = (int64_t*)ray_data(dst); + uint8_t esz = ray_sym_elem_size(src->type, src->attrs); + const uint8_t* sb = (const uint8_t*)ray_data(src); + uint8_t* db = (uint8_t*)ray_data(dst); int64_t j = 0; for (int64_t r = 0; r < nrows; r++) - if (mask[r]) dst_d[j++] = src_d[r]; + if (mask[r]) { + memcpy(db + (size_t)j * esz, sb + (size_t)r * esz, esz); + j++; + } + if (src->type == RAY_STR) col_propagate_str_pool(dst, src); out = ray_table_add_col(out, ray_table_col_name(accum, c), dst); ray_release(dst); } @@ -1938,11 +2058,17 @@ static ray_t* table_union(ray_t* a, ray_t* b) { return ray_error("domain", "table_union: missing column"); } ray_t* merged = ray_vec_concat(col_a, col_b); - if (!merged || RAY_IS_ERR(merged)) { - if (merged) ray_release(merged); + if (!merged) { ray_release(out); return ray_error("memory", "table_union: concat"); } + if (RAY_IS_ERR(merged)) { + /* Propagate the original error (e.g. "type" for schema + * mismatch) so callers see the real diagnostic instead of + * a generic "memory". */ + ray_release(out); + return merged; + } ray_t* next = ray_table_add_col(out, ray_table_col_name(a, c), merged); ray_release(merged); if (!next) return ray_error("memory", "table_union: add_col"); diff --git a/test/test_datalog.c b/test/test_datalog.c index 3ffa6bac..8b6dfd2c 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -31,6 +31,8 @@ #include "ops/datalog.h" #include "table/sym.h" /* ray_read_sym for SYM column inspection */ #include "lang/eval.h" +#include /* fprintf in datalog_rf_setup */ +#include /* abort in datalog_rf_setup */ #include /* Forward-declare runtime API used by the full-runtime fixtures. 
diff --git a/test/test_runtime.c b/test/test_runtime.c index c9bbc920..9fad1fa9 100644 --- a/test/test_runtime.c +++ b/test/test_runtime.c @@ -11,6 +11,7 @@ #include #include #include +#include #include /* Runtime API forward-declared here because core/runtime.h's `ray_vm_t` @@ -119,16 +120,23 @@ static MunitResult test_create_with_sym_corrupt_file(const void* params, void* f snprintf(path, sizeof(path), "%s/corrupt.sym", dir); FILE* f = fopen(path, "wb"); munit_assert_ptr_not_null(f); - /* Pre-pad past the ray_t header (32 bytes) with identifiable garbage. */ - unsigned char garbage[128]; - for (size_t i = 0; i < sizeof(garbage); i++) garbage[i] = (unsigned char)(i * 37 + 1); - fwrite(garbage, 1, sizeof(garbage), f); + /* Write STR_LIST_MAGIC ("STRL" little-endian) followed by a truncated + * payload — header-count byte count=999 but no body — ray_col_load + * will hit col_load_str_list's "corrupt" path, which maps to + * RAY_ERR_CORRUPT via ray_err_from_obj. */ + uint32_t magic = 0x4C525453U; /* STR_LIST_MAGIC */ + int64_t count = 999; /* claims 999 strings, none present */ + fwrite(&magic, sizeof(magic), 1, f); + fwrite(&count, sizeof(count), 1, f); fclose(f); ray_err_t err = RAY_OK; ray_runtime_t* rt = ray_runtime_create_with_sym_err(path, &err); munit_assert_ptr_not_null(rt); - munit_assert_int((int)err, !=, (int)RAY_OK); + /* Pin the exact error code: the contract maps corrupted sym data + * to RAY_ERR_CORRUPT, distinct from I/O or OOM, so callers can + * decide recovery policy. */ + munit_assert_int((int)err, ==, (int)RAY_ERR_CORRUPT); ray_runtime_destroy(rt); unlink(path); @@ -181,12 +189,53 @@ static MunitResult test_create_with_sym_load_preserves_user_ids(const void* para return MUNIT_OK; } +/* Sym file whose stat st_size exceeds mem_budget/2 must trigger the + * pre-flight OOM guard and surface RAY_ERR_OOM through out_sym_err. + * We use ftruncate to create a sparse file without actually allocating + * the backing bytes. 
Budget auto-detects ~80% of RAM, so a sparse + * file ~10 EB guarantees tripping the half-budget ceiling on any + * realistic dev/CI host. */ +static MunitResult test_create_with_sym_oversized_file(const void* params, void* fixture) { + (void)params; (void)fixture; + char* dir = make_tmpdir(); + munit_assert_ptr_not_null(dir); + + char path[256]; + snprintf(path, sizeof(path), "%s/huge.sym", dir); + int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644); + munit_assert_int(fd, >=, 0); + /* 10 EB sparse — bigger than any plausible mem_budget/2. */ + off_t huge = (off_t)1 << 62; + int rc = ftruncate(fd, huge); + close(fd); + if (rc != 0) { + /* Some filesystems (tmpfs on limited hosts) reject the giant + * ftruncate — skip rather than fail spuriously. */ + unlink(path); + rmdir(dir); + free(dir); + return MUNIT_SKIP; + } + + ray_err_t err = RAY_OK; + ray_runtime_t* rt = ray_runtime_create_with_sym_err(path, &err); + munit_assert_ptr_not_null(rt); + munit_assert_int((int)err, ==, (int)RAY_ERR_OOM); + + ray_runtime_destroy(rt); + unlink(path); + rmdir(dir); + free(dir); + return MUNIT_OK; +} + static MunitTest runtime_tests[] = { { "/create_with_sym_absent_is_ok", test_create_with_sym_absent_is_ok, NULL, NULL, 0, NULL }, { "/create_with_sym_io_error_surfaces", test_create_with_sym_io_error_surfaces, NULL, NULL, 0, NULL }, { "/create_with_sym_plain_variant_absent", test_create_with_sym_plain_variant_absent, NULL, NULL, 0, NULL }, { "/create_with_sym_corrupt_file", test_create_with_sym_corrupt_file, NULL, NULL, 0, NULL }, { "/create_with_sym_load_preserves_user_ids", test_create_with_sym_load_preserves_user_ids, NULL, NULL, 0, NULL }, + { "/create_with_sym_oversized_file", test_create_with_sym_oversized_file, NULL, NULL, 0, NULL }, { NULL, NULL, NULL, NULL, 0, NULL }, }; From 5b7949b02354e848d2d737487e602d6c8db66d1d Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 16:13:10 +0200 Subject: [PATCH 46/51] fix(datalog): check dl_filter_eq return at call sites 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-6 made dl_filter_eq surface allocation/add_col failures as RAY_ERROR returns, but the two call sites in dl_compile_rule were still blindly assigning the return to body_tbl / neg_tbl and continuing. If the filter failed, the subsequent join path would then operate on a RAY_ERROR, or hit a NULL — either way turning a proper failure into undefined behavior. Both call sites (positive DL_POS body filter at ~1199 and DL_NEG filter at ~1353) now treat a NULL or RAY_ERROR return as a hard failure: release the surviving accum, free any error block via ray_error_free, set prog->eval_err = true, and return NULL from dl_compile_rule so dl_eval reports -1. 708/708 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index caecff8d..9f25a88e 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1198,6 +1198,20 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, if (body->vars[c] == DL_CONST) { ray_t* filtered = dl_filter_eq(body_tbl, c, body->const_vals[c]); ray_release(body_tbl); + if (!filtered) { + /* Treat as genuine failure — dl_filter_eq returns an + * owned reference on every non-NULL path, so NULL + * means something went wrong inside the helper. 
*/ + if (accum) ray_release(accum); + prog->eval_err = true; + return NULL; + } + if (RAY_IS_ERR(filtered)) { + ray_error_free(filtered); + if (accum) ray_release(accum); + prog->eval_err = true; + return NULL; + } body_tbl = filtered; } } @@ -1338,6 +1352,17 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, if (body->vars[c] == DL_CONST) { ray_t* filtered = dl_filter_eq(neg_tbl, c, body->const_vals[c]); ray_release(neg_tbl); + if (!filtered) { + ray_release(accum); + prog->eval_err = true; + return NULL; + } + if (RAY_IS_ERR(filtered)) { + ray_error_free(filtered); + ray_release(accum); + prog->eval_err = true; + return NULL; + } neg_tbl = filtered; } } From 3ff6a7b8262aab7079df4caa071f061cc7a00d03 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 16:38:33 +0200 Subject: [PATCH 47/51] =?UTF-8?q?fix(datalog):=20round-8=20=E2=80=94=20add?= =?UTF-8?q?=5Fcol=20leaks,=20broadcast=20SYM=20width,=20align=20eval=5Ferr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four comments from 2026-04-23T14:21: * dl_filter_eq / table_union / dl_project: ray_table_add_col failures used to return without releasing the partially-built output table. ray_table_add_col doesn't free its input on error, so this leaked the table plus every previously-added column. All four add_col sites (dl_filter_eq, table_union, dl_project — var path and both const/empty paths) now release `out` before returning the error (or return the RAY_ERROR directly when that's the most informative code). * dl_project constant-SYM broadcast width: dl_broadcast_const_col used ray_vec_new(RAY_SYM, …) which always produced W64. When a prior rule had already aligned the head relation's SYM column to a narrower width, the W64 broadcast would then fail ray_vec_concat inside table_union. 
Added a sym_width_hint parameter routed through ray_sym_vec_new, and in dl_project the caller now inspects the existing head_rel column's attrs and passes the matching width. * dl_idb_align_head_const_types: OOM / add_col failures while rebuilding the IDB's empty schema used to silently `return`, leaving the relation with its original column types and eventually causing a ray_vec_concat mismatch during evaluation. Every failure path now sets prog->eval_err = true (and still frees any RAY_ERROR via ray_error_free), so dl_eval reliably returns -1 when alignment can't complete. 708/708 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 105 +++++++++++++++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 28 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 9f25a88e..30e90751 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -196,23 +196,33 @@ static void dl_idb_align_head_const_types(dl_program_t* prog, const dl_rule_t* r } if (!any_change) return; - /* Rebuild the table with typed empty columns. ray_release() is a - * deliberate no-op for RAY_ERROR objects (see src/mem/cow.c), so every - * failure path here must pair ray_release() for the valid survivor - * with ray_error_free() for the freshly-returned error block — else - * repeated align calls would silently leak one error block each time. */ + /* Rebuild the table with typed empty columns. Alignment is required + * for later evaluation to produce type-matching table_union inputs, + * so any failure here must also set prog->eval_err = true — silently + * returning would leave the IDB schema unaligned and dl_eval would + * later hit a ray_vec_concat type mismatch without any error signal. 
*/ ray_t* fresh = ray_table_new(rel->arity); - if (!fresh) return; - if (RAY_IS_ERR(fresh)) { ray_error_free(fresh); return; } + if (!fresh) { prog->eval_err = true; return; } + if (RAY_IS_ERR(fresh)) { prog->eval_err = true; ray_error_free(fresh); return; } for (int c = 0; c < rel->arity; c++) { ray_t* empty_col = ray_vec_new(desired[c], 0); - if (!empty_col) { ray_release(fresh); return; } - if (RAY_IS_ERR(empty_col)) { ray_error_free(empty_col); ray_release(fresh); return; } + if (!empty_col) { prog->eval_err = true; ray_release(fresh); return; } + if (RAY_IS_ERR(empty_col)) { + prog->eval_err = true; + ray_error_free(empty_col); + ray_release(fresh); + return; + } ray_t* prev = fresh; fresh = ray_table_add_col(fresh, rel->col_names[c], empty_col); ray_release(empty_col); - if (!fresh) { ray_release(prev); return; } - if (RAY_IS_ERR(fresh)) { ray_release(prev); ray_error_free(fresh); return; } + if (!fresh) { prog->eval_err = true; ray_release(prev); return; } + if (RAY_IS_ERR(fresh)) { + prog->eval_err = true; + ray_release(prev); + ray_error_free(fresh); + return; + } } ray_release(rel->table); rel->table = fresh; @@ -1006,8 +1016,16 @@ static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) { if (src->type == RAY_STR) col_propagate_str_pool(dst, src); ray_t* next = ray_table_add_col(out, ray_table_col_name(tbl, c), dst); ray_release(dst); - if (!next) return ray_error("memory", "dl_filter_eq: add_col"); - if (RAY_IS_ERR(next)) return next; + /* ray_table_add_col does not release `out` on failure, so we + * must release the partially-built table before bailing out. 
*/ + if (!next) { + ray_release(out); + return ray_error("memory", "dl_filter_eq: add_col"); + } + if (RAY_IS_ERR(next)) { + ray_release(out); + return next; + } out = next; } return out; @@ -1023,18 +1041,22 @@ static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) { * onto rule-local scratch — so that the IDB relation table can outlive the * per-iteration scratch that built it. Cross-IDB reads at subsequent * strata borrow from this column via ray_table_get_col_idx. */ -static ray_t* dl_broadcast_const_col(int64_t nrows, int8_t type, int64_t val) { +/* sym_width_hint: when type == RAY_SYM, pass the desired RAY_SYM_W* value + * so the broadcast column matches the IDB relation's existing width + * (otherwise ray_vec_new defaults to W64 and later table_union would + * hit a ray_vec_concat width mismatch). Pass 0 for the default. */ +static ray_t* dl_broadcast_const_col(int64_t nrows, int8_t type, int64_t val, + uint8_t sym_width_hint) { if (type != RAY_I64 && type != RAY_SYM && type != RAY_F64) { return ray_error("type", NULL); } - ray_t* v = ray_vec_new(type, nrows); + ray_t* v = (type == RAY_SYM) + ? ray_sym_vec_new(sym_width_hint & RAY_SYM_W_MASK, nrows) + : ray_vec_new(type, nrows); if (!v || RAY_IS_ERR(v)) return v; v->len = nrows; if (type == RAY_SYM) { - /* Default sym width from ray_vec_new is W64 → 8-byte entries. */ - uint8_t esz = ray_sym_elem_size(v->type, v->attrs); - (void)esz; /* Use the generic writer so it handles any adaptive width. 
*/ void* data = ray_data(v); for (int64_t i = 0; i < nrows; i++) { @@ -1097,8 +1119,14 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, } ray_t* next = ray_table_add_col(out, head_rel->col_names[c], ecol); ray_release(ecol); - if (!next) return ray_error("memory", "dl_project: add_col"); - if (RAY_IS_ERR(next)) return next; + if (!next) { + ray_release(out); + return ray_error("memory", "dl_project: add_col"); + } + if (RAY_IS_ERR(next)) { + ray_release(out); + return next; + } out = next; continue; } @@ -1137,11 +1165,16 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, if (src->type == RAY_STR) col_propagate_str_pool(dst, src); ray_t* next = ray_table_add_col(out, head_rel->col_names[c], dst); ray_release(dst); - /* ray_table_add_col consumes `out` via ray_cow on success. On - * error it returns a fresh RAY_ERR object and `out` is no longer - * valid — surface the error to the caller as-is. */ - if (!next) return ray_error("memory", "dl_project: add_col"); - if (RAY_IS_ERR(next)) return next; + /* Release the partial `out` on failure — ray_table_add_col + * does not free its input on error. */ + if (!next) { + ray_release(out); + return ray_error("memory", "dl_project: add_col"); + } + if (RAY_IS_ERR(next)) { + ray_release(out); + return next; + } out = next; } else { /* Constant head slot: materialize an owned broadcast column. */ @@ -1150,7 +1183,17 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, ray_release(out); return ray_error("domain", "dl_project: unset head-const type"); } - ray_t* bcast = dl_broadcast_const_col(nrows, ctype, head_consts[c]); + /* When the head relation's slot is an existing SYM column + * (from a prior aligned rule), match its width so + * table_union's ray_vec_concat doesn't reject a W64 vs + * narrow mismatch. 
*/ + uint8_t sym_w = 0; + if (ctype == RAY_SYM && head_rel && head_rel->table) { + ray_t* hc = ray_table_get_col_idx(head_rel->table, c); + if (hc && hc->type == RAY_SYM) + sym_w = hc->attrs & RAY_SYM_W_MASK; + } + ray_t* bcast = dl_broadcast_const_col(nrows, ctype, head_consts[c], sym_w); if (!bcast || RAY_IS_ERR(bcast)) { ray_release(out); return bcast ? bcast : ray_error("memory", "dl_project: broadcast"); @@ -2096,8 +2139,14 @@ static ray_t* table_union(ray_t* a, ray_t* b) { } ray_t* next = ray_table_add_col(out, ray_table_col_name(a, c), merged); ray_release(merged); - if (!next) return ray_error("memory", "table_union: add_col"); - if (RAY_IS_ERR(next)) return next; + if (!next) { + ray_release(out); + return ray_error("memory", "table_union: add_col"); + } + if (RAY_IS_ERR(next)) { + ray_release(out); + return next; + } out = next; } return out; From d348677eceb2eaf862b9c8369a4c94d32bd36eef Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 16:44:31 +0200 Subject: [PATCH 48/51] fix(datalog): release out on const-slot add_col failure in dl_project MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-8's add_col-leak fix covered the var-slot path (src_idx >= 0) and the empty-accum fallback but missed the third add_col site — the constant-head-slot path right below them. If ray_table_add_col fails there (NULL or RAY_ERR) the partially-built `out` was still being returned-without-release, leaking the table and every column already added. Applied the same release-before-return treatment. 708/708 pass on debug (ASan+UBSan) and release. 
--- src/ops/datalog.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 30e90751..aa0344e9 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1200,8 +1200,14 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, } ray_t* next = ray_table_add_col(out, head_rel->col_names[c], bcast); ray_release(bcast); - if (!next) return ray_error("memory", "dl_project: add_col"); - if (RAY_IS_ERR(next)) return next; + if (!next) { + ray_release(out); + return ray_error("memory", "dl_project: add_col"); + } + if (RAY_IS_ERR(next)) { + ray_release(out); + return next; + } out = next; } } From 2cd57415fa1499ccbdbc791cd73a2903318ebee0 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 17:19:34 +0200 Subject: [PATCH 49/51] fix(datalog,runtime,docs): round-9 Copilot review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six comments from 2026-04-23T15:11: * Aggregate-only rule fallback (datalog.c:~1374): ray_table_add_col failure left `accum` leaked. Release it on both error exits. * DL_CMP dl_eval_expr failure (datalog.c:~1855): previously a NULL or RAY_ERROR from dl_eval_expr did `break`, silently skipping the comparison filter and letting the rule produce wrong rows. Treat eval failure as unrecoverable: free any RAY_ERROR, release accum, set prog->eval_err, and return NULL from dl_compile_rule. Same for the missing-variable-column paths. * DL_CMP filtered-table build (datalog.c:~1982): `continue`-ing past a failed column allocation / add_col produced a table with fewer columns than accum — silent schema corruption. Now validates ray_table_new, ray_vec_new, ray_sym_vec_new, and ray_table_add_col at every step; any failure releases the partial output and raises prog->eval_err. 
* Public API backward compat (datalog.h:252): dl_rule_head_const's signature was extended from (rule, pos, val) to (rule, pos, val, type) in this PR, breaking external callers. Restored the 3-arg form as an I64 wrapper and added dl_rule_head_const_typed for the typed variant. Internal call sites and tests updated to the new name. * Plan doc absolute paths (docs/plans/...md): replaced three /Users/aspirational/... paths with $WORKSPACE-relative references so the doc is portable across dev environments / CI. * Oversized-file OOM test (test_runtime.c:~208): 32-bit off_t shift was undefined. Skip when sizeof(off_t) < 8 and build the size via int64_t before casting to off_t. 708/708 pass on debug (ASan+UBSan) and release. --- ...04-18-datalog-aggregates-and-onboarding.md | 6 +- src/ops/datalog.c | 116 +++++++++++++++--- src/ops/datalog.h | 7 +- test/test_datalog.c | 12 +- test/test_runtime.c | 15 ++- 5 files changed, 127 insertions(+), 29 deletions(-) diff --git a/docs/plans/2026-04-18-datalog-aggregates-and-onboarding.md b/docs/plans/2026-04-18-datalog-aggregates-and-onboarding.md index 1556ac09..e7793da2 100644 --- a/docs/plans/2026-04-18-datalog-aggregates-and-onboarding.md +++ b/docs/plans/2026-04-18-datalog-aggregates-and-onboarding.md @@ -672,7 +672,7 @@ Use `gh pr create --draft --base master --head theaspirational:feature/datalog-a ## Phase B — ray-exomem: declarative health derivations -Phase B runs on a separate branch `feature/declarative-derivations` in `/Users/aspirational/Documents/code/lynx/Teide/ray-exomem`. It depends on Phase A's float support only if templates use float thresholds; the integer-only path can land first. +Phase B runs on a separate branch `feature/declarative-derivations` in the consumer repo (`$WORKSPACE/ray-exomem`). It depends on Phase A's float support only if templates use float thresholds; the integer-only path can land first. 
**Note on dependency direction:** `ray-exomem` consumes `rayforce2` via the sibling checkout (`build.rs:97`). To use Phase A features locally, switch the rayforce2 checkout to `feature/datalog-aggregates` before building ray-exomem. To consume from the upstream merge, wait until the PR from Task A8 is merged into `RayforceDB/rayforce2:master` and the sibling is back on `master`. @@ -684,7 +684,7 @@ Phase B runs on a separate branch `feature/declarative-derivations` in `/Users/a - [ ] **Step 1: Read the existing test** ```bash -sed -n '810,860p' /Users/aspirational/Documents/code/lynx/Teide/ray-exomem/src/system_schema.rs +sed -n '810,860p' "$WORKSPACE/ray-exomem/src/system_schema.rs" ``` Note the inputs (age=30, height=175, weight=75) and the expected derived relations. @@ -974,7 +974,7 @@ git commit -am "fix(onboarding): wire derived bands into recommended-* lookups" If Phase A features are needed (aggregates / float / between), the consumer must check out the matching rayforce2 branch. **Files:** -- Modify: `/Users/aspirational/Documents/code/lynx/Teide/ray-exomem/CLAUDE.md` (Important gotchas section) +- Modify: `$WORKSPACE/ray-exomem/CLAUDE.md` (Important gotchas section) - [ ] **Step 1: Add a note** diff --git a/src/ops/datalog.c b/src/ops/datalog.c index aa0344e9..399143dc 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -270,7 +270,7 @@ void dl_rule_head_var(dl_rule_t* rule, int pos, int var_idx) { if (var_idx + 1 > rule->n_vars) rule->n_vars = var_idx + 1; } -void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val, int8_t type) { +void dl_rule_head_const_typed(dl_rule_t* rule, int pos, int64_t val, int8_t type) { if (pos < 0 || pos >= rule->head_arity) return; /* Default to RAY_I64 if an unrecognized type sneaks through; keeps * old-callers-with-no-type compat when writing to the slot. 
*/ @@ -281,10 +281,17 @@ void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val, int8_t type) { rule->head_const_types[pos] = type; } +/* Backward-compatible I64 wrapper. Pre-aggregates-PR external callers + * used this 3-arg form; it now forwards to the typed variant with + * RAY_I64. */ +void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val) { + dl_rule_head_const_typed(rule, pos, val, RAY_I64); +} + void dl_rule_head_const_f64(dl_rule_t* rule, int pos, double val) { int64_t bits; memcpy(&bits, &val, sizeof(bits)); - dl_rule_head_const(rule, pos, bits, RAY_F64); + dl_rule_head_const_typed(rule, pos, bits, RAY_F64); } int dl_rule_add_atom(dl_rule_t* rule, const char* pred, int arity) { @@ -1369,9 +1376,16 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, int64_t unit_sym = ray_sym_intern("_unit", 5); ray_t* accum_unit = ray_table_add_col(accum, unit_sym, one_val); ray_release(one_val); - if (!accum_unit) { prog->eval_err = true; return NULL; } + /* ray_table_add_col doesn't free `accum` on error — release it + * ourselves so the partially-built table isn't leaked. */ + if (!accum_unit) { + ray_release(accum); + prog->eval_err = true; + return NULL; + } if (RAY_IS_ERR(accum_unit)) { ray_error_free(accum_unit); + ray_release(accum); prog->eval_err = true; return NULL; } @@ -1837,19 +1851,44 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, if (body->cmp_lhs_expr) { lhs_evaled = dl_eval_expr(body->cmp_lhs_expr, accum, var_col, nrows); - if (!lhs_evaled || RAY_IS_ERR(lhs_evaled)) break; + /* LHS evaluation failure can't be silently skipped — a + * missing filter changes the query's answer. 
*/ + if (!lhs_evaled) { + prog->eval_err = true; + ray_release(accum); + return NULL; + } + if (RAY_IS_ERR(lhs_evaled)) { + ray_error_free(lhs_evaled); + prog->eval_err = true; + ray_release(accum); + return NULL; + } lhs_src = lhs_evaled; } else { int lhs_col = var_col[body->cmp_lhs]; lhs_src = ray_table_get_col_idx(accum, lhs_col); - if (!lhs_src) break; + if (!lhs_src) { + prog->eval_err = true; + ray_release(accum); + return NULL; + } } if (body->cmp_rhs_expr) { rhs_evaled = dl_eval_expr(body->cmp_rhs_expr, accum, var_col, nrows); - if (!rhs_evaled || RAY_IS_ERR(rhs_evaled)) { + if (!rhs_evaled) { if (lhs_evaled) ray_release(lhs_evaled); - break; + prog->eval_err = true; + ray_release(accum); + return NULL; + } + if (RAY_IS_ERR(rhs_evaled)) { + ray_error_free(rhs_evaled); + if (lhs_evaled) ray_release(lhs_evaled); + prog->eval_err = true; + ray_release(accum); + return NULL; } rhs_src = rhs_evaled; } else if (body->cmp_rhs != DL_CONST) { @@ -1857,7 +1896,9 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, rhs_src = ray_table_get_col_idx(accum, rhs_col); if (!rhs_src) { if (lhs_evaled) ray_release(lhs_evaled); - break; + prog->eval_err = true; + ray_release(accum); + return NULL; } } /* else rhs is a constant i64 body->cmp_const */ @@ -1944,17 +1985,44 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, } /* Build filtered table — element-size-aware memcpy so f64 - * columns and narrow-SYM columns survive the mask unchanged. */ + * columns and narrow-SYM columns survive the mask unchanged. + * Silently `continue`-ing past missing columns would yield + * a table with fewer columns than accum, breaking schema + * invariants in downstream table_union. Treat every such + * failure as unrecoverable. 
*/ int64_t ncols = ray_table_ncols(accum); ray_t* out = ray_table_new((int)ncols); + if (!out || RAY_IS_ERR(out)) { + if (out && RAY_IS_ERR(out)) ray_error_free(out); + ray_free(mask_block); + ray_release(accum); + prog->eval_err = true; + return NULL; + } for (int64_t c = 0; c < ncols; c++) { ray_t* src = ray_table_get_col_idx(accum, c); - if (!src) continue; + if (!src) { + ray_release(out); ray_free(mask_block); + ray_release(accum); + prog->eval_err = true; + return NULL; + } ray_t* dst = (src->type == RAY_SYM) ? ray_sym_vec_new(src->attrs & RAY_SYM_W_MASK, count) : ray_vec_new(src->type, count); - if (!dst) continue; - if (RAY_IS_ERR(dst)) { ray_error_free(dst); continue; } + if (!dst) { + ray_release(out); ray_free(mask_block); + ray_release(accum); + prog->eval_err = true; + return NULL; + } + if (RAY_IS_ERR(dst)) { + ray_error_free(dst); + ray_release(out); ray_free(mask_block); + ray_release(accum); + prog->eval_err = true; + return NULL; + } dst->len = count; uint8_t esz = ray_sym_elem_size(src->type, src->attrs); const uint8_t* sb = (const uint8_t*)ray_data(src); @@ -1966,8 +2034,22 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, j++; } if (src->type == RAY_STR) col_propagate_str_pool(dst, src); - out = ray_table_add_col(out, ray_table_col_name(accum, c), dst); + ray_t* next = ray_table_add_col(out, ray_table_col_name(accum, c), dst); ray_release(dst); + if (!next) { + ray_release(out); ray_free(mask_block); + ray_release(accum); + prog->eval_err = true; + return NULL; + } + if (RAY_IS_ERR(next)) { + ray_error_free(next); + ray_release(out); ray_free(mask_block); + ray_release(accum); + prog->eval_err = true; + return NULL; + } + out = next; } ray_free(mask_block); ray_release(accum); @@ -3636,18 +3718,18 @@ static ray_t* dl_parse_rule_from_head_and_body(dl_rule_t* out, ray_t* head, int vi = dl_var_get_or_create(vars, harg->i64); dl_rule_head_var(out, i, vi); } else if (harg->type == -RAY_I64) { - dl_rule_head_const(out, i, harg->i64, 
RAY_I64); + dl_rule_head_const_typed(out, i, harg->i64, RAY_I64); } else if (harg->type == -RAY_SYM) { - dl_rule_head_const(out, i, harg->i64, RAY_SYM); + dl_rule_head_const_typed(out, i, harg->i64, RAY_SYM); } else if (harg->type == -RAY_F64) { int64_t bits; memcpy(&bits, &harg->f64, sizeof(bits)); - dl_rule_head_const(out, i, bits, RAY_F64); + dl_rule_head_const_typed(out, i, bits, RAY_F64); } else if (harg->type == -RAY_STR) { /* Intern the string as a sym so it can be stored in a RAY_SYM * column. Matches the body-literal parser convention. */ int64_t sym = ray_sym_intern(ray_str_ptr(harg), ray_str_len(harg)); - dl_rule_head_const(out, i, sym, RAY_SYM); + dl_rule_head_const_typed(out, i, sym, RAY_SYM); } else { return ray_error("type", "rule: head arguments must be ?variables or constants"); } diff --git a/src/ops/datalog.h b/src/ops/datalog.h index cca64955..3141097b 100644 --- a/src/ops/datalog.h +++ b/src/ops/datalog.h @@ -245,11 +245,16 @@ void dl_rule_init(dl_rule_t* rule, const char* head_pred, int head_arity); /* Set a head argument to a variable */ void dl_rule_head_var(dl_rule_t* rule, int pos, int var_idx); +/* Set a head argument to an I64 constant — backward-compatible + * signature. Equivalent to dl_rule_head_const_typed(rule, pos, val, + * RAY_I64). Prefer the typed variant for new code. */ +void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val); + /* Set a head argument to a typed constant. * type must be RAY_I64, RAY_SYM, or RAY_F64. * For RAY_F64 callers should pass a double reinterpreted via memcpy/union * into val's int64 slot; dl_rule_head_const_f64 is the safe wrapper. */ -void dl_rule_head_const(dl_rule_t* rule, int pos, int64_t val, int8_t type); +void dl_rule_head_const_typed(dl_rule_t* rule, int pos, int64_t val, int8_t type); /* Convenience wrapper: set a head argument to a RAY_F64 constant. 
*/ void dl_rule_head_const_f64(dl_rule_t* rule, int pos, double val); diff --git a/test/test_datalog.c b/test/test_datalog.c index 8b6dfd2c..b668bb9d 100644 --- a/test/test_datalog.c +++ b/test/test_datalog.c @@ -1041,7 +1041,7 @@ static MunitResult test_rule_head_const_single_rule(const void* params, void* fi dl_rule_t rule; dl_rule_init(&rule, "band", 1); int64_t sym_small = ray_sym_intern("small", 5); - dl_rule_head_const(&rule, 0, sym_small, RAY_SYM); + dl_rule_head_const_typed(&rule, 0, sym_small, RAY_SYM); int body = dl_rule_add_atom(&rule, "weight", 1); dl_body_set_var(&rule, body, 0, 0); /* binds ?W = col 0 */ @@ -1088,7 +1088,7 @@ static MunitResult test_rule_head_const_i64(const void* params, void* fixture) { /* (rule (ev ?X 1) (pair ?X ?Y)) */ dl_rule_t r; dl_rule_init(&r, "ev", 2); dl_rule_head_var(&r, 0, 0); - dl_rule_head_const(&r, 1, 1, RAY_I64); + dl_rule_head_const_typed(&r, 1, 1, RAY_I64); int bi = dl_rule_add_atom(&r, "pair", 2); dl_body_set_var(&r, bi, 0, 0); dl_body_set_var(&r, bi, 1, 1); @@ -1134,7 +1134,7 @@ static MunitResult test_rule_head_const_cross_idb(const void* params, void* fixt /* R1: (foo "small") :- (edge ?U ?V) */ dl_rule_t r1; dl_rule_init(&r1, "foo", 1); - dl_rule_head_const(&r1, 0, sym_small, RAY_SYM); + dl_rule_head_const_typed(&r1, 0, sym_small, RAY_SYM); int r1b = dl_rule_add_atom(&r1, "edge", 2); dl_body_set_var(&r1, r1b, 0, 0); dl_body_set_var(&r1, r1b, 1, 1); @@ -1218,7 +1218,7 @@ static MunitResult test_rule_head_const_with_agg(const void* params, void* fixtu int64_t sym_total = ray_sym_intern("total", 5); dl_rule_t r; dl_rule_init(&r, "stat", 2); - dl_rule_head_const(&r, 0, sym_total, RAY_SYM); + dl_rule_head_const_typed(&r, 0, sym_total, RAY_SYM); dl_rule_head_var(&r, 1, 0); /* ?N */ dl_rule_add_agg(&r, DL_AGG_COUNT, 0, "weight", 1, 0); r.n_vars = 1; @@ -1338,7 +1338,7 @@ static MunitResult test_rule_head_const_stratification(const void* params, void* dl_rule_t r1; dl_rule_init(&r1, "marker", 2); dl_rule_head_var(&r1, 
0, 0); - dl_rule_head_const(&r1, 1, sym_seen, RAY_SYM); + dl_rule_head_const_typed(&r1, 1, sym_seen, RAY_SYM); int r1b = dl_rule_add_atom(&r1, "src", 1); dl_body_set_var(&r1, r1b, 0, 0); r1.n_vars = 1; @@ -1392,7 +1392,7 @@ static MunitResult test_rule_head_const_type_conflict(const void* params, void* /* Rule A: (tag ?x "sym") — slot 1 committed to RAY_SYM. */ dl_rule_t a; dl_rule_init(&a, "tag", 2); dl_rule_head_var(&a, 0, 0); - dl_rule_head_const(&a, 1, ray_sym_intern("sym", 3), RAY_SYM); + dl_rule_head_const_typed(&a, 1, ray_sym_intern("sym", 3), RAY_SYM); int ab = dl_rule_add_atom(&a, "src", 1); dl_body_set_var(&a, ab, 0, 0); a.n_vars = 1; diff --git a/test/test_runtime.c b/test/test_runtime.c index 9fad1fa9..3e86a931 100644 --- a/test/test_runtime.c +++ b/test/test_runtime.c @@ -200,12 +200,23 @@ static MunitResult test_create_with_sym_oversized_file(const void* params, void* char* dir = make_tmpdir(); munit_assert_ptr_not_null(dir); + /* Skip on platforms with 32-bit off_t — the sparse size we want + * (>> 4 GB) isn't representable and the shift in that case would + * be undefined. */ + if (sizeof(off_t) < 8) { + free(dir); + return MUNIT_SKIP; + } + char path[256]; snprintf(path, sizeof(path), "%s/huge.sym", dir); int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644); munit_assert_int(fd, >=, 0); - /* 10 EB sparse — bigger than any plausible mem_budget/2. */ - off_t huge = (off_t)1 << 62; + /* 4 EB sparse — bigger than any plausible mem_budget/2 (<1 ZB of + * RAM). Build via int64_t to keep the shift well-defined, then + * cast to off_t after the width guard above has passed. 
*/ + int64_t huge64 = (int64_t)1 << 62; + off_t huge = (off_t)huge64; int rc = ftruncate(fd, huge); close(fd); if (rc != 0) { From 2ecae482c612d9884f6e538f9177d8e9fcb1b567 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 17:33:32 +0200 Subject: [PATCH 50/51] fix(datalog): DL_ASSIGN treats eval failures as unrecoverable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-9 covered DL_CMP's silent-break paths but missed DL_ASSIGN, which had the same pattern: if (!new_col || RAY_IS_ERR(new_col)) break; A failed assignment silently leaves assign_var unbound; subsequent body literals keep compiling with stale bindings, and dl_eval returns 0 with a wrong result set. Now treat every failure mode as unrecoverable: free any RAY_ERROR via ray_error_free, release accum, set prog->eval_err, and return NULL. Also extended the check to the dl_table_add_computed_col return — previously unchecked. 708/708 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index 399143dc..e09b3fe8 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1454,7 +1454,20 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, case DL_ASSIGN: { int64_t nrows = ray_table_nrows(accum); ray_t* new_col = dl_eval_expr(body->assign_expr, accum, var_col, nrows); - if (!new_col || RAY_IS_ERR(new_col)) break; + /* Silently breaking would leave assign_var unbound and let + * the rest of the rule keep compiling with stale bindings, + * producing a dl_eval == 0 return alongside wrong rows. 
*/ + if (!new_col) { + ray_release(accum); + prog->eval_err = true; + return NULL; + } + if (RAY_IS_ERR(new_col)) { + ray_error_free(new_col); + ray_release(accum); + prog->eval_err = true; + return NULL; + } int new_col_idx = (int)ray_table_ncols(accum); char colname[32]; @@ -1462,6 +1475,12 @@ ray_op_t* dl_compile_rule(dl_program_t* prog, dl_rule_t* rule, ray_t* new_accum = dl_table_add_computed_col(accum, new_col, colname); ray_release(new_col); ray_release(accum); + if (!new_accum) { prog->eval_err = true; return NULL; } + if (RAY_IS_ERR(new_accum)) { + ray_error_free(new_accum); + prog->eval_err = true; + return NULL; + } accum = new_accum; var_bound[body->assign_var] = true; From 23a0cb51c2b839b30ae3222544e25c4fe2f83196 Mon Sep 17 00:00:00 2001 From: Anton Date: Thu, 23 Apr 2026 17:58:20 +0200 Subject: [PATCH 51/51] fix(datalog,runtime): round-10 Copilot review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three comments from 2026-04-23T15:50: * table_union propagates error operands (datalog.c:~2203): when `b` was a RAY_ERROR the function used to return `a` instead, silently masking the real failure. Now both error operands are returned (retained) so the caller sees the true diagnostic, and NULL operands still fall back to the other side. ray_retain is a no-op on errors, so the "owned return" contract is preserved. * dl_broadcast_const_col width sentinel (datalog.c:~1054): the old uint8_t sym_width_hint used 0 as "default", but RAY_SYM_W8 is also 0 — so an aligned IDB with a W8 head-const column would silently widen to W64, later tripping ray_vec_concat. Replaced the hint with a `const ray_t* width_template` pointer: NULL means "default W64", otherwise copy the template column's adaptive width directly. Caller (dl_project) updated to pass the head_rel column pointer. 
* test_runtime.c: replaced the manual runtime-API forward decls with `#include "core/runtime.h"` now that this TU doesn't pull in lang/eval.h — the ray_vm_t duplication concern from test_datalog.c doesn't apply here, so let the real header be the source of truth. 708/708 pass on debug (ASan+UBSan) and release. --- src/ops/datalog.c | 52 ++++++++++++++++++++++++++++----------------- test/test_runtime.c | 17 +-------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/ops/datalog.c b/src/ops/datalog.c index e09b3fe8..3bb04ce4 100644 --- a/src/ops/datalog.c +++ b/src/ops/datalog.c @@ -1048,17 +1048,23 @@ static ray_t* dl_filter_eq(ray_t* tbl, int col_idx, int64_t value) { * onto rule-local scratch — so that the IDB relation table can outlive the * per-iteration scratch that built it. Cross-IDB reads at subsequent * strata borrow from this column via ray_table_get_col_idx. */ -/* sym_width_hint: when type == RAY_SYM, pass the desired RAY_SYM_W* value - * so the broadcast column matches the IDB relation's existing width - * (otherwise ray_vec_new defaults to W64 and later table_union would - * hit a ray_vec_concat width mismatch). Pass 0 for the default. */ +/* width_template: when type == RAY_SYM, this column is consulted for its + * SYM attrs/width so the broadcast matches the IDB relation's existing + * adaptive width (otherwise ray_vec_new would default to W64 and a + * later table_union would hit a ray_vec_concat width mismatch). Pass + * NULL (no existing column) to get the W64 default. Using a pointer + * here rather than a uint8_t hint avoids the W8=0 sentinel ambiguity + * of an "a zero hint means default" convention. 
*/ static ray_t* dl_broadcast_const_col(int64_t nrows, int8_t type, int64_t val, - uint8_t sym_width_hint) { + const ray_t* width_template) { if (type != RAY_I64 && type != RAY_SYM && type != RAY_F64) { return ray_error("type", NULL); } + uint8_t sym_w = RAY_SYM_W64; + if (type == RAY_SYM && width_template && width_template->type == RAY_SYM) + sym_w = width_template->attrs & RAY_SYM_W_MASK; ray_t* v = (type == RAY_SYM) - ? ray_sym_vec_new(sym_width_hint & RAY_SYM_W_MASK, nrows) + ? ray_sym_vec_new(sym_w, nrows) : ray_vec_new(type, nrows); if (!v || RAY_IS_ERR(v)) return v; v->len = nrows; @@ -1194,13 +1200,10 @@ static ray_t* dl_project(ray_t* tbl, const int* col_indices, int n_out, * (from a prior aligned rule), match its width so * table_union's ray_vec_concat doesn't reject a W64 vs * narrow mismatch. */ - uint8_t sym_w = 0; - if (ctype == RAY_SYM && head_rel && head_rel->table) { - ray_t* hc = ray_table_get_col_idx(head_rel->table, c); - if (hc && hc->type == RAY_SYM) - sym_w = hc->attrs & RAY_SYM_W_MASK; - } - ray_t* bcast = dl_broadcast_const_col(nrows, ctype, head_consts[c], sym_w); + const ray_t* width_tpl = NULL; + if (ctype == RAY_SYM && head_rel && head_rel->table) + width_tpl = ray_table_get_col_idx(head_rel->table, c); + ray_t* bcast = dl_broadcast_const_col(nrows, ctype, head_consts[c], width_tpl); if (!bcast || RAY_IS_ERR(bcast)) { ray_release(out); return bcast ? bcast : ray_error("memory", "dl_project: broadcast"); @@ -2196,15 +2199,26 @@ static ray_t* restore_names(ray_t* tbl, ray_t* src) { /* Create a table by concatenating all rows from tables a and b (same schema). * Uses column-wise ray_vec_concat. Returns new owned table with a's names. */ static ray_t* table_union(ray_t* a, ray_t* b) { - /* Pass-through paths always return a retained reference whenever the - * returned pointer is non-NULL (even on RAY_ERROR), so callers can - * release the return value uniformly without risking use-after-free on - * the pass-through input. 
*/ - if (!a || RAY_IS_ERR(a)) { + /* Pass-through paths return a retained result (NULL only when both + * operands are NULL) so callers can release uniformly. A NULL operand + * falls back to the other side; a RAY_ERROR operand is *propagated* + * (retained) rather than masked by the non-error side — otherwise a + * real failure on `b` would silently surface as `a` and the caller + * would never see the error. ray_retain is a no-op on errors so the + * retain call is safe and keeps the contract "release is always valid". */ + if (!a) { if (b) ray_retain(b); return b; } - if (!b || RAY_IS_ERR(b)) { ray_retain(a); return a; } + if (RAY_IS_ERR(a)) { + ray_retain(a); /* no-op for errors; documents "owned return" */ + return a; + } + if (!b) { ray_retain(a); return a; } + if (RAY_IS_ERR(b)) { + ray_retain(b); + return b; + } /* Column-count check must run before the empty-rows short-circuit. * Otherwise one side having 0 rows but a stripped schema (e.g. an diff --git a/test/test_runtime.c b/test/test_runtime.c index 3e86a931..4eb76423 100644 --- a/test/test_runtime.c +++ b/test/test_runtime.c @@ -7,6 +7,7 @@ #include "munit.h" #include +#include "core/runtime.h" /* ray_runtime_t, ray_runtime_create*, __RUNTIME */ #include #include #include @@ -14,22 +15,6 @@ #include #include -/* Runtime API forward-declared here because core/runtime.h's `ray_vm_t` - * definition collides with lang/eval.h's `ray_vm_t` when both are pulled - * into the same TU (pre-existing duplication). ray_err_t already comes - * from above.
*/ -typedef struct ray_runtime_s ray_runtime_t; -ray_runtime_t* ray_runtime_create(int argc, char** argv); -ray_runtime_t* ray_runtime_create_with_sym(const char* sym_path); -ray_runtime_t* ray_runtime_create_with_sym_err(const char* sym_path, - ray_err_t* out_sym_err); -void ray_runtime_destroy(ray_runtime_t* rt); -extern ray_runtime_t* __RUNTIME; - -/* Import RAY_OK / RAY_ERR_IO enum values from rayforce.h -- they live in - * the existing ray_err_t enum and are exposed via ray_err_from_obj / - * ray_err_code_str; numeric values are part of the public surface. */ - static char* make_tmpdir(void) { char tmpl[] = "/tmp/rayforce-rt-test-XXXXXX"; char* dir = mkdtemp(tmpl);