Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit d1bb26b

Browse files
committed
fix: OP_IN null-aware row and set handling
Regression: exec_in treated typed nulls as ordinary data values. On a column containing 0Nl (i64 null), the null-sentinel integer read back from the column didn't match the filter set `[1]`, so `not-in k [1]` happily returned the null rows as "true" — leaking 4 rows instead of the 2 that should pass. Fix: - Read the column's and set's HAS_NULLS flag + null bitmap. - Before probing, drop any null elements from the set's probe buffer — a non-null col value must never match a null set cell via sentinel-collision. - In the per-row loop, if the col row is null (atom or bitmap), emit 0 unconditionally. Both `in` and `not-in` are defined to be false for null inputs (SQL-style UNKNOWN → false in boolean context). - Also handle atom null via RAY_ATOM_IS_NULL for the degenerate scalar-col case. Regression test /eval/select_where_in_nulls asserts both `in` and `not-in` correctly exclude null rows from a 5-row column with two nulls. 621/621 tests pass in debug and release.
1 parent 41fc3cc commit d1bb26b

2 files changed

Lines changed: 78 additions & 13 deletions

File tree

src/ops/exec.c

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,16 @@ static ray_t* exec_in(ray_graph_t* g, ray_op_t* op, ray_t* col, ray_t* set) {
606606
out->len = col_len;
607607
uint8_t* ob = (uint8_t*)ray_data(out);
608608

609+
/* Null-aware: null rows in the column never pass either `in` or
610+
* `not-in`. Mirrors SQL-style semantics where NULL IN (…) and
611+
* NULL NOT IN (…) both yield UNKNOWN / false in a boolean
612+
* context. Also skip null elements when building the probe
613+
* buffer so a non-null col row doesn't accidentally match the
614+
* sentinel value of a null set element. */
615+
bool col_has_nulls = !ray_is_atom(col) && (col->attrs & RAY_ATTR_HAS_NULLS);
616+
bool col_atom_null = ray_is_atom(col) && RAY_ATOM_IS_NULL(col);
617+
bool set_has_nulls = !ray_is_atom(set) && (set->attrs & RAY_ATTR_HAS_NULLS);
618+
609619
#define READ_I64(dst, vec, type, idx) do { \
610620
const void* _d = ray_data(vec); \
611621
switch (type) { \
@@ -636,6 +646,10 @@ static ray_t* exec_in(ray_graph_t* g, ray_op_t* op, ray_t* col, ray_t* set) {
636646
} \
637647
} while (0)
638648

649+
/* Compact probe buffer: drop null set elements up front so the
650+
* inner loop doesn't special-case them. */
651+
int64_t sv_len = 0;
652+
639653
if (use_double) {
640654
double sv_stack[32];
641655
ray_t* sv_hdr = NULL;
@@ -646,21 +660,28 @@ static ray_t* exec_in(ray_graph_t* g, ray_op_t* op, ray_t* col, ray_t* set) {
646660
sv = (double*)ray_data(sv_hdr);
647661
}
648662
if (ray_is_atom(set)) {
649-
if (st == RAY_F64) sv[0] = set->f64;
650-
else sv[0] = (double)set->i64;
663+
if (!RAY_ATOM_IS_NULL(set)) {
664+
sv[0] = (st == RAY_F64) ? set->f64 : (double)set->i64;
665+
sv_len = 1;
666+
}
651667
} else {
652-
for (int64_t i = 0; i < set_len; i++) READ_F64(sv[i], set, st, i);
668+
for (int64_t i = 0; i < set_len; i++) {
669+
if (set_has_nulls && ray_vec_is_null(set, i)) continue;
670+
READ_F64(sv[sv_len], set, st, i);
671+
sv_len++;
672+
}
653673
}
654674
for (int64_t i = 0; i < col_len; i++) {
675+
/* Null col rows never pass either in or not-in. */
676+
bool row_null = col_atom_null ||
677+
(col_has_nulls && !ray_is_atom(col) &&
678+
ray_vec_is_null(col, i));
679+
if (row_null) { ob[i] = 0; continue; }
655680
double cv;
656-
if (ray_is_atom(col)) {
657-
if (ct == RAY_F64) cv = col->f64;
658-
else cv = (double)col->i64;
659-
} else {
660-
READ_F64(cv, col, ct, i);
661-
}
681+
if (ray_is_atom(col)) cv = (ct == RAY_F64) ? col->f64 : (double)col->i64;
682+
else READ_F64(cv, col, ct, i);
662683
int found = 0;
663-
for (int64_t j = 0; j < set_len; j++) {
684+
for (int64_t j = 0; j < sv_len; j++) {
664685
if (cv == sv[j]) { found = 1; break; }
665686
}
666687
ob[i] = (uint8_t)(found ^ negate);
@@ -676,15 +697,26 @@ static ray_t* exec_in(ray_graph_t* g, ray_op_t* op, ray_t* col, ray_t* set) {
676697
if (!sv_hdr) { ray_release(out); return ray_error("oom", NULL); }
677698
sv = (int64_t*)ray_data(sv_hdr);
678699
}
679-
if (ray_is_atom(set)) sv[0] = set->i64;
680-
else for (int64_t i = 0; i < set_len; i++) READ_I64(sv[i], set, st, i);
700+
if (ray_is_atom(set)) {
701+
if (!RAY_ATOM_IS_NULL(set)) { sv[0] = set->i64; sv_len = 1; }
702+
} else {
703+
for (int64_t i = 0; i < set_len; i++) {
704+
if (set_has_nulls && ray_vec_is_null(set, i)) continue;
705+
READ_I64(sv[sv_len], set, st, i);
706+
sv_len++;
707+
}
708+
}
681709

682710
for (int64_t i = 0; i < col_len; i++) {
711+
bool row_null = col_atom_null ||
712+
(col_has_nulls && !ray_is_atom(col) &&
713+
ray_vec_is_null(col, i));
714+
if (row_null) { ob[i] = 0; continue; }
683715
int64_t cv;
684716
if (ray_is_atom(col)) cv = col->i64;
685717
else READ_I64(cv, col, ct, i);
686718
int found = 0;
687-
for (int64_t j = 0; j < set_len; j++) {
719+
for (int64_t j = 0; j < sv_len; j++) {
688720
if (cv == sv[j]) { found = 1; break; }
689721
}
690722
ob[i] = (uint8_t)(found ^ negate);

test/test_lang.c

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,6 +1008,38 @@ static MunitResult test_eval_select_where_in_sym(const void* params, void* fixtu
10081008
return MUNIT_OK;
10091009
}
10101010

1011+
/* ---- Test: OP_IN must not leak null rows through ----
1012+
* Regression: exec_in originally treated the raw null-sentinel
1013+
* value as a normal data value. That meant `(not-in k [1])`
1014+
* on a column containing 0Nl nulls returned the null rows as
1015+
* "true" (because null-sentinel != 1), leaking them through.
1016+
* Null rows must never pass either `in` or `not-in`. */
1017+
static MunitResult test_eval_select_where_in_nulls(const void* params, void* fixture) {
1018+
(void)params; (void)fixture;
1019+
1020+
/* not-in with null rows: source has [1 0Nl 3 0Nl 5], filter
1021+
* `not-in [1]` should return only 3 and 5 — not the two nulls. */
1022+
ray_t* r1 = ray_eval_str(
1023+
"(do (set t (table ['k] (list [1 0Nl 3 0Nl 5]))) "
1024+
"(select {from: t where: (not-in k [1])}))");
1025+
munit_assert_ptr_not_null(r1);
1026+
munit_assert_false(RAY_IS_ERR(r1));
1027+
munit_assert_int(ray_table_nrows(r1), ==, 2);
1028+
ray_release(r1);
1029+
1030+
/* in with null rows: source has [1 0Nl 3 0Nl 5], filter
1031+
* `in [1 3]` should return 1 and 3 — not the nulls. */
1032+
ray_t* r2 = ray_eval_str(
1033+
"(do (set t (table ['k] (list [1 0Nl 3 0Nl 5]))) "
1034+
"(select {from: t where: (in k [1 3])}))");
1035+
munit_assert_ptr_not_null(r2);
1036+
munit_assert_false(RAY_IS_ERR(r2));
1037+
munit_assert_int(ray_table_nrows(r2), ==, 2);
1038+
ray_release(r2);
1039+
1040+
return MUNIT_OK;
1041+
}
1042+
10111043
/* ---- Test: `in` with mixed numeric types (F64 col vs I64 set) ----
10121044
* Regression: exec_in originally compared bit patterns which made
10131045
* `(in price_f64 [1 2 3])` match nothing. Now we promote to double
@@ -2761,6 +2793,7 @@ static MunitTest lang_tests[] = {
27612793
{ "/eval/select_where_in_sym", test_eval_select_where_in_sym, lang_setup, lang_teardown, 0, NULL },
27622794
{ "/eval/select_where_in_i64", test_eval_select_where_in_i64, lang_setup, lang_teardown, 0, NULL },
27632795
{ "/eval/select_where_in_mixed_numeric", test_eval_select_where_in_mixed_numeric, lang_setup, lang_teardown, 0, NULL },
2796+
{ "/eval/select_where_in_nulls", test_eval_select_where_in_nulls, lang_setup, lang_teardown, 0, NULL },
27642797
{ "/eval/select_by_where_filters", test_eval_select_by_where_filters, lang_setup, lang_teardown, 0, NULL },
27652798
{ "/eval/select_by_where_in", test_eval_select_by_where_in, lang_setup, lang_teardown, 0, NULL },
27662799
{ "/eval/select_if", test_eval_select_if, lang_setup, lang_teardown, 0, NULL },

0 commit comments

Comments
 (0)