Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,56 @@ class ColumnarCachedBatchBuildFilterPruneSuite extends AnyFunSuite {
"order: stats=null pass-through first, then stats-covers-literal kept")
}

// Regression: a batch whose stats row is non-null but carries a NULL lower/upper bound for a
// predicate-referenced column must NOT be pruned. Such per-batch null bounds arise from
// data-dependent writer demotions invisible to the schema-level stripUnsupportedConjuncts:
// a binary-collation VARCHAR whose 256B upper-bound prefix is all 0xFF (carry overflow), or a
// dictionary-encoded numeric on the V2 fallback path. Without the per-batch bypass, vanilla
// buildFilter evaluates `null <= 999 && 999 <= null` -> null -> coerced false -> the batch is
// silently dropped (data loss).
test("null lower/upper bound on referenced column bypasses pruning (batch kept)") {
  // Build the pruning filter for the single predicate `id = 999`.
  val serializer = new ColumnarCachedBatchSerializer
  val idCol = AttributeReference("id", LongType, nullable = false)()
  val pruneFilter = serializer.buildFilter(Seq(EqualTo(idCol, Literal(999L))), Seq(idCol))

  // The stats row itself is non-null; only its two leading entries (the null lower/upper
  // bound this test is named after) are null.
  val nullBoundBatch = CachedColumnarBatch(
    numRows = 10,
    sizeInBytes = 80L,
    bytes = new Array[Byte](40), // zero-filled placeholder payload
    stats = new GenericInternalRow(Array[Any](null, null, 0, 10, 80L)))

  // Exactly one batch goes in; it must come back out untouched by pruning.
  val kept = pruneFilter(0, Iterator.single[CachedBatch](nullBoundBatch)).toList
  assert(
    kept.length === 1,
    "null-bound referenced column must bypass pruning -> batch kept (no silent data loss)")
  assert(kept.head.numRows === 10)
}

// The bypass must split a contiguous run correctly: a finite-bound batch that the predicate
// excludes is still pruned, a null-bound batch in the middle is passed through, and a finite
// covering batch after it is still kept -- no batch double-emitted or skipped.
test("null-bound bypass splits a contiguous run: pruned dropped, null-bound + covering kept") {
  // Same predicate as the single-batch case: `id = 999`.
  val serializer = new ColumnarCachedBatchSerializer
  val idCol = AttributeReference("id", LongType, nullable = false)()
  val pruneFilter = serializer.buildFilter(Seq(EqualTo(idCol, Literal(999L))), Seq(idCol))

  // Three batches with distinct row counts (5, 7, 9) so the survivors can be identified
  // purely by numRows in the final assertion.
  val excluded = batchWithStats(5, 0L, 100L) // finite bounds that do not contain 999
  val nullBounded = CachedColumnarBatch(
    numRows = 7,
    sizeInBytes = 56L,
    bytes = new Array[Byte](28), // zero-filled placeholder payload
    stats = new GenericInternalRow(Array[Any](null, null, 0, 7, 56L)))
  val containing = batchWithStats(9, 900L, 1000L) // finite bounds that contain 999

  // Feed the batches in order: excluded, null-bounded, containing.
  val survivors =
    pruneFilter(0, Iterator[CachedBatch](excluded, nullBounded, containing)).toList
  assert(
    survivors.map(_.numRows) === Seq(7, 9),
    "prunable dropped; null-bound bypassed (kept); covering kept -- run split correctly")
}

// ---------------------------------------------------------------------------
// W1-W8 -- non-binary collation StringType wrapper behavior.
// The wrapper strips AND-conjuncts referencing non-binary collation StringType
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,25 +176,27 @@ class ColumnarCachedBatchE2ESuite
}
}

test("Float NaN partition: filter on non-NaN not silently pruned") {
test("Float NaN same batch: filter on non-NaN not silently pruned") {
// coalesce(1) forces the NaN row (id=7) and the queried finite row (id=42) into the SAME
// cached batch, deterministically reproducing the regression: previously a NaN poisoned the
// whole column to unsupported -> null min/max bounds -> vanilla buildFilter pruned the batch
// -> the finite k=42.0 row was silently dropped. NaN must instead be skipped so the finite
// bounds [0, 999] are emitted and the matching row is returned (parity with vanilla Spark).
val df = spark
.range(N)
.select(
when(col("id") === 7L, lit(Float.NaN))
.otherwise(col("id").cast("float"))
.as("k"))
.repartition(P)
.coalesce(1)
.cache()
try {
df.count()
// pivot=42 is a finite (non-NaN) value. Because of coalesce(1) it shares a single
// cached batch with the NaN row at id=7, so this is the exact case where NaN-poisoned
// stats would prune the match. Equality must still find it.
val result = df.filter(col("k") === 42.0f).count()
assert(
result == 1L,
s"expected 1 row with k=42.0, got $result " +
s"(NaN may have poisoned partition stats)")
s"(NaN must not poison partition stats / prune the finite match)")
} finally {
df.unpersist()
}
Expand Down Expand Up @@ -384,20 +386,20 @@ class ColumnarCachedBatchE2ESuite
}
}

// Config-gate negative test: with partition stats disabled (the production default),
// serializeWithStats must NOT be invoked -- the legacy serialize() path is taken and stats
// are emitted as null. A bug in the gate could silently activate stats for all users, or
// break correctness on the legacy stats=null read path.
// Partition-stats negative test: with partition stats disabled (the production default),
// V3 lazy no-stats bytes are still written, but stats are emitted as null. A bug in the
// gate could silently activate stats for all users, or break correctness on the
// stats=null buildFilter pass-through path.
//
// Asserts correctness only, not numOutputRows: the Gluten native scan reports row counts
// on a separate metrics path, so InMemoryTableScanExec.numOutputRows can legitimately be 0
// in either gated branch (see "numOutputRows reflects post-filter row count" above).
test("partitionStats.enabled=false: legacy serialize() path correctness preserved") {
test("partitionStats.enabled=false: V3 lazy no-stats path correctness preserved") {
withSQLConf(
GlutenConfig.COLUMNAR_TABLE_CACHE_PARTITION_STATS_ENABLED.key -> "false") {
val cached = cacheRange()
try {
cached.count() // materialize cache via legacy serialize() path (stats emitted as null)
cached.count() // materialize cache via V3 no-stats path (stats emitted as null)
val result = cached.filter(col("k") === pivot).count()
assert(result == 1L, s"expected exactly one row matching k=$pivot, got $result")
} finally {
Expand Down Expand Up @@ -431,8 +433,8 @@ class ColumnarCachedBatchE2ESuite
}
}

// Reverse: legacy v1 payload at build (stats=null), reader cannot fabricate
// stats. Distinct from the same-config legacy test: this forces cross-config.
// Reverse: V3 no-stats payload at build (stats=null), reader cannot fabricate stats.
// Distinct from the same-config no-stats test: this forces cross-config.
test("cross-config: build with stats disabled, read with stats enabled") {
var cached: DataFrame = null
var filtered: DataFrame = null
Expand Down Expand Up @@ -509,4 +511,35 @@ class ColumnarCachedBatchE2ESuite
}
}
}

// V3 lazy deserialization smoke tests

test("V3 default: cache + equality filter produces correct result") {
  val cached = cacheRange()
  try {
    // Run an action first so the cache is materialized before the filtered read.
    cached.count()
    val matching = cached.filter(col("k") === pivot)
    val result = matching.count()
    assert(result == 1L, s"V3: expected 1 row matching k=$pivot, got $result")
  } finally {
    // Always release the cached data, even when the assertion fails.
    cached.unpersist()
  }
}

test("V3 default: multi-column cache, partial projection, no crash") {
  // Three bigint columns derived from id; only `a` and `c` are read back below.
  val threeColumns = spark
    .range(N)
    .selectExpr(
      "cast(id as bigint) as a",
      "cast(id*2 as bigint) as b",
      "cast(id+1 as bigint) as c")
  val cached = threeColumns.repartitionByRange(P, col("a")).cache()
  try {
    // Run an action first so the cache is materialized before the filtered read.
    cached.count()
    // Filter on `a`, then project a strict subset of the cached columns.
    val projected = cached.filter(col("a") === pivot).select("a", "c")
    val result = projected.count()
    assert(result == 1L, s"V3 projection: expected 1 row, got $result")
  } finally {
    // Always release the cached data, even when the assertion fails.
    cached.unpersist()
  }
}
}
Loading
Loading