From a18bcca9505bea3295bb23cc41f09766a08d2912 Mon Sep 17 00:00:00 2001
From: Kent Yao <kentyao@microsoft.com>
Date: Fri, 29 May 2026 02:04:15 +0000
Subject: [PATCH 1/2] [MINOR] Upgrade Spark 4.0 to 4.0.2

Bumps the spark-4.0 profile from 4.0.1 to 4.0.2 (patch release).

Changes:
- pom.xml / tools/gluten-it/pom.xml: spark.version
- .github/workflows/util/install-spark-resources.sh: download version
- .github/workflows/velox_backend_x86.yml: step names
- docs/get-started/{build-guide,getting-started}.md: supported versions

No shim code changes are required: 4.0.2 is a maintenance release with
no public API changes, and unlike 4.1.2 (SPARK-55337) it does not revert
any binary signatures that the spark40 shim depends on.

Notable upstream fixes that may affect Gluten behaviour (no Gluten code
change needed, but worth watching CI for plan-stability / metrics diffs):
- SPARK-54439 SPJ KeyGroupedPartitioning + join key size mismatch
- SPARK-53434 ColumnarRow#get should check isNullAt
- SPARK-54917 Upgrade ORC to 2.1.4

Generated-by: claude-opus-4.7
---
 .github/workflows/util/install-spark-resources.sh | 2 +-
 .github/workflows/velox_backend_x86.yml           | 4 ++--
 docs/get-started/build-guide.md                   | 2 +-
 docs/get-started/getting-started.md               | 2 +-
 pom.xml                                           | 2 +-
 tools/gluten-it/pom.xml                           | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/util/install-spark-resources.sh b/.github/workflows/util/install-spark-resources.sh
index 5988f383628..bfbb55f55d7 100755
--- a/.github/workflows/util/install-spark-resources.sh
+++ b/.github/workflows/util/install-spark-resources.sh
@@ -118,7 +118,7 @@ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
   4.0)
       # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix
       cd ${INSTALL_DIR} && \
-      install_spark "4.0.1" "3" "2.12"
+      install_spark "4.0.2" "3" "2.12"
       mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
       ;;
   4.1)
diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml
index 8672e5ecc4d..2fb3a059bec 100644
--- a/.github/workflows/velox_backend_x86.yml
+++ b/.github/workflows/velox_backend_x86.yml
@@ -1307,7 +1307,7 @@ jobs:
           pip3 install setuptools==77.0.3 && \
           pip3 install pyspark==3.5.5 cython && \
           pip3 install pandas==2.2.3 pyarrow==20.0.0
-      - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
+      - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark40
           bash .github/workflows/util/install-spark-resources.sh 4.0
@@ -1358,7 +1358,7 @@ jobs:
         with:
           name: arrow-jars-centos-7-${{github.sha}}
           path: /root/.m2/repository/org/apache/arrow/
-      - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
+      - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark40
           bash .github/workflows/util/install-spark-resources.sh 4.0
diff --git a/docs/get-started/build-guide.md b/docs/get-started/build-guide.md
index cdfad12f67e..8757f6fa130 100644
--- a/docs/get-started/build-guide.md
+++ b/docs/get-started/build-guide.md
@@ -73,5 +73,5 @@ It's name pattern is `gluten-<backend_type>-bundle-spark<spark.bundle.version>_<
 | 3.3.1         | 3.3                  | 2.12                 |
 | 3.4.4         | 3.4                  | 2.12                 |
 | 3.5.5         | 3.5                  | 2.12                 |
-| 4.0.1         | 4.0                  | 2.13                 |
+| 4.0.2         | 4.0                  | 2.13                 |
 | 4.1.1         | 4.1                  | 2.13                 |
diff --git a/docs/get-started/getting-started.md b/docs/get-started/getting-started.md
index 34ad08a78f8..0842dc0472d 100644
--- a/docs/get-started/getting-started.md
+++ b/docs/get-started/getting-started.md
@@ -24,7 +24,7 @@ Gluten supports two native backends:
 
 - **OS**: Ubuntu 20.04/22.04 or CentOS 7/8 (other Linux distros may work with static build but are not officially tested)
 - **JDK**: OpenJDK 8 or 17 (Spark 4.0 requires JDK 17+)
-- **Spark**: 3.3.1, 3.4.4, 3.5.5, 4.0.1, or 4.1.1
+- **Spark**: 3.3.1, 3.4.4, 3.5.5, 4.0.2, or 4.1.1
 - **Scala**: 2.12 (Spark 4.0 requires Scala 2.13)
 
 ### 2. Build
diff --git a/pom.xml b/pom.xml
index 6ab7bcddc26..87bf626a297 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1269,7 +1269,7 @@
       <properties>
         <sparkbundle.version>4.0</sparkbundle.version>
         <sparkshim.artifactId>spark-sql-columnar-shims-spark40</sparkshim.artifactId>
-        <spark.version>4.0.1</spark.version>
+        <spark.version>4.0.2</spark.version>
         <iceberg.version>1.10.0</iceberg.version>
         <delta.package.name>delta-spark</delta.package.name>
         <delta.version>4.0.1</delta.version>
diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml
index 7dcce0e4c5d..021c6ec2519 100644
--- a/tools/gluten-it/pom.xml
+++ b/tools/gluten-it/pom.xml
@@ -320,7 +320,7 @@
     <profile>
       <id>spark-4.0</id>
       <properties>
-        <spark.version>4.0.1</spark.version>
+        <spark.version>4.0.2</spark.version>
         <scala.version>2.13.17</scala.version>
         <scala.binary.version>2.13</scala.binary.version>
         <delta.package.name>delta-spark</delta.package.name>

From fbb48049470bad301ae029d8754dec92e3945859 Mon Sep 17 00:00:00 2001
From: Kent Yao <kentyao@microsoft.com>
Date: Wed, 3 Jun 2026 16:03:24 +0000
Subject: [PATCH 2/2] [MINOR][TEST] Port SPARK-54439 KeyGroupedPartitioning
 tests for Spark 4.0

Spark 4.0.2 picks up SPARK-54439 (apache/spark#53142), a correctness fix
in KeyGroupedShuffleSpec.createPartitioning() with two new tests in
KeyGroupedPartitioningSuite. The vanilla tests use the base collectShuffles
helper which only matches ShuffleExchangeExec, so they fail under Gluten
where the shuffle is a ColumnarShuffleExchangeExec.

Rather than only excluding them, exclude the vanilla versions in
VeloxTestSettings and port them as testGluten overrides (same pattern as
the existing SPARK-41471 tests) so they reuse the columnar-aware
collectShuffles helper and keep coverage of the correctness fix.

Locally verified on Velox backend (Spark 4.0.2, Scala 2.13): both new tests
pass (shuffles.size == 1 and checkAnswer), with no change to the set of
pre-existing suite failures.

Generated-by: Claude Opus 4.8
---
 .../utils/velox/VeloxTestSettings.scala       |  1 +
 .../GlutenKeyGroupedPartitioningSuite.scala   | 56 +++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index f564324b141..2c8b508d9ce 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -80,6 +80,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .excludeByPrefix("SPARK-48012")
     .excludeByPrefix("SPARK-44647")
     .excludeByPrefix("SPARK-41471")
+    .excludeByPrefix("SPARK-54439")
     // disable due to check for SMJ node
     .excludeByPrefix("SPARK-41413: partitioned join:")
     .excludeByPrefix("SPARK-42038: partially clustered:")
diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala
index 8e2e4ca47f0..afe0cb7969c 100644
--- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala
+++ b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala
@@ -1210,6 +1210,62 @@ class GlutenKeyGroupedPartitioningSuite
     }
   }
 
+  testGluten("SPARK-54439: KeyGroupedPartitioning and join key size mismatch") {
+    val items_partitions = Array(identity("id"))
+    createTable(items, itemsColumns, items_partitions)
+
+    sql(
+      s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
+        "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))")
+
+    createTable(purchases, purchasesColumns, Array.empty)
+    sql(
+      s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 19.5, cast('2020-02-01' as timestamp))")
+
+    withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") {
+      // The required `ClusteredDistribution` for `purchases` contains both `time` and `item_id`,
+      // but the `items` table is storage-partitioned only by `id` (see `identity("id")` above).
+      val df = createJoinTestDF(Seq("arrive_time" -> "time", "id" -> "item_id"))
+      val shuffles = collectShuffles(df.queryExecution.executedPlan)
+      assert(shuffles.size == 1, "only shuffle one side not report partitioning")
+
+      checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0)))
+    }
+  }
+
+  testGluten("SPARK-54439: KeyGroupedPartitioning with transform and join key size mismatch") {
+    // Do not use `bucket()` in "one side partition" tests as its implementation in
+    // `InMemoryBaseTable` conflicts with `BucketFunction`
+    val items_partitions = Array(years("arrive_time"))
+    createTable(items, itemsColumns, items_partitions)
+
+    sql(
+      s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(1, 'bb', 10.0, cast('2021-01-01' as timestamp)), " +
+        "(4, 'cc', 15.5, cast('2021-02-01' as timestamp))")
+
+    createTable(purchases, purchasesColumns, Array.empty)
+    sql(
+      s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 19.5, cast('2021-02-01' as timestamp))")
+
+    withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") {
+      // The required `ClusteredDistribution` for `purchases` contains both `item_id` and `time`,
+      // but the `items` table is storage-partitioned only by `years(arrive_time)`.
+      val df = createJoinTestDF(Seq("id" -> "item_id", "arrive_time" -> "time"))
+      val shuffles = collectShuffles(df.queryExecution.executedPlan)
+      assert(shuffles.size == 1, "only shuffle one side not report partitioning")
+
+      checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0)))
+    }
+  }
+
   testGluten(
     "SPARK-44647: SPJ: test join key is subset of cluster key " +
       "with push values and partially-clustered") {