apache · yaooqinn · Jun 4, 2026 · May 29, 2026 · Jun 3, 2026
diff --git a/.github/workflows/util/install-spark-resources.sh b/.github/workflows/util/install-spark-resources.sh
@@ -118,7 +118,7 @@ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
   4.0)
       # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix
       cd ${INSTALL_DIR} && \
-      install_spark "4.0.1" "3" "2.12"
+      install_spark "4.0.2" "3" "2.12"
       mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
       ;;
   4.1)

diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml
@@ -1307,7 +1307,7 @@ jobs:
           pip3 install setuptools==77.0.3 && \
           pip3 install pyspark==3.5.5 cython && \
           pip3 install pandas==2.2.3 pyarrow==20.0.0
-      - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
+      - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark40
           bash .github/workflows/util/install-spark-resources.sh 4.0
@@ -1358,7 +1358,7 @@ jobs:
         with:
           name: arrow-jars-centos-7-${{github.sha}}
           path: /root/.m2/repository/org/apache/arrow/
-      - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
+      - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update
         run: |
           rm -rf /opt/shims/spark40
           bash .github/workflows/util/install-spark-resources.sh 4.0

diff --git a/docs/get-started/build-guide.md b/docs/get-started/build-guide.md
@@ -73,5 +73,5 @@ It's name pattern is `gluten-<backend_type>-bundle-spark<spark.bundle.version>_<
 | 3.3.1         | 3.3                  | 2.12                 |
 | 3.4.4         | 3.4                  | 2.12                 |
 | 3.5.5         | 3.5                  | 2.12                 |
-| 4.0.1         | 4.0                  | 2.13                 |
+| 4.0.2         | 4.0                  | 2.13                 |
 | 4.1.1         | 4.1                  | 2.13                 |
diff --git a/docs/get-started/getting-started.md b/docs/get-started/getting-started.md
@@ -24,7 +24,7 @@ Gluten supports two native backends:
 
 - **OS**: Ubuntu 20.04/22.04 or CentOS 7/8 (other Linux distros may work with static build but are not officially tested)
 - **JDK**: OpenJDK 8 or 17 (Spark 4.0 requires JDK 17+)
-- **Spark**: 3.3.1, 3.4.4, 3.5.5, 4.0.1, or 4.1.1
+- **Spark**: 3.3.1, 3.4.4, 3.5.5, 4.0.2, or 4.1.1
 - **Scala**: 2.12 (Spark 4.0 requires Scala 2.13)
 
 ### 2. Build

diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -80,6 +80,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .excludeByPrefix("SPARK-48012")
     .excludeByPrefix("SPARK-44647")
     .excludeByPrefix("SPARK-41471")
+    .excludeByPrefix("SPARK-54439")
     // disable due to check for SMJ node
     .excludeByPrefix("SPARK-41413: partitioned join:")
     .excludeByPrefix("SPARK-42038: partially clustered:")

diff --git a/...k40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala b/...k40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala
@@ -1210,6 +1210,62 @@ class GlutenKeyGroupedPartitioningSuite
     }
   }
 
+  testGluten("SPARK-54439: KeyGroupedPartitioning and join key size mismatch") {
+    val items_partitions = Array(identity("id"))
+    createTable(items, itemsColumns, items_partitions)
+
+    sql(
+      s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
+        "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))")
+
+    createTable(purchases, purchasesColumns, Array.empty)
+    sql(
+      s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 19.5, cast('2020-02-01' as timestamp))")
+
+    withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") {
+      // `time` and `item_id` are both in the required `ClusteredDistribution` for `purchases`, but
+      // `items` is storage-partitioned only by `identity("id")`, so the key counts differ.
+      val df = createJoinTestDF(Seq("arrive_time" -> "time", "id" -> "item_id"))
+      val shuffles = collectShuffles(df.queryExecution.executedPlan)
+      assert(shuffles.size == 1, "only shuffle one side not report partitioning")
+
+      checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0)))
+    }
+  }
+
+  testGluten("SPARK-54439: KeyGroupedPartitioning with transform and join key size mismatch") {
+    // Do not use `bucket()` in "one side partition" tests as its implementation in
+    // `InMemoryBaseTable` conflicts with `BucketFunction`
+    val items_partitions = Array(years("arrive_time"))
+    createTable(items, itemsColumns, items_partitions)
+
+    sql(
+      s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(1, 'bb', 10.0, cast('2021-01-01' as timestamp)), " +
+        "(4, 'cc', 15.5, cast('2021-02-01' as timestamp))")
+
+    createTable(purchases, purchasesColumns, Array.empty)
+    sql(
+      s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 19.5, cast('2021-02-01' as timestamp))")
+
+    withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") {
+      // `item_id` and `time` are both in the required `ClusteredDistribution` for `purchases`, but
+      // `items` is storage-partitioned only by `years("arrive_time")`, so the key counts differ.
+      val df = createJoinTestDF(Seq("id" -> "item_id", "arrive_time" -> "time"))
+      val shuffles = collectShuffles(df.queryExecution.executedPlan)
+      assert(shuffles.size == 1, "only shuffle one side not report partitioning")
+
+      checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0)))
+    }
+  }
+
   testGluten(
     "SPARK-44647: SPJ: test join key is subset of cluster key " +
       "with push values and partially-clustered") {

diff --git a/pom.xml b/pom.xml
@@ -1269,7 +1269,7 @@
       <properties>
         <sparkbundle.version>4.0</sparkbundle.version>
         <sparkshim.artifactId>spark-sql-columnar-shims-spark40</sparkshim.artifactId>
-        <spark.version>4.0.1</spark.version>
+        <spark.version>4.0.2</spark.version>
         <iceberg.version>1.10.0</iceberg.version>
         <delta.package.name>delta-spark</delta.package.name>
         <delta.version>4.0.1</delta.version>

diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml
@@ -320,7 +320,7 @@
     <profile>
       <id>spark-4.0</id>
       <properties>
-        <spark.version>4.0.1</spark.version>
+        <spark.version>4.0.2</spark.version>
         <scala.version>2.13.17</scala.version>
         <scala.binary.version>2.13</scala.binary.version>
         <delta.package.name>delta-spark</delta.package.name>