From a18bcca9505bea3295bb23cc41f09766a08d2912 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Fri, 29 May 2026 02:04:15 +0000 Subject: [PATCH 1/2] [MINOR] Upgrade Spark 4.0 to 4.0.2 Bumps the spark-4.0 profile from 4.0.1 to 4.0.2 (patch release). Changes: - pom.xml / tools/gluten-it/pom.xml: spark.version - .github/workflows/util/install-spark-resources.sh: download version - .github/workflows/velox_backend_x86.yml: step names - docs/get-started/{build-guide,getting-started}.md: supported versions No shim code changes are required: 4.0.2 is a maintenance release with no public API changes, and unlike 4.1.2 (SPARK-55337) it does not revert any binary signatures that the spark40 shim depends on. Notable upstream fixes that may affect Gluten behaviour (no Gluten code change needed, but worth watching CI for plan-stability / metrics diffs): - SPARK-54439 SPJ KeyGroupedPartitioning + join key size mismatch - SPARK-53434 ColumnarRow#get should check isNullAt - SPARK-54917 Upgrade ORC to 2.1.4 Generated-by: claude-opus-4.7 --- .github/workflows/util/install-spark-resources.sh | 2 +- .github/workflows/velox_backend_x86.yml | 4 ++-- docs/get-started/build-guide.md | 2 +- docs/get-started/getting-started.md | 2 +- pom.xml | 2 +- tools/gluten-it/pom.xml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/util/install-spark-resources.sh b/.github/workflows/util/install-spark-resources.sh index 5988f383628..bfbb55f55d7 100755 --- a/.github/workflows/util/install-spark-resources.sh +++ b/.github/workflows/util/install-spark-resources.sh @@ -118,7 +118,7 @@ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 4.0) # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix cd ${INSTALL_DIR} && \ - install_spark "4.0.1" "3" "2.12" + install_spark "4.0.2" "3" "2.12" mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 ;; 4.1) diff --git 
a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 8672e5ecc4d..2fb3a059bec 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1307,7 +1307,7 @@ jobs: pip3 install setuptools==77.0.3 && \ pip3 install pyspark==3.5.5 cython && \ pip3 install pandas==2.2.3 pyarrow==20.0.0 - - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update + - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update run: | rm -rf /opt/shims/spark40 bash .github/workflows/util/install-spark-resources.sh 4.0 @@ -1358,7 +1358,7 @@ jobs: with: name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update + - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update run: | rm -rf /opt/shims/spark40 bash .github/workflows/util/install-spark-resources.sh 4.0 diff --git a/docs/get-started/build-guide.md b/docs/get-started/build-guide.md index cdfad12f67e..8757f6fa130 100644 --- a/docs/get-started/build-guide.md +++ b/docs/get-started/build-guide.md @@ -73,5 +73,5 @@ It's name pattern is `gluten--bundle-spark_< | 3.3.1 | 3.3 | 2.12 | | 3.4.4 | 3.4 | 2.12 | | 3.5.5 | 3.5 | 2.12 | -| 4.0.1 | 4.0 | 2.13 | +| 4.0.2 | 4.0 | 2.13 | | 4.1.1 | 4.1 | 2.13 | diff --git a/docs/get-started/getting-started.md b/docs/get-started/getting-started.md index 34ad08a78f8..0842dc0472d 100644 --- a/docs/get-started/getting-started.md +++ b/docs/get-started/getting-started.md @@ -24,7 +24,7 @@ Gluten supports two native backends: - **OS**: Ubuntu 20.04/22.04 or CentOS 7/8 (other Linux distros may work with static build but are not officially tested) - **JDK**: OpenJDK 8 or 17 (Spark 4.0 requires JDK 17+) -- **Spark**: 3.3.1, 3.4.4, 3.5.5, 4.0.1, or 4.1.1 +- **Spark**: 3.3.1, 3.4.4, 3.5.5, 4.0.2, or 4.1.1 - **Scala**: 2.12 (Spark 4.0 requires Scala 2.13) ### 2. 
Build diff --git a/pom.xml b/pom.xml index 6ab7bcddc26..87bf626a297 100644 --- a/pom.xml +++ b/pom.xml @@ -1269,7 +1269,7 @@ 4.0 spark-sql-columnar-shims-spark40 - 4.0.1 + 4.0.2 1.10.0 delta-spark 4.0.1 diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml index 7dcce0e4c5d..021c6ec2519 100644 --- a/tools/gluten-it/pom.xml +++ b/tools/gluten-it/pom.xml @@ -320,7 +320,7 @@ spark-4.0 - 4.0.1 + 4.0.2 2.13.17 2.13 delta-spark From fbb48049470bad301ae029d8754dec92e3945859 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 3 Jun 2026 16:03:24 +0000 Subject: [PATCH 2/2] [MINOR][TEST] Port SPARK-54439 KeyGroupedPartitioning tests for Spark 4.0 Spark 4.0.2 picks up SPARK-54439 (apache/spark#53142), a correctness fix in KeyGroupedShuffleSpec.createPartitioning() with two new tests in KeyGroupedPartitioningSuite. The vanilla tests use the base collectShuffles helper which only matches ShuffleExchangeExec, so they fail under Gluten where the shuffle is a ColumnarShuffleExchangeExec. Rather than only excluding them, exclude the vanilla versions in VeloxTestSettings and port them as testGluten overrides (same pattern as the existing SPARK-41471 tests) so they reuse the columnar-aware collectShuffles helper and keep coverage of the correctness fix. Locally verified on Velox backend (Spark 4.0.2, Scala 2.13): both new tests pass (shuffles.size == 1 and checkAnswer), with no change to the set of pre-existing suite failures. 
Generated-by: Claude Opus 4.8 --- .../utils/velox/VeloxTestSettings.scala | 1 + .../GlutenKeyGroupedPartitioningSuite.scala | 56 +++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index f564324b141..2c8b508d9ce 100644 --- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -80,6 +80,7 @@ class VeloxTestSettings extends BackendTestSettings { .excludeByPrefix("SPARK-48012") .excludeByPrefix("SPARK-44647") .excludeByPrefix("SPARK-41471") + .excludeByPrefix("SPARK-54439") // disable due to check for SMJ node .excludeByPrefix("SPARK-41413: partitioned join:") .excludeByPrefix("SPARK-42038: partially clustered:") diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala index 8e2e4ca47f0..afe0cb7969c 100644 --- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala +++ b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala @@ -1210,6 +1210,62 @@ class GlutenKeyGroupedPartitioningSuite } } + testGluten("SPARK-54439: KeyGroupedPartitioning and join key size mismatch") { + val items_partitions = Array(identity("id")) + createTable(items, itemsColumns, items_partitions) + + sql( + s"INSERT INTO testcat.ns.$items VALUES " + + "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " + + "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))") + + createTable(purchases, purchasesColumns, Array.empty) + sql( + s"INSERT INTO testcat.ns.$purchases 
VALUES " + + "(1, 42.0, cast('2020-01-01' as timestamp)), " + + "(3, 19.5, cast('2020-02-01' as timestamp))") + + withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") { + // `time` and `item_id` in the required `ClusteredDistribution` for `purchases`, but `item` is + // storage partitioned only by `id` + val df = createJoinTestDF(Seq("arrive_time" -> "time", "id" -> "item_id")) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.size == 1, "only shuffle one side not report partitioning") + + checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0))) + } + } + + testGluten("SPARK-54439: KeyGroupedPartitioning with transform and join key size mismatch") { + // Do not use `bucket()` in "one side partition" tests as its implementation in + // `InMemoryBaseTable` conflicts with `BucketFunction` + val items_partitions = Array(years("arrive_time")) + createTable(items, itemsColumns, items_partitions) + + sql( + s"INSERT INTO testcat.ns.$items VALUES " + + "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + "(1, 'bb', 10.0, cast('2021-01-01' as timestamp)), " + + "(4, 'cc', 15.5, cast('2021-02-01' as timestamp))") + + createTable(purchases, purchasesColumns, Array.empty) + sql( + s"INSERT INTO testcat.ns.$purchases VALUES " + + "(1, 42.0, cast('2020-01-01' as timestamp)), " + + "(3, 19.5, cast('2021-02-01' as timestamp))") + + withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") { + // `item_id` and `time` in the required `ClusteredDistribution` for `purchases`, but `item` is + // storage partitioned only by `year(arrive_time)` + val df = createJoinTestDF(Seq("id" -> "item_id", "arrive_time" -> "time")) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.size == 1, "only shuffle one side not report partitioning") + + checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0))) + } + } + testGluten( "SPARK-44647: SPJ: test join key is subset of cluster key " + "with push values and partially-clustered") {