diff --git a/.github/workflows/util/install-spark-resources.sh b/.github/workflows/util/install-spark-resources.sh index 5988f383628..bfbb55f55d7 100755 --- a/.github/workflows/util/install-spark-resources.sh +++ b/.github/workflows/util/install-spark-resources.sh @@ -118,7 +118,7 @@ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then 4.0) # Spark-4.0, scala 2.12 // using 2.12 as a hack as 4.0 does not have 2.13 suffix cd ${INSTALL_DIR} && \ - install_spark "4.0.1" "3" "2.12" + install_spark "4.0.2" "3" "2.12" mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13 ;; 4.1) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 8672e5ecc4d..2fb3a059bec 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1307,7 +1307,7 @@ jobs: pip3 install setuptools==77.0.3 && \ pip3 install pyspark==3.5.5 cython && \ pip3 install pandas==2.2.3 pyarrow==20.0.0 - - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update + - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update run: | rm -rf /opt/shims/spark40 bash .github/workflows/util/install-spark-resources.sh 4.0 @@ -1358,7 +1358,7 @@ jobs: with: name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - - name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update + - name: Prepare Spark Resources for Spark 4.0.2 #TODO remove after image update run: | rm -rf /opt/shims/spark40 bash .github/workflows/util/install-spark-resources.sh 4.0 diff --git a/docs/get-started/build-guide.md b/docs/get-started/build-guide.md index cdfad12f67e..8757f6fa130 100644 --- a/docs/get-started/build-guide.md +++ b/docs/get-started/build-guide.md @@ -73,5 +73,5 @@ It's name pattern is `gluten--bundle-spark_< | 3.3.1 | 3.3 | 2.12 | | 3.4.4 | 3.4 | 2.12 | | 3.5.5 | 3.5 | 2.12 | -| 4.0.1 | 4.0 | 2.13 | +| 4.0.2 | 4.0 | 2.13 | | 4.1.1 | 4.1 | 2.13 | diff --git a/docs/get-started/getting-started.md b/docs/get-started/getting-started.md index 34ad08a78f8..0842dc0472d 100644 --- a/docs/get-started/getting-started.md +++ b/docs/get-started/getting-started.md @@ -24,7 +24,7 @@ Gluten supports two native backends: - **OS**: Ubuntu 20.04/22.04 or CentOS 7/8 (other Linux distros may work with static build but are not officially tested) - **JDK**: OpenJDK 8 or 17 (Spark 4.0 requires JDK 17+) -- **Spark**: 3.3.1, 3.4.4, 3.5.5, 4.0.1, or 4.1.1 +- **Spark**: 3.3.1, 3.4.4, 3.5.5, 4.0.2, or 4.1.1 - **Scala**: 2.12 (Spark 4.0 requires Scala 2.13) ### 2. Build diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index f564324b141..2c8b508d9ce 100644 --- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -80,6 +80,7 @@ class VeloxTestSettings extends BackendTestSettings { .excludeByPrefix("SPARK-48012") .excludeByPrefix("SPARK-44647") .excludeByPrefix("SPARK-41471") + .excludeByPrefix("SPARK-54439") // disable due to check for SMJ node .excludeByPrefix("SPARK-41413: partitioned join:") .excludeByPrefix("SPARK-42038: partially clustered:") diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala index 8e2e4ca47f0..afe0cb7969c 100644 --- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala +++ b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/connector/GlutenKeyGroupedPartitioningSuite.scala @@ -1210,6 +1210,62 @@ class GlutenKeyGroupedPartitioningSuite } } + testGluten("SPARK-54439: KeyGroupedPartitioning and join key size mismatch") { + val items_partitions = Array(identity("id")) + createTable(items, itemsColumns, items_partitions) + + sql( + s"INSERT INTO testcat.ns.$items VALUES " + + "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " + + "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))") + + createTable(purchases, purchasesColumns, Array.empty) + sql( + s"INSERT INTO testcat.ns.$purchases VALUES " + + "(1, 42.0, cast('2020-01-01' as timestamp)), " + + "(3, 19.5, cast('2020-02-01' as timestamp))") + + withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") { + // `time` and `item_id` in the required `ClusteredDistribution` for `purchases`, but `item` is + // storage partitioned only by `id` + val df = createJoinTestDF(Seq("arrive_time" -> "time", "id" -> "item_id")) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.size == 1, "only shuffle one side not report partitioning") + + checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0))) + } + } + + testGluten("SPARK-54439: KeyGroupedPartitioning with transform and join key size mismatch") { + // Do not use `bucket()` in "one side partition" tests as its implementation in + // `InMemoryBaseTable` conflicts with `BucketFunction` + val items_partitions = Array(years("arrive_time")) + createTable(items, itemsColumns, items_partitions) + + sql( + s"INSERT INTO testcat.ns.$items VALUES " + + "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " + + "(1, 'bb', 10.0, cast('2021-01-01' as timestamp)), " + + "(4, 'cc', 15.5, cast('2021-02-01' as timestamp))") + + createTable(purchases, purchasesColumns, Array.empty) + sql( + s"INSERT INTO testcat.ns.$purchases VALUES " + + "(1, 42.0, cast('2020-01-01' as timestamp)), " + + "(3, 19.5, cast('2021-02-01' as timestamp))") + + withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") { + // `item_id` and `time` in the required `ClusteredDistribution` for `purchases`, but `item` is + // storage partitioned only by `year(arrive_time)` + val df = createJoinTestDF(Seq("id" -> "item_id", "arrive_time" -> "time")) + val shuffles = collectShuffles(df.queryExecution.executedPlan) + assert(shuffles.size == 1, "only shuffle one side not report partitioning") + + checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0))) + } + } + testGluten( "SPARK-44647: SPJ: test join key is subset of cluster key " + "with push values and partially-clustered") { diff --git a/pom.xml b/pom.xml index 6ab7bcddc26..87bf626a297 100644 --- a/pom.xml +++ b/pom.xml @@ -1269,7 +1269,7 @@ 4.0 spark-sql-columnar-shims-spark40 - 4.0.1 + 4.0.2 1.10.0 delta-spark 4.0.1 diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml index 7dcce0e4c5d..021c6ec2519 100644 --- a/tools/gluten-it/pom.xml +++ b/tools/gluten-it/pom.xml @@ -320,7 +320,7 @@ spark-4.0 - 4.0.1 + 4.0.2 2.13.17 2.13 delta-spark