Changes from all commits
1222 commits
49a3c13
[SPARK-53632][PYTHON][DOCS][TESTS] Reenable doctest for `DataFrame.pa…
zhengruifeng Sep 18, 2025
2639792
[SPARK-53523][SQL][FOLLOWUP] Udpate scaladocs and add tests in Proced…
pan3793 Sep 18, 2025
fb46424
[SPARK-53626][DOCS] Add invalid mixed-type operations to ANSI migrati…
xinrong-meng Sep 18, 2025
552effc
[SPARK-53637][BUILD] Demote bcprov-jdk18on to test scope
pan3793 Sep 18, 2025
a8bb8b0
[SPARK-53625][SS] Propagate metadata columns through projections to a…
liviazhu Sep 19, 2025
db13a38
[SPARK-53578][CONNECT] Simplify data type handling in LiteralValuePro…
heyihong Sep 19, 2025
4f10262
[SPARK-53623][SQL] improve reading large table properties performance
yeshengm Sep 19, 2025
589141e
[SPARK-53233][SQL][FOLLOWUP] Add compatibility class/object for org.a…
cloud-fan Sep 19, 2025
686d844
[SPARK-53592][PYTHON] Make `@udf` support vectorized UDF
zhengruifeng Sep 20, 2025
71c67b0
[SPARK-53641][DOCS] Add PARTITION BY support in Arrow Python UDTF docs
allisonwang-db Sep 22, 2025
36ed5ee
[SPARK-53429][PYTHON] Support Direct Passthrough Partitioning in the …
shujingyang-db Sep 22, 2025
f48de10
[SPARK-53592][PYTHON][TESTS][FOLLOW-UP] Remove unused config in the p…
zhengruifeng Sep 22, 2025
984e16b
[SPARK-53657][PYTHON][TESTS] Enable doctests for `GroupedData.agg`
zhengruifeng Sep 22, 2025
ed2692f
[SPARK-53654][SQL][PYTHON] Support `seed` in function `uuid`
zhengruifeng Sep 22, 2025
f9aa4c9
[SPARK-53655][SQL][TESTS] Fix the intention of 'read parquet footers …
yaooqinn Sep 22, 2025
c37ab6e
[SPARK-53653][DOC] Update `rexml` gem version to 3.4.4
bjornjorgensen Sep 22, 2025
69031c9
[SPARK-53661][BUILD][TESTS] Upgrade `bouncycastle` to 1.82
dongjoon-hyun Sep 22, 2025
a2adc43
[SPARK-53668][BUILD] Add `--enable-native-access=ALL-UNNAMED` to `bui…
dongjoon-hyun Sep 22, 2025
1e7169e
[SPARK-53660][SQL][TESTS] Add unit test for Metadata equality check
Yicong-Huang Sep 23, 2025
33196fe
[SPARK-53643][DOCS] Add Arrow UDF to debugging and user guide
xinrong-meng Sep 23, 2025
dde895c
[SPARK-53651][SDP] Add support for persistent views in pipelines
sryza Sep 23, 2025
cf30da2
[SPARK-47110][INFRA] Reenble AmmoniteTest tests in Maven builds
sarutak Sep 23, 2025
fdcd140
[SPARK-53629][SQL] Implement type widening for MERGE INTO WITH SCHEMA…
szehon-ho Sep 23, 2025
0e42b95
[SPARK-53673][CONNECT][TESTS] Fix a flaky test failure in `SparkSessi…
sarutak Sep 23, 2025
b6993cb
[SPARK-53516][SDP] Fix `spark.api.mode` arg process in SparkPipelines
pan3793 Sep 23, 2025
a13187c
[SPARK-53676][PYTHON][TESTS] Skip UDF type check with numpy 1.x
zhengruifeng Sep 23, 2025
c6cea73
[SPARK-53591][SDP] Simplify Pipeline Spec Pattern Glob Matching
jackywang-db Sep 23, 2025
1841dd2
[SPARK-53516][CORE][TESTS][FOLLOWUP] Fix compilation errors in `Spark…
LuciferYang Sep 23, 2025
e95f12b
[SPARK-53633][SQL] Reuse InputStream in vectorized Parquet reader
pan3793 Sep 23, 2025
2a9999f
[SPARK-53671][PYTHON] Exclude 0-args from `@udf` eval type inference
zhengruifeng Sep 24, 2025
fa9e787
[SPARK-53425][PYTHON][TESTS] Add more table argument tests for Arrow …
allisonwang-db Sep 24, 2025
661d611
[SPARK-53645][PS] Implement `skipna` parameter for ps.DataFrame `any()`
petern48 Sep 24, 2025
7dfd9e2
[SPARK-53681][BUILD][TESTS] Upgrade `snowflake-jdbc` to 3.26.1
dongjoon-hyun Sep 24, 2025
8ac0382
[SPARK-53689][BUILD] Respect RELEASE_VERSION environment variable if …
HyukjinKwon Sep 24, 2025
b4592c4
[SPARK-53678][SQL] Fix NPE when subclass of ColumnVector is created w…
manuzhang Sep 24, 2025
06f7ad2
[SPARK-53682][INFRA] Refresh `spark-rm` Docker Image with `jammy-2025…
dongjoon-hyun Sep 24, 2025
de7ba3b
[SPARK-53631][CORE] Optimize memory and perf on SHS bootstrap
pan3793 Sep 24, 2025
b1bbf02
[SPARK-53688][PYTHON][INFRA][TESTS] Increase the timeout and skip UDF…
zhengruifeng Sep 24, 2025
3f2c623
[SPARK-53689][BUILD][FOLLOW-UP] Respect RELEASE_VERSION environment v…
HyukjinKwon Sep 24, 2025
b005f56
[SPARK-53689][BUILD][FOLLOW-UP] Check if RELEASE_VERSION is already s…
HyukjinKwon Sep 24, 2025
4418d6e
[SPARK-53562][PYTHON] Limit Arrow batch sizes in `applyInArrow` and `…
zhengruifeng Sep 24, 2025
35c7208
[SPARK-53674][SQL] Handle single-pass analyzer LCAs when assigning al…
mihailotim-db Sep 24, 2025
997e538
[SPARK-53677][SQL] Improve debuggability for JDBC data source when qu…
urosstan-db Sep 24, 2025
32f7b43
[SPARK-53692][DOCS] Remove a unused configuration spark.appStatusStor…
yaooqinn Sep 24, 2025
466c608
[SPARK-53667][SQL] Fix EXPLAIN for CALL with IDENTIFIER
aokolnychyi Sep 24, 2025
ed326b2
[SPARK-52844][PYTHON] Update `protobuf` Python package to 5.29.5
eschcam Sep 24, 2025
969a342
[SPARK-53492][CONNECT] Reject second ExecutePlan with an operation id…
nija-at Sep 24, 2025
f3a69b2
[SPARK-53356][PYTHON][DOCS] Small improvements to python data source …
sryza Sep 24, 2025
25c550e
[SPARK-53695][PYTHON][TESTS] Add tests for 0-arg grouped agg UDF
zhengruifeng Sep 24, 2025
a8cdae8
[SPARK-53112][SQL][PYTHON][CONNECT] Support TIME in the make_timestam…
Yicong-Huang Sep 25, 2025
a863eee
[SPARK-53694][SQL][TESTS] Improve `V1WriteHiveCommandSuite` test cove…
pan3793 Sep 25, 2025
f7309f2
[SPARK-53709][BUILD] Upgrade Junit to 5.13.4
LuciferYang Sep 25, 2025
5aa9057
[SPARK-53708][BUILD] Upgrade `ZooKeeper` to 3.9.4
dongjoon-hyun Sep 25, 2025
ba92e8e
[SPARK-51831][SQL] Column pruning with existsJoin for Datasource V2
jackylee-ch Sep 25, 2025
70ce0cf
[SPARK-53695][PYTHON][TESTS][FOLLOW-UP] Additional test for type hint
zhengruifeng Sep 25, 2025
64b0cb6
[SPARK-53372][INFRA][FOLLOWUP] Synchronize the installation of `pyyam…
LuciferYang Sep 25, 2025
b37d51a
[SPARK-53700][SQL] Remove redundancy in `DataSourceV2RelationBase.sim…
aokolnychyi Sep 25, 2025
76d9a7b
[SPARK-53717][SQL] Revise `MapType.valueContainsNull` parameter comme…
heyihong Sep 25, 2025
6ff9edc
[SPARK-52407][SQL] Add support for Theta Sketch
Sep 25, 2025
00928f4
[SPARK-53716][PYTHON][DOCS] Document vectorized UDF with `@udf`
zhengruifeng Sep 25, 2025
75cb479
[SPARK-53112][SQL][PYTHON][CONNECT][FOLLOW-UP] Change date and time t…
Yicong-Huang Sep 26, 2025
2f2abd5
[SPARK-53719][SQL][PYTHON][CONNECT] Enhance type checking in `_to_col…
Yicong-Huang Sep 26, 2025
cb28340
[SPARK-53700][SQL][TEST][FOLLOWUP] Regenerate `ProtoToParsedPlanTestS…
dongjoon-hyun Sep 26, 2025
75267bc
[SPARK-53715][SQL] Refactor getWritePrivileges for MergeIntoTable
beliefer Sep 26, 2025
3d19a65
[SPARK-52407][SQL][TESTS][FOLLOWUP] Regenerate thetasketch.sql_analyz…
yaooqinn Sep 26, 2025
984d578
[SPARK-53729][PYTHON][CONNECT] Fix serialization of `pyspark.sql.conn…
zhengruifeng Sep 26, 2025
470d622
[SPARK-53112][PYTHON][TESTS][FOLLOW-UP] Remove some tests for Non-ANSI
zhengruifeng Sep 26, 2025
0cad1cd
[SPARK-53707] Improve attribute metadata handling
ksbeyer Sep 26, 2025
65b9da5
[SPARK-46679][SQL] Fix for SparkUnsupportedOperationException Not fou…
szehon-ho Sep 26, 2025
e56ab2f
[SPARK-53593][SDP] Add response field for DefineDataset and DefineFlo…
cookiedough77 Sep 26, 2025
9e12201
[SPARK-49547][SQL][PYTHON] Add iterator of `RecordBatch` API to `appl…
Kimahriman Sep 27, 2025
6cdc62e
Revert "[SPARK-53015][BUILD] Upgrade log4j to 2.25.1"
pan3793 Sep 27, 2025
7a0bf9e
[SPARK-53127][SQL][FOLLOWUP] Clean up golden files for and add commen…
Pajaraja Sep 28, 2025
573c7da
[MINOR][INFRA] Free disk space in releasing workflow
HyukjinKwon Sep 29, 2025
922adad
[SPARK-53575][CORE] Retry entire consumer stages when checksum mismat…
ivoson Sep 29, 2025
776ffd5
[SPARK-53735][SDP] Hide server-side JVM stack traces by default in sp…
sryza Sep 29, 2025
1692b55
[SPARK-53728][SDP] Print PipelineEvent Message with Error In Test
jackywang-db Sep 29, 2025
746db3d
[SPARK-53743][SS] Remove the usage of fetchWithArrow in ListState.put…
HeartSaVioR Sep 30, 2025
3c8c714
[SPARK-53574][SQL][FOLLOWUP] still reset the default AnalysisContext
cloud-fan Sep 30, 2025
46ac78e
[SPARK-53734][SQL] Prefer table column over LCA when resolving array …
mihailotim-db Sep 30, 2025
d5729f0
[SPARK-53621][CORE] Adding Support for Executing CONTINUE HANDLER
TeodorDjelic Sep 30, 2025
0e9e27d
[SPARK-53766][DOC] Improve execute immediate docs
srielau Sep 30, 2025
5dc061c
[SPARK-53593][SDP] Fix: Use unquoted for response fields
cookiedough77 Sep 30, 2025
e04fd59
[SPARK-52807][SDP] Proto changes to support analysis inside Declarati…
SCHJonathan Oct 1, 2025
ca9c054
[SPARK-53536][CORE] Adding a Golden File Test With Randomly Generated…
TeodorDjelic Oct 1, 2025
2ed58ab
[SPARK-53773][SQL] Recover alphabetic ordering of rules in `RuleIdCol…
dongjoon-hyun Oct 1, 2025
6526293
[SPARK-53741][BUILD] Upgrade ORC to 2.2.1
dongjoon-hyun Oct 1, 2025
0ecb519
[SPARK-52614][SQL] Support RowEncoder inside Product Encoder
eejbyfeldt Oct 1, 2025
848e47a
[SPARK-53737][SQL][SS] Add Real-time Mode trigger
jerrypeng Oct 2, 2025
7ed0e37
[SPARK-53536][CORE][FOLLOWUP] Fixing Flakiness of Golden File Test Wi…
TeodorDjelic Oct 2, 2025
65ff85a
[SPARK-52640][SDP] Propagate Python Source Code Location
anishm-db Oct 2, 2025
1124b09
[SPARK-53691][PS][INFRA][TESTS] Further reorganize tests for Pandas API
zhengruifeng Oct 6, 2025
08bd390
[SPARK-53794][SS] Add option to limit deletions per maintenance opera…
anishshri-db Oct 6, 2025
fd02372
[SPARK-53804][SQL] Support TIME radix sort
bersprockets Oct 7, 2025
6bcd095
[SPARK-53784] Additional Source APIs needed to support RTM execution
jerrypeng Oct 7, 2025
c3d6eea
[SPARK-53788][CORE] Move VersionUtils to `common` module
pan3793 Oct 7, 2025
be1de5b
[SPARK-53802][SDP] Support string values for user-specified schema in…
sryza Oct 7, 2025
e312318
[SPARK-53621][CORE][FOLLOWUP] Adding Comments And Enriching Tests of …
TeodorDjelic Oct 7, 2025
e0cb512
[SPARK-53795][CONNECT] Remove unused parameters in LiteralValueProtoC…
heyihong Oct 8, 2025
8bf6640
[SPARK-53638][SS][PYTHON] Limit the byte size of arrow batch for TWS …
zeruibao Oct 8, 2025
c5f007b
[SPARK-53829][PYTHON] Support `datetime.time` in column operators
zhengruifeng Oct 8, 2025
032a4e7
[SPARK-53831][INFRA] Update script `free_disk_space`
zhengruifeng Oct 8, 2025
ee15beb
[SPARK-53808][CONNECT] Allow to pass optional JVM args to `spark-conn…
sarutak Oct 8, 2025
57a9fc9
[SPARK-53832][K8S] Make `KubernetesClientUtils` Java-friendly
dongjoon-hyun Oct 8, 2025
011b2b8
[SPARK-53833][PYTHON] Update `dev/requirements.txt` to skip `torch/to…
dongjoon-hyun Oct 8, 2025
22d9709
[SPARK-53738][SQL] PlannedWrite should preserve custom sort order whe…
pan3793 Oct 8, 2025
3b51e19
[SPARK-53834][INFRA] Add a separate docker file for Python 3.14 daily…
dongjoon-hyun Oct 8, 2025
b83d701
[SPARK-53836][INFRA] Update script `free_disk_space_container`
zhengruifeng Oct 8, 2025
4cc0b51
[SPARK-53806][SQL] Allow empty input on AES decrypt to have error class
richardc-db Oct 8, 2025
c393419
[SPARK-53125][TEST] RemoteSparkSession prints whole `spark-submit` co…
pan3793 Oct 8, 2025
65336a0
[SPARK-53810][SS][TESTS] Split large TWS python tests into multiple s…
huanliwang-db Oct 8, 2025
9ebe4a2
[SPARK-53792][SS] Fix rocksdbPinnedBlocksMemoryUsage when bounded mem…
Oct 8, 2025
9a1c742
[SPARK-53751][SDP] Explicit Versioned Checkpoint Location
jackywang-db Oct 8, 2025
bf2457b
[SPARK-53843][BUILD] Upgrade `netty-tcnative` to 2.0.74.Final
dongjoon-hyun Oct 8, 2025
d6f713e
[SPARK-53844][TESTS] Remove `SPARK_JENKINS*` and related logics from …
dongjoon-hyun Oct 8, 2025
22e24df
[SPARK-53812][SDP] Refactor DefineDataset and DefineFlow protos to gr…
sryza Oct 8, 2025
18f0463
[SPARK-53846][PYTHON][TESTS] Skip `test_profile_pandas_*` tests if pa…
dongjoon-hyun Oct 9, 2025
49e2c9e
[SPARK-53849][BUILD] Upgrade Netty to 4.2.6.Final
dongjoon-hyun Oct 9, 2025
2a5d03a
[SPARK-53854][PYTHON][TESTS] Skip `test_collect_time` test if pandas …
dongjoon-hyun Oct 9, 2025
ce3437a
Revert "[SPARK-53738][SQL] PlannedWrite should preserve custom sort o…
peter-toth Oct 9, 2025
37f8df4
[SPARK-53507][CORE] Don't use case class for BreakingChangeInfo
imarkowitz Oct 9, 2025
26ba0ed
[SPARK-53564][CORE] Avoid DAGScheduler exits due to blockManager RPC …
ivoson Oct 9, 2025
05a1ffb
[SPARK-51169][PYTHON] Add Python 3.14 support in Spark Classic
zhengruifeng Oct 9, 2025
efcc8f6
[SPARK-53850][SDP] Define proto for Sinks and Rename DefineDataset to…
jackywang-db Oct 9, 2025
282e7d3
[SPARK-53862][DSTREAM][TESTS] Fix `CheckpointSuite.'get correct spark…
dongjoon-hyun Oct 10, 2025
b34bc29
[SPARK-53860][BUILD][TESTS] Upgrade `sbt-jupiter-interface` to 0.17.0
dongjoon-hyun Oct 10, 2025
7b7cb9a
[SPARK-53562][PYTHON][TESTS][FOLLOW-UP] Add more tests for `maxBytesP…
zhengruifeng Oct 10, 2025
dc11b66
[MINOR][PYTHON][TESTS] Retry `test_observe_with_map_type`
zhengruifeng Oct 10, 2025
cd6d701
[SPARK-53859][BUILD][TESTS] Upgrade `JUnit` to 6.0.0
dongjoon-hyun Oct 10, 2025
ef1eec4
[SPARK-53858][PYTHON][TESTS] Skip doctests in `pyspark.sql.functions.…
zhengruifeng Oct 10, 2025
ea0f4fa
[MINOR][DOCS] Add `build_python_3.14.yml` to `Build Pipeline Status` …
dongjoon-hyun Oct 10, 2025
f041118
[SPARK-53861][PYTHON][INFRA] Factor out streaming tests from `pyspark…
zhengruifeng Oct 10, 2025
cd23ad5
[SPARK-53856][CORE] Remove `blacklist` alternative config names
dongjoon-hyun Oct 10, 2025
6032a40
[SPARK-53779][SQL][CONNECT] Implement `transform()` in Column API
Yicong-Huang Oct 10, 2025
ed1140f
[SPARK-53786][SQL] Default value with special column name should not …
szehon-ho Oct 10, 2025
320d09d
[SPARK-53605][INFRA] Restore pyspark execution timeout to 2 hours
zhengruifeng Oct 10, 2025
474731f
[SPARK-53866][PYTHON][TESTS] Skip doctest `pyspark.sql.pandas.functio…
zhengruifeng Oct 10, 2025
b616068
[SPARK-53864][BUILD] Upgrade `commons-lang3` to 3.19.0
LuciferYang Oct 10, 2025
6eb4d3c
[SPARK-53865][SQL] Extract common logic from ResolveGenerate rule
mikhailnik-db Oct 10, 2025
0702d58
[SPARK-53822][PYTHON][SS][TESTS] Add Python TransformWithState test c…
jiateoh Oct 10, 2025
a35c9f3
[SPARK-53805][SQL] Push Variant into DSv2 scan
huaxingao Oct 10, 2025
72fc87b
[SPARK-53690][SS] Fix exponential formatting of avgOffsetsBehindLates…
jayantdb Oct 11, 2025
418cf56
[SPARK-53796][SDP] Add `extension` field to a few pipeline protos to …
SCHJonathan Oct 11, 2025
9aab260
[SPARK-53868][SQL] Only use signature with Expression[] of `visitAgg…
alekjarmov Oct 12, 2025
81d9d1f
[SPARK-53847] Add ContinuousMemorySink for Real-time Mode testing
jerrypeng Oct 12, 2025
3f663bf
[SPARK-53870][PYTHON][SS] Fix partial read bug for large proto messag…
jiateoh Oct 12, 2025
47e4108
[SPARK-53879] Upgrade `Ammonite` to 3.0.3
dongjoon-hyun Oct 12, 2025
00d2a54
[SPARK-53881][BUILD][TESTS] Upgrade `Selenium` to 4.32.0
dongjoon-hyun Oct 12, 2025
3ac4a48
[SPARK-53585][BUILD] Upgrade Scala to 2.13.17
vrozov Oct 13, 2025
264ca4d
[SPARK-53455][CONNECT] Add `CloneSession` RPC
vicennial Oct 13, 2025
343a25b
[SPARK-53868][SQL] Use array length check instead of direct reference…
alekjarmov Oct 13, 2025
56f8b3b
[SPARK-53877] Introduce BITMAP_AND_AGG function
uros7251brick Oct 13, 2025
e38a651
[SPARK-53845] SDP Sinks
jackywang-db Oct 13, 2025
d72e6a3
[SPARK-53884][BUILD] Upgrade `ZSTD-JNI` to 1.5.7-5
dongjoon-hyun Oct 13, 2025
f92816c
[SPARK-53878][SQL][CONNECT] Fix race condition issue related to Obser…
sarutak Oct 13, 2025
6bae835
[SPARK-53720][SQL] Simplify extracting Table from DataSourceV2Relatio…
aokolnychyi Oct 13, 2025
0cb933e
Revert "[MINOR][PYTHON][TESTS] Retry `test_observe_with_map_type`"
sarutak Oct 14, 2025
710c607
[SPARK-53892][SS] Use `DescribeTopicsResult.allTopicNames` instead of…
dongjoon-hyun Oct 14, 2025
7d028c6
[SPARK-53609][PYTHON] Limit Arrow batch sizes in SQL_GROUPED_AGG_PAND…
zhengruifeng Oct 14, 2025
8898ec9
[SPARK-53893][TESTS] Regenerate benchmark results after upgrading to …
dongjoon-hyun Oct 14, 2025
6f0f587
[SPARK-53894][BUILD][TESTS] Upgrade `docker-java` to 3.6.0
dongjoon-hyun Oct 14, 2025
1e515d3
[MINOR][DOCS] Fix 404 for Python Package Management link in rdd-progr…
yaooqinn Oct 14, 2025
e6a76df
[SPARK-53896][CORE] Enable `spark.io.compression.lzf.parallel.enabled…
dongjoon-hyun Oct 14, 2025
37564db
[SPARK-51426][PYTHON][SQL] Fix 'Setting metadata to empty dict does n…
petern48 Oct 14, 2025
85c9fd1
[SPARK-53867][PYTHON] Limit Arrow batch sizes in SQL_GROUPED_AGG_ARRO…
zhengruifeng Oct 14, 2025
cba28ea
[SPARK-53841][PYTHON][CONNECT] Implement `transform()` in Column API
Yicong-Huang Oct 14, 2025
800309e
[SPARK-53857] Enable messageTemplate propagation to SparkThrowable
miland-db Oct 14, 2025
ff0f1ab
[SPARK-53900][CONNECT] Fix unintentional `Thread.wait(0)` under rare …
vicennial Oct 14, 2025
e05c75e
[SPARK-53906][K8S] Protect `ExecutorPodsAllocator.numOutstandingPods`…
dongjoon-hyun Oct 14, 2025
4eacd0b
[SPARK-53907][K8S] Support `spark.kubernetes.allocation.maximum`
dongjoon-hyun Oct 15, 2025
9ae8198
[SPARK-53611][PYTHON] Limit Arrow batch sizes in window agg UDFs
zhengruifeng Oct 15, 2025
2d22064
[SPARK-53897][CONNECT][TESTS] Add dependency checks for Python-relate…
LuciferYang Oct 15, 2025
694cc72
[SPARK-53895][SS][TESTS] Add `ContinuousMemorySuite`
jerrypeng Oct 15, 2025
8ff9feb
[SPARK-53760][GEO][SQL] Introduce GeometryType and GeographyType
uros-db Oct 15, 2025
e3e6982
[SPARK-53867][PYTHON][FOLLOW-UP] Fix `pa.concat_batches` for old pyar…
zhengruifeng Oct 15, 2025
98010b3
[SPARK-53913][DOCS] Document newly added K8s configurations
dongjoon-hyun Oct 15, 2025
abe7853
[SPARK-53789][SQL][CONNECT] Canonicalize error condition CANNOT_MODIF…
pan3793 Oct 15, 2025
17d15a4
[SPARK-53762][SQL] Add date and time conversions simplifier rule to o…
peter-toth Oct 15, 2025
9e14f5f
[SPARK-53111][SQL][PYTHON][CONNECT] Implement the time_diff function …
uros-db Oct 15, 2025
b0285f8
[SPARK-53902][SQL] Add tree node pattern bits for supported expressio…
mihailoale-db Oct 15, 2025
fcd8371
[SPARK-53923][CORE] Rename `spark.executor.(log -> logs).redirectCons…
dongjoon-hyun Oct 15, 2025
2ae81cd
[SPARK-53925][INFRA] Use `MacOS 26` in `build_maven_java21_macos15.yml`
dongjoon-hyun Oct 15, 2025
bc8020f
[SPARK-53919][BUILD] Make Maven plugins up-to-date
dongjoon-hyun Oct 15, 2025
032dcf8
[SPARK-53926][DOCS] Document newly added `core` module configurations
dongjoon-hyun Oct 16, 2025
61a024c
[SPARK-53916][PYTHON] Deduplicate the variables in PythonArrowInput
zhengruifeng Oct 16, 2025
9823daf
[SPARK-53931][INFRA][PYTHON] Fix scheduled job for numpy 2.1.3
zhengruifeng Oct 16, 2025
deb7b62
[SPARK-53925][FOLLOW-UP][DOCS] Fix the link in README
zhengruifeng Oct 16, 2025
83be7e7
[SPARK-53929][SQL] Support TIME in the make_timestamp and try_make_ti…
uros-db Oct 16, 2025
24a6abf
[SPARK-53935][BUILD] SBT assembly should handle META-INF correctly
pan3793 Oct 16, 2025
ea71991
[SPARK-53936][BUILD] Upgrade sbt-pom-reader from 2.4.0 to 2.5.0
gemelen Oct 16, 2025
eb117a6
[SPARK-53908][CONNECT] Fix observations on Spark Connect with plan cache
ueshin Oct 16, 2025
983d384
[SPARK-53573][SQL] Use Pre-processor for generalized parameter marker…
srielau Oct 16, 2025
f689ff7
[SPARK-53939][PYTHON] Use batch.num_columns instead of len(batch.colu…
ueshin Oct 17, 2025
136201a
[SPARK-53927][BUILD][DSTREAM] Upgrade kinesis client and fix kinesis …
vrozov Oct 17, 2025
d799aa7
[SPARK-53149][CORE] Fix testing whether BeeLine process run in backgr…
pan3793 Oct 17, 2025
2f57459
[MINOR][PYTHON][DOCS] Fix the docstring of `assert_true`
zhengruifeng Oct 17, 2025
c041671
[SPARK-53943][PYTHON][DOCS] Add examples for function unwrap_udt
zhengruifeng Oct 17, 2025
8499a62
[SPARK-53785][SS] Memory Source for RTM
jerrypeng Oct 17, 2025
1c0bca9
[SPARK-52798][SQL] Add function approx_top_k_combine
yhuang-db Oct 18, 2025
2c7bc89
[SPARK-53760][GEO][SQL][FOLLOWUP] Fix error message and comment for S…
uros-db Oct 18, 2025
dd8d13b
[SPARK-53944][K8S] Support `spark.kubernetes.executor.useDriverPodIP`
dongjoon-hyun Oct 18, 2025
5edebd2
[SPARK-53938][PYTHON][CONNECT] Fix decimal rescaling in LocalDataToAr…
zhengruifeng Oct 20, 2025
5ae573f
[SPARK-53656][SS] Refactor MemoryStream to use SparkSession instead o…
ganeshas-db Oct 20, 2025
9c38696
[SPARK-53945][BUILD] Upgrade `semanticdb-shared` to `4.13.10`
sarutak Oct 20, 2025
4ab7d5a
[SPARK-53949][CONNECT] Use `Utils.getRootCause` instead of `Throwable…
LuciferYang Oct 20, 2025
b1f5428
[SPARK-42857][PYTHON][TESTS] Enable parity test `test_supported_types`
zhengruifeng Oct 20, 2025
2bb73fb
[SPARK-53950][BUILD] Upgrade scala-xml to 2.4.0
LuciferYang Oct 20, 2025
9c40c12
[SPARK-53946][BUILD] Upgrade SBT to 1.11.7
sarutak Oct 20, 2025
57b4cd2
[SPARK-53696][PYTHON][CONNECT][SQL] Default to bytes for BinaryType i…
xianzhe-databricks Oct 20, 2025
b0327f3
[SPARK-53947][SQL] Count null in approx_top_k
yhuang-db Oct 20, 2025
1f21a8b
[SPARK-53951][BUILD] Upgrade `protobuf-java` to 4.33.0
LuciferYang Oct 20, 2025
37ee992
[SPARK-53535][SQL] Fix missing structs always being assumed as nulls
ZiyaZa Oct 20, 2025
94cccad
[SPARK-53755][CORE] Add log support in BlockManager
ivoson Oct 21, 2025
8430dbf
[SPARK-53961][SQL][TESTS] Fix `FileStreamSinkSuite` flakiness by usin…
dongjoon-hyun Oct 21, 2025
6739e4f
[MINOR][PYTHON][TESTS] Update `test_arrow_udf_output_timestamps_ltz` …
zhengruifeng Oct 21, 2025
128fb13
[SPARK-53914][BUILD][CONNECT] Add connect-client-jdbc module
pan3793 Oct 21, 2025
9f32542
[SPARK-53963][PYTHON][TESTS] Drop temporary functions in regular UDF …
zhengruifeng Oct 21, 2025
f33d8aa
[SPARK-53738][SQL] Fix planned write when query output contains folda…
pan3793 Oct 21, 2025
e963eb7
[SPARK-53965][CONNECT][SS][PYTHON] Upgrade buf plugins to `v29.5`
LuciferYang Oct 21, 2025
85bf6ec
[SPARK-53958][BUILD] Simplify Jackson deps management by using BOM
pan3793 Oct 21, 2025
ed9957b
[SPARK-53964][BUILD] Simplify Java Home finding for SBT unidoc
pan3793 Oct 21, 2025
0f42632
[SPARK-53636][CORE] Fix thread-safety issue in SortShuffleManager.unr…
Ngone51 Oct 21, 2025
a8cfe0c
[SPARK-53960][SQL] Let approx_top_k_accumulate/combine/estimate handl…
yhuang-db Oct 21, 2025
6a77ec4
[SPARK-53971][BUILD] Bump zstd-jni 1.5.7-6
pan3793 Oct 21, 2025
748de5f
[SPARK-53969][PYTHON][TESTS] Drop temporary functions in Arrow UDF tests
zhengruifeng Oct 21, 2025
e58a3db
[SPARK-53954][BUILD] Bump Avro 1.12.1
pan3793 Oct 21, 2025
41d3619
[SPARK-53921][GEO][PYTHON] Introduce GeometryType and GeographyType t…
uros-db Oct 22, 2025
3d18fe1
[SPARK-53973][AVRO] Classify errors for AvroOptions boolean casting f…
siying Oct 22, 2025
7427ff4
[SPARK-53979][PYTHON][TESTS] Drop temporary functions in Pandas UDF t…
zhengruifeng Oct 22, 2025
3109488
[SPARK-53974][BUILD] Bump Jackson 2.20.0
pan3793 Oct 22, 2025
a96e9ca
[SPARK-53968][SQL] Store decimal precision loss conf in arithmetic ex…
stefankandic Oct 22, 2025
db81309
[SPARK-53917][CONNECT] Support large local relations
khakhlyuk Oct 22, 2025
4a62f75
[SPARK-53319][SQL] Support the time type by try_make_timestamp_ltz()
uros-db Oct 22, 2025
d65ed4a
[SPARK-53687][SQL][SS][SDP] Introduce WATERMARK clause in SQL statement
HeartSaVioR Oct 22, 2025
0be5f96
[SPARK-53972][SS] Fix streaming query recentProgress regression in cl…
Oct 22, 2025
a6d17f7
[SPARK-53956][PYTHON] Support TIME in the try_make_timestamp function…
uros-db Oct 23, 2025
0e10341
[SPARK-53930][PYTHON] Support TIME in the make_timestamp function in …
uros-db Oct 23, 2025
c707f59
Add PipelineAnalysisContext message to support pipeline analysis duri…
cookiedough77 Oct 23, 2025
c70c728
[SPARK-53981][BUILD] Upgrade Netty to 4.2.7.Final
yaooqinn Oct 23, 2025
96093bd
[SPARK-53922][GEO][SQL] Introduce physical Geometry and Geography types
uros-db Oct 23, 2025
76d4718
[SPARK-53920][GEO][SQL] Introduce GeometryType and GeographyType to J…
uros-db Oct 23, 2025
fcbafc3
[SPARK-53914][BUILD][FOLLOWUP] Fix branch-4.0 daily maven test
pan3793 Oct 23, 2025
f72435a
[SPARK-53966][CORE] Add utility functions to detect JVM GCs
Oct 23, 2025
c9ae4ab
[SPARK-53999][CORE] Native KQueue Transport support on BSD/MacOS
yaooqinn Oct 23, 2025
92ac08c
[SPARK-50205][SQL][TEST] Re-enable `SparkSessionJobTaggingAndCancella…
sarutak Oct 23, 2025
2da5c62
Cleanup shuffle from fallback storage
EnricoMi May 23, 2025
3f54903
Use conf key const in doc string
EnricoMi Sep 3, 2025
d11df30
Swap configs to allow for cross-referencing key
EnricoMi Sep 3, 2025
bbff22d
Fix indentation
EnricoMi Oct 23, 2025
1c6d17b
Simplify message match logic in FallbackStorage.ask
EnricoMi Oct 24, 2025
The diff you're trying to view is too large. We only load the first 3000 changed files.
2 changes: 1 addition & 1 deletion .asf.yaml
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features
# https://github.com/apache/infrastructure-asfyaml/blob/main/README.md
---
github:
description: "Apache Spark - A unified analytics engine for large-scale data processing"
30 changes: 28 additions & 2 deletions .github/workflows/benchmark.yml
@@ -50,6 +50,11 @@ on:
description: 'Number of job splits'
required: true
default: '1'
create-commit:
type: boolean
description: 'Commit the benchmark results to the current branch'
required: true
default: false

jobs:
matrix-gen:
@@ -195,10 +200,31 @@ jobs:
# To keep the directory structure and file permissions, tar them
# See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files
echo "Preparing the benchmark results:"
tar -cvf benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}.tar `git diff --name-only` `git ls-files --others --exclude=tpcds-sf-1 --exclude=tpcds-sf-1-text --exclude-standard`
tar -cvf target/benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}.tar `git diff --name-only` `git ls-files --others --exclude=tpcds-sf-1 --exclude=tpcds-sf-1-text --exclude-standard`
- name: Create a pull request with the results
if: ${{ inputs.create-commit && success() }}
run: |
git config --local user.name "${{ github.actor }}"
git config --local user.email "${{ github.event.pusher.email || format('{0}@users.noreply.github.com', github.actor) }}"
git add -A
git commit -m "Benchmark results for ${{ inputs.class }} (JDK ${{ inputs.jdk }}, Scala ${{ inputs.scala }}, split ${{ matrix.split }} of ${{ inputs.num-splits }})"
for i in {1..5}; do
echo "Attempt $i to push..."
git fetch origin ${{ github.ref_name }}
git rebase origin/${{ github.ref_name }}
if git push origin ${{ github.ref_name }}:${{ github.ref_name }}; then
echo "Push successful."
exit 0
else
echo "Push failed, retrying in 3 seconds..."
sleep 3
fi
done
echo "Error: Failed to push after 5 attempts."
exit 1
- name: Upload benchmark results
uses: actions/upload-artifact@v4
with:
name: benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}-${{ matrix.split }}
path: benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}.tar
path: target/benchmark-results-${{ inputs.jdk }}-${{ inputs.scala }}.tar

109 changes: 88 additions & 21 deletions .github/workflows/build_and_test.yml
@@ -112,7 +112,7 @@ jobs:
ui=false
docs=false
fi
build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,profiler,protobuf,yarn,connect,sql,hive,pipelines"`
build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,utils-java,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,profiler,protobuf,yarn,connect,sql,hive,pipelines"`
precondition="
{
\"build\": \"$build\",
@@ -122,6 +122,8 @@
\"tpcds-1g\": \"$tpcds\",
\"docker-integration-tests\": \"$docker\",
\"lint\" : \"true\",
\"java17\" : \"$build\",
\"java25\" : \"$build\",
\"docs\" : \"$docs\",
\"yarn\" : \"$yarn\",
\"k8s-integration-tests\" : \"$kubernetes\",
@@ -240,7 +242,7 @@ jobs:
# Note that the modules below are from sparktestsupport/modules.py.
modules:
- >-
core, unsafe, kvstore, avro, utils,
core, unsafe, kvstore, avro, utils, utils-java,
network-common, network-shuffle, repl, launcher,
examples, sketch, variant
- >-
@@ -360,7 +362,7 @@
- name: Install Python packages (Python 3.11)
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') || contains(matrix.modules, 'yarn')
run: |
python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.1'
python3.11 -m pip install 'numpy>=1.22' pyarrow pandas pyyaml scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.29.5'
python3.11 -m pip list
# Run the tests.
- name: Run tests
@@ -512,37 +514,34 @@
pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger
- >-
pyspark-mllib, pyspark-ml, pyspark-ml-connect, pyspark-pipelines
- >-
pyspark-structured-streaming, pyspark-structured-streaming-connect
- >-
pyspark-connect
- >-
pyspark-pandas
- >-
pyspark-pandas-slow
- >-
pyspark-pandas-connect-part0
- >-
pyspark-pandas-connect-part1
- >-
pyspark-pandas-connect-part2
pyspark-pandas-connect
- >-
pyspark-pandas-connect-part3
pyspark-pandas-slow-connect
exclude:
# Always run if pyspark == 'true', even infra-image is skip (such as non-master job)
# In practice, the build will run in individual PR, but not against the individual commit
# in Apache Spark repository.
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-sql, pyspark-resource, pyspark-testing' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-core, pyspark-errors, pyspark-streaming, pyspark-logger' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-mllib, pyspark-ml, pyspark-ml-connect' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-structured-streaming, pyspark-structured-streaming-connect' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark != 'true' && 'pyspark-connect' }}
# Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job)
# In practice, the build will run in individual PR, but not against the individual commit
# in Apache Spark repository.
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect' }}
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow-connect' }}
env:
MODULES_TO_TEST: ${{ matrix.modules }}
HADOOP_PROFILE: ${{ inputs.hadoop }}
@@ -605,8 +604,9 @@ jobs:
run: |
for py in $(echo $PYTHON_TO_TEST | tr "," "\n")
do
echo $py
$py --version
$py -m pip list
echo ""
done
- name: Install Conda for pip packaging test
if: contains(matrix.modules, 'pyspark-errors')
@@ -766,7 +766,7 @@ jobs:
python-version: '3.11'
- name: Install dependencies for Python CodeGen check
run: |
python3.11 -m pip install 'black==23.12.1' 'protobuf==5.29.1' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
python3.11 -m pip install 'black==23.12.1' 'protobuf==5.29.5' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
python3.11 -m pip list
- name: Python CodeGen check for branch-3.5
if: inputs.branch == 'branch-3.5'
@@ -919,6 +919,42 @@ jobs:
- name: R linter
run: ./dev/lint-r

java17:
needs: [precondition]
if: fromJson(needs.precondition.outputs.required).java17 == 'true'
name: Java 17 build with Maven
runs-on: ubuntu-latest
timeout-minutes: 120
steps:
- uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: zulu
java-version: 17
- name: Build with Maven
run: |
export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl clean install

java25:
needs: [precondition]
if: fromJson(needs.precondition.outputs.required).java25 == 'true'
name: Java 25 build with Maven
runs-on: ubuntu-latest
timeout-minutes: 120
steps:
- uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: zulu
java-version: 25
- name: Build with Maven
run: |
export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl clean install

# Documentation build
docs:
needs: [precondition, infra-image]
@@ -998,10 +1034,14 @@ jobs:
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
python3.9 -m pip install ipython_genutils # See SPARK-38517
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly<6.0.0'
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.22' pyarrow pandas 'plotly<6.0.0'
python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
- name: List Python packages
- name: List Python packages for branch-3.5 and branch-4.0
if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0'
run: python3.9 -m pip list
- name: List Python packages
if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0'
run: python3.11 -m pip list
- name: Install dependencies for documentation generation
run: |
# Keep the version of Bundler here in sync with the following locations:
@@ -1010,7 +1050,8 @@
gem install bundler -v 2.4.22
cd docs
bundle install --retry=100
- name: Run documentation build
- name: Run documentation build for branch-3.5 and branch-4.0
if: inputs.branch == 'branch-3.5' || inputs.branch == 'branch-4.0'
run: |
# We need this link to make sure `python3` points to `python3.9` which contains the prerequisite packages.
ln -s "$(which python3.9)" "/usr/local/bin/python3"
@@ -1031,6 +1072,30 @@
echo "SKIP_SQLDOC: $SKIP_SQLDOC"
cd docs
bundle exec jekyll build
- name: Run documentation build
if: inputs.branch != 'branch-3.5' && inputs.branch != 'branch-4.0'
run: |
# We need this link to make sure `python3` points to `python3.11` which contains the prerequisite packages.
ln -s "$(which python3.11)" "/usr/local/bin/python3"
# Build docs first with SKIP_API to ensure they are buildable without requiring any
# language docs to be built beforehand.
cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd ..
if [ -f "./dev/is-changed.py" ]; then
# Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
pyspark_modules=`cd dev && python3.11 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
fi
export PYSPARK_DRIVER_PYTHON=python3.11
export PYSPARK_PYTHON=python3.11
# Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC`
echo "SKIP_ERRORDOC: $SKIP_ERRORDOC"
echo "SKIP_SCALADOC: $SKIP_SCALADOC"
echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC"
echo "SKIP_RDOC: $SKIP_RDOC"
echo "SKIP_SQLDOC: $SKIP_SQLDOC"
cd docs
bundle exec jekyll build
- name: Tar documentation
if: github.repository != 'apache/spark'
run: tar cjf site.tar.bz2 docs/_site
@@ -1259,9 +1324,9 @@ jobs:
sudo apt update
sudo apt-get install r-base
- name: Start Minikube
uses: medyagh/[email protected].19
uses: medyagh/[email protected].20
with:
kubernetes-version: "1.33.0"
kubernetes-version: "1.34.0"
# Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic
cpus: 2
memory: 6144m
@@ -1279,8 +1344,10 @@
kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true
else
elif [[ "${{ inputs.branch }}" == 'branch-4.0' ]]; then
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.11.0/installer/volcano-development.yaml || true
else
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.12.2/installer/volcano-development.yaml || true
fi
eval $(minikube docker-env)
build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
28 changes: 14 additions & 14 deletions .github/workflows/build_infra_images_cache.yml
@@ -33,13 +33,13 @@ on:
- 'dev/spark-test-image/python-minimum/Dockerfile'
- 'dev/spark-test-image/python-ps-minimum/Dockerfile'
- 'dev/spark-test-image/pypy-310/Dockerfile'
- 'dev/spark-test-image/python-309/Dockerfile'
- 'dev/spark-test-image/python-310/Dockerfile'
- 'dev/spark-test-image/python-311/Dockerfile'
- 'dev/spark-test-image/python-311-classic-only/Dockerfile'
- 'dev/spark-test-image/python-312/Dockerfile'
- 'dev/spark-test-image/python-313/Dockerfile'
- 'dev/spark-test-image/python-313-nogil/Dockerfile'
- 'dev/spark-test-image/python-314/Dockerfile'
- 'dev/spark-test-image/numpy-213/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
@@ -153,19 +153,6 @@ jobs:
- name: Image digest (PySpark with PyPy 3.10)
if: hashFiles('dev/spark-test-image/pypy-310/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_pypy_310.outputs.digest }}
- name: Build and push (PySpark with Python 3.9)
if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
id: docker_build_pyspark_python_309
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-309/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.9)
if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_309.outputs.digest }}
- name: Build and push (PySpark with Python 3.10)
if: hashFiles('dev/spark-test-image/python-310/Dockerfile') != ''
id: docker_build_pyspark_python_310
@@ -244,6 +231,19 @@ jobs:
- name: Image digest (PySpark with Python 3.13 no GIL)
if: hashFiles('dev/spark-test-image/python-313-nogil/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_313_nogil.outputs.digest }}
- name: Build and push (PySpark with Python 3.14)
if: hashFiles('dev/spark-test-image/python-314/Dockerfile') != ''
id: docker_build_pyspark_python_314
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-314/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-314-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-314-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-314-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.14)
if: hashFiles('dev/spark-test-image/python-314/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_314.outputs.digest }}
- name: Build and push (PySpark with Numpy 2.1.3)
if: hashFiles('dev/spark-test-image/numpy-213/Dockerfile') != ''
id: docker_build_pyspark_numpy_213
2 changes: 1 addition & 1 deletion .github/workflows/build_maven_java21_arm.yml
@@ -21,7 +21,7 @@ name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, ARM)"

on:
schedule:
- cron: '0 15 * * *'
- cron: '0 15 */2 * *'
workflow_dispatch:

jobs:
.github/workflows/build_maven_java21_macos15.yml
@@ -17,7 +17,7 @@
# under the License.
#

name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-15)"
name: "Build / Maven (master, Scala 2.13, Hadoop 3, JDK 21, MacOS-26)"

on:
schedule:
@@ -33,7 +33,7 @@ jobs:
if: github.repository == 'apache/spark'
with:
java: 21
os: macos-15
os: macos-26
arch: arm64
envs: >-
{
1 change: 1 addition & 0 deletions .github/workflows/build_non_ansi.yml
@@ -40,6 +40,7 @@ jobs:
"PYSPARK_IMAGE_TO_TEST": "python-311",
"PYTHON_TO_TEST": "python3.11",
"SPARK_ANSI_SQL_MODE": "false",
"SPARK_TEST_SPARK_BLOOM_FILTER_SUITE_ENABLED": "true"
}
jobs: >-
{
.github/workflows/build_python_3.14.yml
@@ -17,7 +17,7 @@
# under the License.
#

name: "Build / Python-only (master, Python 3.9)"
name: "Build / Python-only (master, Python 3.14)"

on:
schedule:
@@ -37,8 +37,8 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-309",
"PYTHON_TO_TEST": "python3.9"
"PYSPARK_IMAGE_TO_TEST": "python-314",
"PYTHON_TO_TEST": "python3.14"
}
jobs: >-
{