From c5f8a0ee33f825ea17f2fb179a58f26ff241e53c Mon Sep 17 00:00:00 2001 From: sezruby Date: Tue, 2 Jun 2026 10:07:30 -0700 Subject: [PATCH 1/2] [CORE] Fix arrow.c shading: exclude memory/vector packages so public API stays unshaded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bundled Arrow C-Data classes (org.apache.arrow.c.*) are correctly excluded from relocation because their native JNI binds to the original class names. However, their public API signatures take and return org.apache.arrow.memory.* and org.apache.arrow.vector.* types, which were being relocated to org.apache.gluten.shaded.*. The result: bundled ArrowArrayStream/ArrowSchema/ArrowArray/Data classes are compiled against the shaded BufferAllocator/VectorSchemaRoot, so any caller passing a vanilla Apache Arrow allocator gets NoSuchMethodError. Triggered for any Spark workload that combines gluten with another library using Arrow C-Data (Iceberg's Arrow vector layer, Lance Java's writer, Snowflake JDBC's Arrow result decoder, etc.) when gluten's bundle wins classloader resolution against vanilla Arrow. Fix: extend the relocation excludes to also keep org.apache.arrow.memory.** and org.apache.arrow.vector.** unshaded. The bundled C-Data API now matches the public Apache Arrow API. Adds dev/check-arrow-c-shading.sh which runs javap on the produced bundle jar and asserts that public method signatures reference unshaded Arrow types. Wired into package/pom.xml's verify phase via exec-maven-plugin so regressions are caught in CI. Tested against the upstream gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.6.0.jar — script exits 1 with a clear diagnosis on the broken bundle. 
Closes #12225
---
 dev/check-arrow-c-shading.sh | 117 +++++++++++++++++++++++++++++++++++
 package/pom.xml              |  43 ++++++++++++-
 2 files changed, 159 insertions(+), 1 deletion(-)
 create mode 100755 dev/check-arrow-c-shading.sh

diff --git a/dev/check-arrow-c-shading.sh b/dev/check-arrow-c-shading.sh
new file mode 100755
index 00000000000..620407603e9
--- /dev/null
+++ b/dev/check-arrow-c-shading.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Verify the bundled gluten-velox jar's Arrow C-Data classes have method
+# signatures referencing the *unshaded* org.apache.arrow.memory.BufferAllocator
+# and org.apache.arrow.vector.* types — not the gluten-shaded copies.
+#
+# Background: org.apache.arrow.c.* must NOT be relocated (its native JNI binds
+# to the original class names), but its public API methods accept/return
+# org.apache.arrow.memory.* and org.apache.arrow.vector.* types. Those types
+# must therefore also stay unshaded in the bundle, otherwise the bundled
+# ArrowArrayStream/ArrowSchema get re-bound to the shaded BufferAllocator at
+# compile time and any caller passing a vanilla Apache Arrow allocator hits
+# `NoSuchMethodError`. See gluten#12225.
+#
+# Usage:
+#   dev/check-arrow-c-shading.sh <bundle-jar>
+#
+# Exit codes:
+#   0 — bundle is well-shaded (Arrow C-Data API uses public Apache Arrow types),
+#       or none of the checked classes is in the bundle (a warning is printed)
+#   1 — bundle is broken (Arrow C-Data API references gluten-shaded types)
+#   2 — usage / setup error (missing argument, jar or tool; javap failure)
+
+set -euo pipefail
+
+# Report a missing argument ourselves: `${1:?...}` would make bash exit with
+# status 1, which this script reserves for "bundle is broken".
+JAR="${1:-}"
+if [[ -z "$JAR" ]]; then
+  echo "usage: $0 <bundle-jar>" >&2
+  exit 2
+fi
+if [[ ! -f "$JAR" ]]; then
+  echo "error: jar not found: $JAR" >&2
+  exit 2
+fi
+
+# Both tools are required. Without unzip every class would be reported as
+# "not in bundle" and the check would pass without inspecting anything.
+for tool in unzip javap; do
+  if ! command -v "$tool" >/dev/null; then
+    echo "error: $tool not found on PATH" >&2
+    exit 2
+  fi
+done
+
+WORKDIR=$(mktemp -d)
+trap 'rm -rf "$WORKDIR"' EXIT
+
+# Classes whose public API touches the unshaded boundary.
+CLASSES=(
+  "org/apache/arrow/c/ArrowArrayStream"
+  "org/apache/arrow/c/ArrowSchema"
+  "org/apache/arrow/c/ArrowArray"
+  "org/apache/arrow/c/Data"
+)
+
+# Any member signature mentioning the gluten-shaded Arrow path is the bug.
+SHADED_RE='org\.apache\.gluten\.shaded\.org\.apache\.arrow\.(memory|vector)\.'
+
+checked=0
+failures=0
+for cls in "${CLASSES[@]}"; do
+  class_file="$WORKDIR/${cls##*/}.class"
+  if ! unzip -p "$JAR" "${cls}.class" > "$class_file" 2>/dev/null; then
+    echo " SKIP $cls (not in bundle)"
+    continue
+  fi
+  # A javap failure must not be mistaken for a clean class: its empty output
+  # contains no shaded names and would otherwise be reported as OK.
+  if ! signatures=$(javap -p "$class_file" 2>"$WORKDIR/javap.err"); then
+    echo "error: javap failed on $cls:" >&2
+    cat "$WORKDIR/javap.err" >&2
+    exit 2
+  fi
+  checked=$((checked + 1))
+  bad=$(grep -E "$SHADED_RE" <<<"$signatures" || true)
+  if [[ -n "$bad" ]]; then
+    echo " FAIL $cls — public API references gluten-shaded Arrow types:"
+    sed 's/^/ /' <<<"$bad"
+    failures=$((failures + 1))
+  else
+    echo " OK $cls"
+  fi
+done
+
+if (( failures > 0 )); then
+  echo
+  echo "Bundle has $failures Arrow C-Data class(es) with shaded API types."
+  echo "See gluten#12225 for context. Update package/pom.xml's"
+  echo "org.apache.arrow <relocation> <excludes> to also exclude"
+  echo "org.apache.arrow.memory.** and org.apache.arrow.vector.**."
+  exit 1
+fi
+
+if (( checked == 0 )); then
+  # Nothing was inspected, so do not claim the bundle was verified.
+  echo "warning: no Arrow C-Data classes found in $JAR; nothing verified" >&2
+  exit 0
+fi
+
+echo
+echo "All Arrow C-Data classes use unshaded public Apache Arrow API. ✓"
diff --git a/package/pom.xml b/package/pom.xml index 55dec68cdd4..cfde0eb5fd8 100644 --- a/package/pom.xml +++ b/package/pom.xml @@ -121,10 +121,22 @@ org.apache.arrow ${gluten.shade.packageName}.org.apache.arrow - + org.apache.arrow.c.* org.apache.arrow.c.jni.* + org.apache.arrow.memory.** + org.apache.arrow.vector.** org.apache.arrow.dataset.** @@ -287,6 +299,35 @@ + + org.codehaus.mojo + exec-maven-plugin + + + + verify-arrow-c-shading + verify + + exec + + + ${project.basedir}/../dev/check-arrow-c-shading.sh + + ${project.build.directory}/${project.build.finalName}.jar + + + + + From 43a5b6fd7b8287387925f819048f97588ba532c2 Mon Sep 17 00:00:00 2001 From: sezruby Date: Tue, 2 Jun 2026 13:12:48 -0700 Subject: [PATCH 2/2] =?UTF-8?q?fixup:=20spotless=20=E2=80=94=20execution?= =?UTF-8?q?=20element=20order=20is=20goals=20before=20phase?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/pom.xml b/package/pom.xml index cfde0eb5fd8..709170a50fc 100644 --- a/package/pom.xml +++ b/package/pom.xml @@ -315,10 +315,10 @@ (gluten#12225). --> verify-arrow-c-shading - verify exec + verify ${project.basedir}/../dev/check-arrow-c-shading.sh