Skip to content

Commit 5f73e7c

Browse files
authored
feat: add spark 4.0.0 image (#1216)
* feat: add Spark 4.0.0 * update spark client image version * replace maven-help-plugin with bash-foo * explicitly tell the build script which maven binary to use * comment on the hbase connector version
1 parent e30798a commit 5f73e7c

File tree

5 files changed

+118
-28
lines changed

5 files changed

+118
-28
lines changed

spark-connect-client/versions.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,10 @@
55
"java-base": "17",
66
"python": "3.11",
77
},
8+
{
9+
"product": "4.0.0",
10+
"spark-k8s": "4.0.0",
11+
"java-base": "17",
12+
"python": "3.11",
13+
},
814
]

spark-k8s/Dockerfile

Lines changed: 54 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,17 @@ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patche
6161
COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR} /stackable/src/spark-k8s/hbase-connectors/stackable/patches/${HBASE_CONNECTOR}
6262

6363
RUN <<EOF
64+
65+
# IMPORTANT: HBase connectors don't support Spark 4 yet, so we skip the build.
66+
# Watch this PR for updates: https://github.com/apache/hbase-connectors/pull/130
67+
if [[ "${PRODUCT}" == 4* ]]; then
68+
# Create this empty directory so that following COPY layers succeed.
69+
mkdir -p /stackable/spark/jars
70+
# Create a dummy tarball to satisfy the build process for Spark 3.
71+
touch hbase-connector-${HBASE_CONNECTOR}-stackable${RELEASE}-src.tar.gz
72+
exit 0
73+
fi
74+
6475
cd "$(/stackable/patchable --images-repo-root=src checkout spark-k8s/hbase-connectors ${HBASE_CONNECTOR})/spark"
6576

6677
NEW_VERSION="${HBASE_CONNECTOR}-stackable${RELEASE}"
@@ -80,18 +91,10 @@ export JDK_JAVA_OPTIONS="\
8091
--add-opens java.base/java.util=ALL-UNNAMED"
8192

8293
# Get the Scala version used by Spark
83-
SCALA_VERSION=$( \
84-
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
85-
org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
86-
-DforceStdout \
87-
-Dexpression='project.properties(scala.version)')
94+
SCALA_VERSION=$(grep "scala.version" /stackable/spark/pom.xml | head -n1 | awk -F '[<>]' '{print $3}')
8895

8996
# Get the Scala binary version used by Spark
90-
SCALA_BINARY_VERSION=$( \
91-
mvn --quiet --non-recursive --file /stackable/spark/pom.xml \
92-
org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
93-
-DforceStdout \
94-
-Dexpression='project.properties(scala.binary.version)')
97+
SCALA_BINARY_VERSION=$(grep "scala.binary.version" /stackable/spark/pom.xml | head -n1 | awk -F '[<>]' '{print $3}')
9598

9699
# Build the Spark HBase connector
97100
# Skip the tests because the MiniHBaseCluster does not get ready for
@@ -108,6 +111,7 @@ mvn \
108111
--define hadoop-three.version="${HADOOP_VERSION}" \
109112
--define hbase.version="${HBASE}" \
110113
--define skipTests \
114+
--define maven.test.skip=true \
111115
clean package
112116

113117
mkdir -p /stackable/spark/jars
@@ -157,22 +161,36 @@ COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-source-builder \
157161
COPY --from=hadoop-builder --chown=${STACKABLE_USER_UID}:0 /stackable/patched-libs /stackable/patched-libs
158162

159163
# >>> Build spark
160-
# Compiling the tests takes a lot of time, so we skip them
161-
# -Dmaven.test.skip=true skips both the compilation and execution of tests
162-
# -DskipTests skips only the execution
163164
RUN <<EOF
164165
# Make Maven aware of custom Stackable libraries
165166
mv /stackable/patched-libs/maven /root/.m2/repository
166167

167168
ORIGINAL_VERSION="${PRODUCT}"
168169
NEW_VERSION="${PRODUCT}-stackable${RELEASE}"
169170

171+
MAVEN_BIN="/usr/bin/mvn"
170172
export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g"
171173

172-
./dev/make-distribution.sh \
173-
-Dhadoop.version="${HADOOP_VERSION}-stackable${RELEASE}" \
174-
-DskipTests \
175-
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver
174+
case "${PRODUCT}" in
175+
4*)
176+
# The Spark 4 script has a --connect option which is not available in Spark 3.
177+
# This option is required to build Spark Connect.
178+
# Also this option breaks the Spark 3 build so we ensure it's only provided here.
179+
./dev/make-distribution.sh \
180+
--mvn "${MAVEN_BIN}" \
181+
--connect \
182+
-Dhadoop.version="${HADOOP_VERSION}-stackable${RELEASE}" \
183+
-DskipTests \
184+
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver
185+
;;
186+
*)
187+
./dev/make-distribution.sh \
188+
--mvn "${MAVEN_BIN}" \
189+
-Dhadoop.version="${HADOOP_VERSION}-stackable${RELEASE}" \
190+
-DskipTests \
191+
-P'hadoop-3' -Pkubernetes -Phive -Phive-thriftserver
192+
;;
193+
esac
176194

177195
sed -i "s/${NEW_VERSION}/${ORIGINAL_VERSION}/g" assembly/target/bom.json
178196
EOF
@@ -183,22 +201,30 @@ EOF
183201
# we create a new dist/connect folder, and copy them here.
184202
RUN <<EOF
185203

186-
# Get the Scala binary version
187-
SCALA_BINARY_VERSION=$( \
188-
mvn --quiet --non-recursive --file pom.xml \
189-
org.apache.maven.plugins:maven-help-plugin:3.5.0:evaluate \
190-
-DforceStdout \
191-
-Dexpression='project.properties(scala.binary.version)')
204+
SCALA_BINARY_VERSION=$(grep "scala.binary.version" pom.xml | head -n1 | awk -F '[<>]' '{print $3}')
192205

193206
mkdir -p dist/connect
194207
cd dist/connect
195208

196-
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
197-
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
198-
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
199-
200-
# The Spark operator expects a file named spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}.jar without the -stackable${RELEASE} suffix.
209+
case "${PRODUCT}" in
210+
4*)
211+
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
212+
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
213+
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/sql/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
214+
;;
215+
*)
216+
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/server/target/spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
217+
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/common/target/spark-connect-common_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
218+
cp "/stackable/spark-${PRODUCT}-stackable${RELEASE}/connector/connect/client/jvm/target/spark-connect-client-jvm_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" .
219+
;;
220+
esac
221+
222+
# This link is needed by the operator and is kept for backwards compatibility.
223+
# TODO: remove it at some time in the future.
201224
ln -s "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}.jar"
225+
# Link to the spark-connect jar without the stackable suffix and scala version.
226+
# This link supersedes the previous link.
227+
ln -s "spark-connect_${SCALA_BINARY_VERSION}-${PRODUCT}-stackable${RELEASE}.jar" "spark-connect-${PRODUCT}.jar"
202228
EOF
203229

204230
# <<< Build spark
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
From 2da5608928018dd017c91b904eb8f84a4f6df78a Mon Sep 17 00:00:00 2001
2+
From: Razvan-Daniel Mihai <[email protected]>
3+
Date: Fri, 4 Jul 2025 15:54:55 +0200
4+
Subject: Update CycloneDX plugin
5+
6+
---
7+
dev/make-distribution.sh | 1 -
8+
pom.xml | 5 +++++
9+
2 files changed, 5 insertions(+), 1 deletion(-)
10+
11+
diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh
12+
index 16607e45ae..44e345a245 100755
13+
--- a/dev/make-distribution.sh
14+
+++ b/dev/make-distribution.sh
15+
@@ -176,7 +176,6 @@ BUILD_COMMAND=("$MVN" clean package \
16+
-Dmaven.javadoc.skip=true \
17+
-Dmaven.scaladoc.skip=true \
18+
-Dmaven.source.skip \
19+
- -Dcyclonedx.skip=true \
20+
$@)
21+
22+
# Actually build the jar
23+
diff --git a/pom.xml b/pom.xml
24+
index 443d46a430..632920f100 100644
25+
--- a/pom.xml
26+
+++ b/pom.xml
27+
@@ -3327,6 +3327,11 @@
28+
<groupId>org.cyclonedx</groupId>
29+
<artifactId>cyclonedx-maven-plugin</artifactId>
30+
<version>2.8.0</version>
31+
+ <configuration>
32+
+ <projectType>application</projectType>
33+
+ <schemaVersion>1.5</schemaVersion>
34+
+ <skipNotDeployed>false</skipNotDeployed>
35+
+ </configuration>
36+
<executions>
37+
<execution>
38+
<phase>package</phase>
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
base = "fa33ea000a0bda9e5a3fa1af98e8e85b8cc5e4d4"
2+
mirror = "https://github.com/stackabletech/spark.git"

spark-k8s/versions.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,22 @@
3535
"tini": "0.19.0",
3636
"hbase_connector": "1.0.1",
3737
},
38+
{
39+
"product": "4.0.0",
40+
"java-base": "17",
41+
"java-devel": "17",
42+
"python": "3.11",
43+
"hadoop/hadoop": "3.4.1",
44+
"hbase": "2.6.2",
45+
"aws_java_sdk_bundle": "2.24.6",
46+
"azure_storage": "7.0.1", # https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-azure/3.3.4
47+
"azure_keyvault_core": "1.0.0", # https://mvnrepository.com/artifact/com.microsoft.azure/azure-storage/7.0.1
48+
"jackson_dataformat_xml": "2.15.2", # https://mvnrepository.com/artifact/org.apache.spark/spark-core_2.13/3.5.1
49+
"stax2_api": "4.2.1", # https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-xml/2.15.2
50+
"woodstox_core": "6.5.1", # https://mvnrepository.com/artifact/com.fasterxml.jackson.dataformat/jackson-dataformat-xml/2.15.2
51+
"vector": "0.47.0",
52+
"jmx_exporter": "1.3.0",
53+
"tini": "0.19.0",
54+
"hbase_connector": "1.0.1", # This is not supported in Spark 4 yet.
55+
},
3856
]

0 commit comments

Comments (0)