Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
0543b77
SRE-3704 ci: CI-Test-FI
grom72 Apr 9, 2026
7055023
Run NLT on Docker
grom72 Apr 13, 2026
2b73285
Run NLT w/o fault injection
grom72 Apr 13, 2026
6cda785
FI one by one
grom72 Apr 14, 2026
1cae67d
Remove temporary obsolete stages
grom72 Apr 15, 2026
2b56e8c
Run fi tests on NLT node
grom72 Apr 15, 2026
0783da7
Remove UNNEDED stage
grom72 Apr 15, 2026
b387e0d
Skip build for fault injection
grom72 Apr 15, 2026
bfb7a3f
small fix
grom72 Apr 15, 2026
7e2e739
more tests in parallel
grom72 Apr 16, 2026
3eb5ade
everything should be in place
grom72 Apr 16, 2026
3df423a
Start from the beginning
grom72 Apr 16, 2026
c4f271e
Restart from begining
grom72 Apr 16, 2026
f3f0759
Signed-off-by: Tomasz Gromadzki <tomasz.gromadzki@hpe.com>
grom72 Apr 16, 2026
82fedf1
Install daos-client-tests to enable FI
grom72 Apr 20, 2026
8f3f953
Fix for previous step
grom72 Apr 20, 2026
a183da3
Provide more parameters for NLT w FI test
grom72 Apr 20, 2026
4f1eb79
Temporary ignore tests script result
grom72 Apr 20, 2026
55888e4
Increase test timeout
grom72 Apr 20, 2026
21989d2
Trigger FI build in target stage
grom72 Apr 21, 2026
c495c3e
Fix parameters
grom72 Apr 21, 2026
074fe91
Fix NLT tests to use proper log file name
grom72 Apr 21, 2026
c58023d
Fix agent name
grom72 Apr 21, 2026
b4e1ee7
We need test+nlt_post.sh script to collect logs
grom72 Apr 22, 2026
4e0f754
Compare NLT and FI stages
grom72 Apr 22, 2026
b57a68a
Use custom library
grom72 Apr 28, 2026
f4b2ad5
ci: fix nlt-errors.json pattern path in NLT Fault injection post block
grom72 Apr 29, 2026
f72d336
ci: remove duplicate junit step from NLT Fault injection post block
grom72 Apr 29, 2026
39e28f6
ci: use nlt_name to label Fault injection issues in Jenkins UI
grom72 Apr 29, 2026
df82b57
ci: fetch nlt_logs from build/ when --no-root mode is used
grom72 Apr 29, 2026
e932b1c
Fault injection tests withot --no-root option
grom72 Apr 29, 2026
422f625
Restore original code
grom72 Apr 30, 2026
7d1d10b
Merge remote-tracking branch 'origin/master' into grom72/SRE-3704-CI-…
grom72 Apr 30, 2026
4d18b9a
Jenkinsfile: simplify NLT fault injection recordIssues call
grom72 Apr 30, 2026
d272552
Run NLT on VMs
grom72 May 4, 2026
f5ec432
NLT takes more than 60 minutes
grom72 May 4, 2026
61e629b
FI with --system-ram-reserved 16
grom72 May 5, 2026
308d60b
NLT & FI with --system-ram-reserved 4
grom72 May 5, 2026
2323fd9
Ubuntu docker w/o proxy
grom72 May 5, 2026
7d10c3f
Final prototype tune
grom72 May 5, 2026
01993a7
Restore Valgrind stash
grom72 May 5, 2026
fe8be03
Remove obsolete code
grom72 May 5, 2026
8d788dc
Revert "Ubuntu docker w/o proxy"
grom72 May 5, 2026
13c5235
Fix stage name
grom72 May 5, 2026
e0fd4e3
Revert "DAOS-623 test: add allowed error for FI (#17959)"
grom72 May 5, 2026
042de8c
Move FI test to original group
grom72 May 6, 2026
55294f2
Disable maldet fo CI nodes
grom72 May 6, 2026
bdd0209
Use tmpfs for NLT test logs
grom72 May 6, 2026
07088ac
Move all temporary files to tmpfs under ./nlt_logs folder
grom72 May 6, 2026
dd9c9c0
nlt: remove ABT_STACK_OVERFLOW_CHECK=mprotect from nlt_server.yaml
grom72 May 7, 2026
302965e
Trigger build with 20 CPU cores per VM
grom72 May 8, 2026
adcac00
Revert "nlt: remove ABT_STACK_OVERFLOW_CHECK=mprotect from nlt_server…
grom72 May 8, 2026
df9c31d
Reapply "nlt: remove ABT_STACK_OVERFLOW_CHECK=mprotect from nlt_serve…
grom72 May 8, 2026
6cea7a1
Merge remote-tracking branch 'origin/master' into grom72/SRE-3704-CI-…
grom72 May 8, 2026
1a346d1
Parameters adjustment
grom72 May 8, 2026
885881e
fault_status falback only based on PATH
grom72 May 8, 2026
1341223
FIx typo
grom72 May 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 25 additions & 77 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
// To use a test branch (i.e. PR) until it lands to master
// I.e. for testing library changes
//@Library(value='pipeline-lib@your_branch') _
@Library(value=['pipeline-lib@grom72/SRE-3704','system-pipeline-lib@grom72/SRE-3704']) _

/* groovylint-disable-next-line CompileStatic */
job_status_internal = [:]
Expand Down Expand Up @@ -47,42 +48,6 @@ void job_step_update(def value=currentBuild.currentResult) {
jobStatusUpdate(job_status_internal, env.STAGE_NAME, value)
}

// Run the NLT fault-injection test through ./ci/docker_nlt.sh, check any
// Valgrind memcheck XML output for errors, and report the elapsed time.
//
// Returns a Map with a single key, 'nlttest_time', holding the value
// computed by durationSeconds() from the start timestamp taken on entry
// (presumably the run time in seconds -- confirm against the helper).
Map nlt_test() {
// groovylint-disable-next-line NoJavaUtilDate
Date startDate = new Date()
// The 'nltr' stash is optional: if it cannot be unstashed, a message is
// printed and the fault-injection run continues without it.
try {
unstash('nltr')
} catch (e) {
print 'Unstash failed, results from NLT stage will not be included'
}
sh label: 'Fault injection testing using NLT',
script: './ci/docker_nlt.sh --class-name fault-injection fi'
// Gather every Valgrind memcheck report found in the workspace root.
List filesList = []
filesList.addAll(findFiles(glob: '*.memcheck.xml'))
int vgfail = 0
// vgerr is never modified below, so the 'errors' count reported is always 0.
int vgerr = 0
if (filesList) {
// Collect all <error> elements from the memcheck files. The trailing
// '|| true' keeps the step from failing when grep finds no match.
String rcs = sh label: 'Check for Valgrind errors',
script: "grep -E '<error( |>)' ${filesList.join(' ')} || true",
returnStdout: true
// Any grep output at all is reported as a single Valgrind failure.
if (rcs) {
vgfail = 1
}
// Publish a simple JUnit-style result named after the sanitized stage
// name; the matched grep output is passed along as the test data.
String suite = sanitizedStageName()
junitSimpleReport suite: suite,
file: suite + '_valgrind_results.xml',
fails: vgfail,
errors: vgerr,
name: 'Valgrind_Memcheck',
class: 'Valgrind',
message: 'Valgrind Memcheck error detected',
testdata: rcs
}
int runTime = durationSeconds(startDate)
Map runData = ['nlttest_time': runTime]
return runData
}

// For master, this is just some wildly high number
String next_version() {
return '1000'
Expand Down Expand Up @@ -391,8 +356,11 @@ pipeline {
defaultValue: 'ci_vm9',
description: 'Label to use for 9 VM functional tests')
string(name: 'CI_NLT_1_LABEL',
defaultValue: 'ci_nlt_1',
defaultValue: 'ci_nlt_vm1',
description: 'Label to use for NLT tests')
string(name: 'CI_FI_1_LABEL',
defaultValue: 'ci_fi_vm1',
description: 'Label to use for Fault Injection (FI) tests')
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_LABEL',
defaultValue: 'ci_nvme5',
description: 'Label to use for the Functional Hardware Medium (MD on SSD) stages')
Expand Down Expand Up @@ -978,63 +946,44 @@ pipeline {
}
} // post
} // stage('Functional on Ubuntu 20.04')
stage('Fault injection testing') {
stage('NLT Fault Injection testing') {
when {
beforeAgent true
expression { !skipStage() }
}
agent {
dockerfile {
filename 'utils/docker/Dockerfile.el.9'
label 'docker_runner_fi'
additionalBuildArgs dockerBuildArgs(repo_type: 'stable',
parallel_build: true,
deps_build: true) +
' --build-arg POINT_RELEASE=.7 '
args '--tmpfs /mnt/daos_0'
}
label params.CI_FI_1_LABEL
}
steps {
/* job_step_update(nlt_test()) */
job_step_update(
sconsBuild(parallel_build: true,
scons_args: 'PREFIX=/opt/daos TARGET_TYPE=release BUILD_TYPE=debug',
build_deps: 'no'))
job_step_update(nlt_test())
// recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltr.xml']],
// skipPublishingChecks: true,
// id: 'fir', name: 'Fault Injection Report')
unitTest(timeout_time: 240,
inst_repos: daosRepos(),
test_script: 'ci/unit/test_nlt.sh --memcheck no' +
' --system-ram-reserved 4 --server-debug WARN' +
' --log-usage-import nltr.json' +
' --log-usage-save nltr.xml' +
' --class-name fault-injection fi',
unstash_opt: true,
unstash_tests: false,
inst_rpms: unitPackages(target: 'el9') + ' daos-client-tests',
image_version: 'el9.7'))
}
post {
always {
unitTestPost artifacts: ['nlt_logs/'],
testResults: 'nlt-junit.xml',
always_script: 'ci/unit/test_nlt_post.sh',
valgrind_stash: 'fault-inject-valgrind'
discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master',
scm: 'daos-stack/daos',
requiredResult: hudson.model.Result.UNSTABLE
recordIssues enabledForFailure: true,
/* ignore warning/errors from PMDK logging system */
filters: [excludeFile('pmdk/.+')],
failOnError: false,
ignoreQualityGate: true,
qualityGates: [[threshold: 1, type: 'TOTAL_ERROR'],
[threshold: 1, type: 'TOTAL_HIGH'],
[threshold: 1, type: 'NEW_NORMAL', unstable: true],
[threshold: 1, type: 'NEW_LOW', unstable: true]],
tools: [issues(pattern: 'nlt-errors.json',
name: 'Fault injection issues',
id: 'Fault_Injection'),
issues(pattern: 'nlt-client-leaks.json',
name: 'Fault injection leaks',
id: 'NLT_client')],
scm: 'daos-stack/daos'
junit testResults: 'nlt-junit.xml'
stash name: 'fault-inject-valgrind',
includes: '*.memcheck.xml',
allowEmpty: true
archiveArtifacts artifacts: 'nlt_logs/fault-injection/',
allowEmptyArchive: true
job_status_update()
}
}
} // stage('Fault injection testing')
} // stage('NLT Fault injection testing')
stage('Test RPMs on EL 9.6') {
when {
beforeAgent true
Expand Down Expand Up @@ -1255,8 +1204,7 @@ pipeline {
post {
always {
valgrindReportPublish valgrind_stashes: ['nlt-memcheck',
'unit-memcheck',
'fault-inject-valgrind']
'unit-memcheck']
job_status_update('final_status')
jobStatusWrite(job_status_internal)
}
Expand Down
42 changes: 0 additions & 42 deletions ci/docker_nlt.sh

This file was deleted.

6 changes: 6 additions & 0 deletions ci/provisioning/post_provision_config_common_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,12 @@ post_provision_config_nodes() {
dnf -y erase fuse3\*
fi

# maldet brings additional load on CPU during tests (e.g., NLT tests)
if command -v maldet &>/dev/null; then
systemctl stop maldet 2>/dev/null || true
systemctl disable maldet 2>/dev/null || true
fi

if [ -n "$CONFIG_POWER_ONLY" ]; then
rm -f "$REPOS_DIR"/*_job_daos-stack_job_*_job_*.repo
time dnf -y erase fio fuse ior-hpc mpich-autoload \
Expand Down
8 changes: 4 additions & 4 deletions ci/unit/test_nlt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ mydir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
# Copy over the install tree and some of the build tree.
rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* opt-daos.tar utils requirements-utest.txt jenkins@"$NODE":build/

# shellcheck disable=SC2029
ssh -tt "$SSH_KEY_ARGS" jenkins@"$NODE" "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \
DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \
$(cat "$mydir/test_nlt_node.sh")"
ssh -T "$SSH_KEY_ARGS" jenkins@"$NODE" \
"DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \
DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \
bash -s -- $*" < "$mydir/test_nlt_node.sh"
21 changes: 17 additions & 4 deletions ci/unit/test_nlt_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
set -uex

sudo bash -c 'echo 1 > /proc/sys/kernel/sysrq'
sudo mkdir -p /mnt/daos
# using mmap()'ed ULT stacks requires to bump system default
if [ "$(sudo sysctl -n vm.max_map_count)" -lt "1000000" ] ; then
sudo sysctl vm.max_map_count=1000000
Expand Down Expand Up @@ -45,7 +44,21 @@ pip install /opt/daos/lib/daos/python/
sudo prlimit --nofile=1024:262144 --pid $$
prlimit -n

HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \
if [ "$#" -eq 0 ]; then
set -- --max-log-size 1950MiB \
--class-name nlt \
--system-ram-reserved 4 \
--dfuse-dir /localhome/jenkins/ \
--log-usage-save nltir.xml \
--log-usage-export nltr.json all
fi

mkdir -p nlt_logs
sudo mount -t tmpfs tmpfs nlt_logs
sudo chown jenkins:jenkins nlt_logs

exec env \
TMPDIR="$(pwd)/nlt_logs" \
HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \
NO_PROXY="${DAOS_NO_PROXY:-}" \
./utils/node_local_test.py --max-log-size 1950MiB \
--dfuse-dir /localhome/jenkins/ --log-usage-save nltir.xml --log-usage-export nltr.json all
./utils/node_local_test.py "$@"
10 changes: 9 additions & 1 deletion ci/unit/test_nlt_post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,19 @@ mkdir nlt_logs
rsync -v -dprt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":/tmp/ \
--filter="include dnt*.log" --filter="include dnt*.log.bz2" \
--filter="include dnt_fi_*_logs" \
--filter="exclude *" nlt_logs/
--filter="exclude *" nlt_logs/ || true

# When running with --no-root, DAOS logs go to build/nlt_logs/ on the node
# instead of /tmp/, so fetch them from there as well.
rsync -v -rlpt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/nlt_logs/ \
--filter="include dnt*.log" --filter="include dnt*.log.bz2" \
--filter="include dnt_fi_*_logs" --filter="include */" \
--filter="exclude *" nlt_logs/ || true

rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \
--filter="include nlt*.json" --filter="include dnt*.xml" \
--filter="include nltir.xml" --filter="include nltr.json" \
--filter="include nlt-junit.xml" --filter="exclude *" ./

mkdir -p vm_test
mv nlt-errors.json vm_test/
5 changes: 0 additions & 5 deletions src/tests/ftest/cart/util/cart_logtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,6 @@ def __init__(self, log_iter, quiet=False):
self.fi_triggered = False
self.fi_location = None
self.skip_suffixes = []
self.skip_substrings = []
self._tracers = []
self.ftest_mode = False

Expand Down Expand Up @@ -444,10 +443,6 @@ def _check_pid_from_log_file(self, pid, abort_on_warning, leak_wf, show_memleaks
show = False
if show and any(map(line.get_msg().endswith, self.skip_suffixes)):
show = False
if show:
line_msg = line.get_msg().casefold()
if any(sub in line_msg for sub in self.skip_substrings):
show = False
if show:
# Allow WARNING or ERROR messages, but anything higher like assert should
# trigger a failure.
Expand Down
1 change: 0 additions & 1 deletion utils/nlt_server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ engines:
- DAOS_MD_CAP=1024
- DAOS_STRICT_SHUTDOWN=1
- DAOS_TARGET_OVERSUBSCRIBE=1
- ABT_STACK_OVERFLOW_CHECK=mprotect
storage:
-
class: ram
Expand Down
15 changes: 8 additions & 7 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5025,13 +5025,6 @@ def sizeof_fmt(num, suffix='B'):
if ignore_busy:
lto.skip_suffixes.append(" DER_BUSY(-1012): 'Device or resource busy'")

lto.skip_substrings.extend([
'sluggish ec boundary report from rank',
'sluggish stable epoch reporting',
'progress callback was not called for too long',
'rpc failed; rc:',
])

try:
lto.check_log_file(abort_on_warning=True,
show_memleaks=show_memleaks,
Expand Down Expand Up @@ -6757,6 +6750,14 @@ def run(wf, args):
run_fi = True
else:
print("Unable to detect fault injection feature, skipping testing")
print("Use fallback on $PATH")
fs = subprocess.run(['fault_status'], check=False)
print(fs)
if fs.returncode == 0:
run_fi = True
else:
print("Unable to detect fault injection feature - fall back does not work, "
"skipping testing")

if run_fi:
args.server_debug = 'INFO'
Expand Down
Loading