diff --git a/Jenkinsfile b/Jenkinsfile index 55a3418f689..2e645d05533 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -19,6 +19,7 @@ // To use a test branch (i.e. PR) until it lands to master // I.e. for testing library changes //@Library(value='pipeline-lib@your_branch') _ +@Library(value=['pipeline-lib@grom72/SRE-3704','system-pipeline-lib@grom72/SRE-3704']) _ /* groovylint-disable-next-line CompileStatic */ job_status_internal = [:] @@ -47,42 +48,6 @@ void job_step_update(def value=currentBuild.currentResult) { jobStatusUpdate(job_status_internal, env.STAGE_NAME, value) } -Map nlt_test() { - // groovylint-disable-next-line NoJavaUtilDate - Date startDate = new Date() - try { - unstash('nltr') - } catch (e) { - print 'Unstash failed, results from NLT stage will not be included' - } - sh label: 'Fault injection testing using NLT', - script: './ci/docker_nlt.sh --class-name fault-injection fi' - List filesList = [] - filesList.addAll(findFiles(glob: '*.memcheck.xml')) - int vgfail = 0 - int vgerr = 0 - if (filesList) { - String rcs = sh label: 'Check for Valgrind errors', - script: "grep -E ')' ${filesList.join(' ')} || true", - returnStdout: true - if (rcs) { - vgfail = 1 - } - String suite = sanitizedStageName() - junitSimpleReport suite: suite, - file: suite + '_valgrind_results.xml', - fails: vgfail, - errors: vgerr, - name: 'Valgrind_Memcheck', - class: 'Valgrind', - message: 'Valgrind Memcheck error detected', - testdata: rcs - } - int runTime = durationSeconds(startDate) - Map runData = ['nlttest_time': runTime] - return runData -} - // For master, this is just some wildly high number String next_version() { return '1000' @@ -351,6 +316,9 @@ pipeline { booleanParam(name: 'CI_FUNCTIONAL_leap15_TEST', defaultValue: false, description: 'Run the Functional on Leap 15 test stage') + booleanParam(name: 'CI_FUNCTIONAL_sles15_TEST', + defaultValue: false, + description: 'Run the Functional on SLES 15 test stage') booleanParam(name: 'CI_FUNCTIONAL_ubuntu20_TEST', 
defaultValue: false, description: 'Run the Functional on Ubuntu 20.04 test stage') @@ -391,8 +359,11 @@ pipeline { defaultValue: 'ci_vm9', description: 'Label to use for 9 VM functional tests') string(name: 'CI_NLT_1_LABEL', - defaultValue: 'ci_nlt_1', + defaultValue: 'ci_nlt_vm1', description: 'Label to use for NLT tests') + string(name: 'CI_FI_1_LABEL', + defaultValue: 'ci_fi_vm1', + description: 'Label to use for Fault Injection (FI) tests') string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_LABEL', defaultValue: 'ci_nvme5', description: 'Label to use for the Functional Hardware Medium (MD on SSD) stages') @@ -934,7 +905,8 @@ pipeline { stage('Functional on SLES 15') { when { beforeAgent true - expression { !skipStage() } + /* expression { !skipStage() } until stage works as expected */ + expression { false } } agent { label vm9_label('Leap15') @@ -978,63 +950,44 @@ pipeline { } } // post } // stage('Functional on Ubuntu 20.04') - stage('Fault injection testing') { + stage('NLT Fault Injection testing') { when { beforeAgent true expression { !skipStage() } } agent { - dockerfile { - filename 'utils/docker/Dockerfile.el.9' - label 'docker_runner_fi' - additionalBuildArgs dockerBuildArgs(repo_type: 'stable', - parallel_build: true, - deps_build: true) + - ' --build-arg POINT_RELEASE=.7 ' - args '--tmpfs /mnt/daos_0' - } + label params.CI_FI_1_LABEL } steps { + /* job_step_update(nlt_test()) */ job_step_update( - sconsBuild(parallel_build: true, - scons_args: 'PREFIX=/opt/daos TARGET_TYPE=release BUILD_TYPE=debug', - build_deps: 'no')) - job_step_update(nlt_test()) - // recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltr.xml']], - // skipPublishingChecks: true, - // id: 'fir', name: 'Fault Injection Report') + unitTest(timeout_time: 240, + inst_repos: daosRepos(), + test_script: 'ci/unit/test_nlt.sh --memcheck no' + + ' --system-ram-reserved 4 --server-debug WARN' + + ' --log-usage-import nltr.json' + + ' --log-usage-save nltr.xml' + + ' --class-name fault-injection 
fi', + unstash_opt: true, + unstash_tests: false, + inst_rpms: unitPackages(target: 'el9') + ' daos-client-tests', + image_version: 'el9.7')) } post { always { + unitTestPost artifacts: ['nlt_logs/'], + testResults: 'nlt-junit.xml', + always_script: 'ci/unit/test_nlt_post.sh', + valgrind_stash: 'fault-inject-valgrind' discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', scm: 'daos-stack/daos', requiredResult: hudson.model.Result.UNSTABLE - recordIssues enabledForFailure: true, - /* ignore warning/errors from PMDK logging system */ - filters: [excludeFile('pmdk/.+')], - failOnError: false, - ignoreQualityGate: true, - qualityGates: [[threshold: 1, type: 'TOTAL_ERROR'], - [threshold: 1, type: 'TOTAL_HIGH'], - [threshold: 1, type: 'NEW_NORMAL', unstable: true], - [threshold: 1, type: 'NEW_LOW', unstable: true]], - tools: [issues(pattern: 'nlt-errors.json', - name: 'Fault injection issues', - id: 'Fault_Injection'), - issues(pattern: 'nlt-client-leaks.json', - name: 'Fault injection leaks', - id: 'NLT_client')], - scm: 'daos-stack/daos' - junit testResults: 'nlt-junit.xml' - stash name: 'fault-inject-valgrind', - includes: '*.memcheck.xml', - allowEmpty: true archiveArtifacts artifacts: 'nlt_logs/fault-injection/', allowEmptyArchive: true job_status_update() } } - } // stage('Fault injection testing') + } // stage('NLT Fault Injection testing') stage('Test RPMs on EL 9.6') { when { beforeAgent true @@ -1255,8 +1208,7 @@ pipeline { post { always { valgrindReportPublish valgrind_stashes: ['nlt-memcheck', - 'unit-memcheck', - 'fault-inject-valgrind'] + 'unit-memcheck'] job_status_update('final_status') jobStatusWrite(job_status_internal) } diff --git a/ci/docker_nlt.sh b/ci/docker_nlt.sh deleted file mode 100755 index a6d85eba771..00000000000 --- a/ci/docker_nlt.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# Script for running NLT in a docker container.
This is called from Jenkinsfile -# where needed, and is a cheat way of running setup_daos_server_helper under sudo -# and NLT itself from a single script. - -set -e - -set -x - -. utils/sl/setup_local.sh - -ps auwx -sudo --preserve-env=SL_PREFIX,SL_SPDK_PREFIX ./utils/setup_daos_server_helper.sh - -TMP_DIR=$(mktemp -d) - -cp utils/node_local_test.py utils/nlt_server.yaml .build_vars.json "$TMP_DIR" -cp src/tests/ftest/cart/util/cart_logparse.py src/tests/ftest/cart/util/cart_logtest.py "$TMP_DIR" -if [ -e nltr.json ] -then - cp nltr.json "$TMP_DIR" -fi - -pushd "$TMP_DIR" - -set +e - -sudo --preserve-env=VIRTUAL_ENV,PATH ./node_local_test.py \ - --no-root --memcheck no --system-ram-reserved 48 --server-debug WARN \ - --log-usage-import nltr.json --log-usage-save nltr.xml "$@" - -RC=$? -set -e -popd - -cp "$TMP_DIR"/*.json . -cp "$TMP_DIR"/*.xml . -sudo chmod -R o+r "$TMP_DIR"/nlt_logs -cp -r "$TMP_DIR"/nlt_logs . - -exit $RC diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index 46fba4b21c2..68dd4d3da7f 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -321,6 +321,12 @@ post_provision_config_nodes() { dnf -y erase fuse3\* fi + # maldet brings additional load on CPU during tests (e.g., NLT tests) + if command -v maldet &>/dev/null; then + systemctl stop maldet 2>/dev/null || true + systemctl disable maldet 2>/dev/null || true + fi + if [ -n "$CONFIG_POWER_ONLY" ]; then rm -f "$REPOS_DIR"/*_job_daos-stack_job_*_job_*.repo time dnf -y erase fio fuse ior-hpc mpich-autoload \ diff --git a/ci/unit/test_nlt.sh b/ci/unit/test_nlt.sh index b8176aca873..23e3bc8b549 100755 --- a/ci/unit/test_nlt.sh +++ b/ci/unit/test_nlt.sh @@ -13,7 +13,7 @@ mydir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" # Copy over the install tree and some of the build tree.
rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* opt-daos.tar utils requirements-utest.txt jenkins@"$NODE":build/ -# shellcheck disable=SC2029 -ssh -tt "$SSH_KEY_ARGS" jenkins@"$NODE" "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ - DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \ - $(cat "$mydir/test_nlt_node.sh")" +ssh -T "$SSH_KEY_ARGS" jenkins@"$NODE" \ + "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ + DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \ + bash -s -- $*" < "$mydir/test_nlt_node.sh" diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 62a734f3bcf..fdf03fec352 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -6,7 +6,6 @@ set -uex sudo bash -c 'echo 1 > /proc/sys/kernel/sysrq' -sudo mkdir -p /mnt/daos # using mmap()'ed ULT stacks requires to bump system default if [ "$(sudo sysctl -n vm.max_map_count)" -lt "1000000" ] ; then sudo sysctl vm.max_map_count=1000000 @@ -45,7 +44,21 @@ pip install /opt/daos/lib/daos/python/ sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n -HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ +if [ "$#" -eq 0 ]; then + set -- --max-log-size 1950MiB \ + --class-name nlt \ + --system-ram-reserved 4 \ + --dfuse-dir /localhome/jenkins/ \ + --log-usage-save nltir.xml \ + --log-usage-export nltr.json all +fi + +mkdir -p nlt_logs +sudo mount -t tmpfs tmpfs nlt_logs +sudo chown jenkins:jenkins nlt_logs + +exec env \ + TMPDIR="$(pwd)/nlt_logs" \ + HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ NO_PROXY="${DAOS_NO_PROXY:-}" \ - ./utils/node_local_test.py --max-log-size 1950MiB \ - --dfuse-dir /localhome/jenkins/ --log-usage-save nltir.xml --log-usage-export nltr.json all + ./utils/node_local_test.py "$@" diff --git a/ci/unit/test_nlt_post.sh b/ci/unit/test_nlt_post.sh index c46a63dac2f..a70fa14c0e3 100755 --- a/ci/unit/test_nlt_post.sh +++ b/ci/unit/test_nlt_post.sh @@ -16,11 +16,19 @@ mkdir nlt_logs rsync -v -dprt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":/tmp/ \ --filter="include dnt*.log" --filter="include dnt*.log.bz2" \ 
--filter="include dnt_fi_*_logs" \ - --filter="exclude *" nlt_logs/ + --filter="exclude *" nlt_logs/ || true + +# When running with --no-root, DAOS logs go to build/nlt_logs/ on the node +# instead of /tmp/, so fetch them from there as well. +rsync -v -rlpt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/nlt_logs/ \ + --filter="include dnt*.log" --filter="include dnt*.log.bz2" \ + --filter="include dnt_fi_*_logs" --filter="include */" \ + --filter="exclude *" nlt_logs/ || true rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \ --filter="include nlt*.json" --filter="include dnt*.xml" \ --filter="include nltir.xml" --filter="include nltr.json" \ --filter="include nlt-junit.xml" --filter="exclude *" ./ + mkdir -p vm_test mv nlt-errors.json vm_test/ diff --git a/src/tests/ftest/cart/util/cart_logtest.py b/src/tests/ftest/cart/util/cart_logtest.py index d203ce0784e..a500cf2046c 100755 --- a/src/tests/ftest/cart/util/cart_logtest.py +++ b/src/tests/ftest/cart/util/cart_logtest.py @@ -225,7 +225,6 @@ def __init__(self, log_iter, quiet=False): self.fi_triggered = False self.fi_location = None self.skip_suffixes = [] - self.skip_substrings = [] self._tracers = [] self.ftest_mode = False @@ -444,10 +443,6 @@ def _check_pid_from_log_file(self, pid, abort_on_warning, leak_wf, show_memleaks show = False if show and any(map(line.get_msg().endswith, self.skip_suffixes)): show = False - if show: - line_msg = line.get_msg().casefold() - if any(sub in line_msg for sub in self.skip_substrings): - show = False if show: # Allow WARNING or ERROR messages, but anything higher like assert should # trigger a failure. 
diff --git a/utils/nlt_server.yaml b/utils/nlt_server.yaml index 438e55718be..5a9cc7f4c61 100644 --- a/utils/nlt_server.yaml +++ b/utils/nlt_server.yaml @@ -14,7 +14,6 @@ engines: - DAOS_MD_CAP=1024 - DAOS_STRICT_SHUTDOWN=1 - DAOS_TARGET_OVERSUBSCRIBE=1 - - ABT_STACK_OVERFLOW_CHECK=mprotect storage: - class: ram diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 724843e87b5..7ef66dd7734 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -5025,13 +5025,6 @@ def sizeof_fmt(num, suffix='B'): if ignore_busy: lto.skip_suffixes.append(" DER_BUSY(-1012): 'Device or resource busy'") - lto.skip_substrings.extend([ - 'sluggish ec boundary report from rank', - 'sluggish stable epoch reporting', - 'progress callback was not called for too long', - 'rpc failed; rc:', - ]) - try: lto.check_log_file(abort_on_warning=True, show_memleaks=show_memleaks, @@ -6757,6 +6750,14 @@ def run(wf, args): run_fi = True else: print("Unable to detect fault injection feature, skipping testing") + print("Use fallback on $PATH") + fs = subprocess.run(['fault_status'], check=False) + print(fs) + if fs.returncode == 0: + run_fi = True + else: + print("Unable to detect fault injection feature - fall back does not work, " + "skipping testing") if run_fi: args.server_debug = 'INFO'