diff --git a/cmd/zed/zed.d/deadman-slot_off.sh b/cmd/zed/zed.d/deadman-sync-slot_off.sh similarity index 100% rename from cmd/zed/zed.d/deadman-slot_off.sh rename to cmd/zed/zed.d/deadman-sync-slot_off.sh diff --git a/cmd/zed/zed.d/pool_import-led.sh b/cmd/zed/zed.d/pool_import-sync-led.sh similarity index 100% rename from cmd/zed/zed.d/pool_import-led.sh rename to cmd/zed/zed.d/pool_import-sync-led.sh diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-sync-led.sh similarity index 100% rename from cmd/zed/zed.d/statechange-led.sh rename to cmd/zed/zed.d/statechange-sync-led.sh diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-sync-slot_off.sh similarity index 100% rename from cmd/zed/zed.d/statechange-slot_off.sh rename to cmd/zed/zed.d/statechange-sync-slot_off.sh diff --git a/cmd/zed/zed.d/vdev_attach-led.sh b/cmd/zed/zed.d/vdev_attach-sync-led.sh similarity index 100% rename from cmd/zed/zed.d/vdev_attach-led.sh rename to cmd/zed/zed.d/vdev_attach-sync-led.sh diff --git a/cmd/zed/zed.d/vdev_clear-led.sh b/cmd/zed/zed.d/vdev_clear-sync-led.sh similarity index 100% rename from cmd/zed/zed.d/vdev_clear-led.sh rename to cmd/zed/zed.d/vdev_clear-sync-led.sh diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c index 036081decd64..cb6c5e783ba5 100644 --- a/cmd/zed/zed_exec.c +++ b/cmd/zed/zed_exec.c @@ -196,38 +196,33 @@ _nop(int sig) (void) sig; } -static void * -_reap_children(void *arg) +static void +wait_for_children(boolean_t do_pause, boolean_t wait) { - (void) arg; - struct launched_process_node node, *pnode; pid_t pid; - int status; struct rusage usage; - struct sigaction sa = {}; - - (void) sigfillset(&sa.sa_mask); - (void) sigdelset(&sa.sa_mask, SIGCHLD); - (void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); - - (void) sigemptyset(&sa.sa_mask); - sa.sa_handler = _nop; - sa.sa_flags = SA_NOCLDSTOP; - (void) sigaction(SIGCHLD, &sa, NULL); + int status; + struct launched_process_node node, *pnode; for (_reap_children_stop = B_FALSE; !_reap_children_stop; ) { (void) pthread_mutex_lock(&_launched_processes_lock); - pid = wait4(0, &status, WNOHANG, &usage); - + pid = wait4(0, &status, wait ? 0 : WNOHANG, &usage); if (pid == 0 || pid == (pid_t)-1) { (void) pthread_mutex_unlock(&_launched_processes_lock); - if (pid == 0 || errno == ECHILD) - pause(); - else if (errno != EINTR) + if ((pid == 0) || (errno == ECHILD)) { + if (do_pause) + pause(); + } else if (errno != EINTR) zed_log_msg(LOG_WARNING, "Failed to wait for children: %s", strerror(errno)); + zed_log_msg(LOG_INFO, "badpid exit %d", pid); + if (!do_pause) + return; + } else { + zed_log_msg(LOG_INFO, "normal pid %d", pid); + memset(&node, 0, sizeof (node)); node.pid = pid; pnode = avl_find(&_launched_processes, &node, NULL); @@ -278,6 +273,25 @@ _reap_children(void *arg) } } +} + +static void * +_reap_children(void *arg) +{ + (void) arg; + struct sigaction sa = {}; + + (void) sigfillset(&sa.sa_mask); + (void) sigdelset(&sa.sa_mask, SIGCHLD); + (void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); + + (void) sigemptyset(&sa.sa_mask); + sa.sa_handler = _nop; + sa.sa_flags = SA_NOCLDSTOP; + (void) sigaction(SIGCHLD, &sa, NULL); + + wait_for_children(B_TRUE, B_FALSE); + return (NULL); } @@ -306,6 +320,45 @@ zed_exec_fini(void) _reap_children_tid = (pthread_t)-1; } +/* + * Check if the zedlet name indicates if it is a synchronous zedlet + * + * Synchronous zedlets have a "-sync-" immediately following the event name in + * their zedlet filename, like: + * + * EVENT_NAME-sync-ZEDLETNAME.sh + * + * For example, if you wanted a synchronous statechange script: + * + * statechange-sync-myzedlet.sh + * + * Synchronous zedlets are guaranteed to be the only zedlet running. No other + * zedlets may run in parallel with a synchronous zedlet. A synchronously + * zedlet will wait for all previously spawned zedlets to finish before running. + * Users should be careful to only use synchronous zedlets when needed, since + * they decrease parallelism. + */ +static boolean_t +zedlet_is_sync(const char *zedlet, const char *event) +{ + const char *sync_str = "-sync-"; + size_t sync_str_len; + size_t zedlet_len; + size_t event_len; + + sync_str_len = strlen(sync_str); + zedlet_len = strlen(zedlet); + event_len = strlen(event); + + if (event_len + sync_str_len >= zedlet_len) + return (B_FALSE); + + if (strncmp(&zedlet[event_len], sync_str, sync_str_len) == 0) + return (B_TRUE); + + return (B_FALSE); +} + /* * Process the event [eid] by synchronously invoking all zedlets with a * matching class prefix. @@ -368,9 +421,28 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass, z = zed_strings_next(zcp->zedlets)) { for (csp = class_strings; *csp; csp++) { n = strlen(*csp); - if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) + if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) { + boolean_t is_sync = zedlet_is_sync(z, *csp); + + if (is_sync) { + /* + * Wait for previous zedlets to + * finish + */ + wait_for_children(B_FALSE, B_TRUE); + } + _zed_exec_fork_child(eid, zcp->zedlet_dir, z, e, zcp->zevent_fd, zcp->do_foreground); + + if (is_sync) { + /* + * Wait for sync zedlet we just launched + * to finish. + */ + wait_for_children(B_FALSE, B_TRUE); + } + } } } free(e); diff --git a/man/man8/zed.8.in b/man/man8/zed.8.in index c90a1834403b..b1445ef94992 100644 --- a/man/man8/zed.8.in +++ b/man/man8/zed.8.in @@ -158,6 +158,8 @@ Multiple ZEDLETs may be invoked for a given zevent. ZEDLETs are executables invoked by the ZED in response to a given zevent. They should be written under the presumption they can be invoked concurrently, and they should use appropriate locking to access any shared resources. +The one exception to this are "synchronous zedlets", which are described later +in this page. Common variables used by ZEDLETs can be stored in the default rc file which is sourced by scripts; these variables should be prefixed with .Sy ZED_ . @@ -233,6 +235,36 @@ and .Sy ZPOOL . These variables may be overridden in the rc file. . +.Sh Synchronous ZEDLETS +ZED's normal behavior is to spawn off zedlets in parallel and ignore their +completion order. +This means that ZED can potentially +have zedlets for event ID number 2 starting before zedlets for event ID number +1 have finished. +Most of the time this is fine, and it actually helps when the system is getting +hammered with hundreds of events. +.Pp +However, there are times when you want your zedlets to be executed in sequence +with the event ID. +That is where synchronous zedlets come in. +.Pp +ZED will wait for all previously spawned zedlets to finish before running +a synchronous zedlet. +Synchronous zedlets are guaranteed to be the only +zedlet running. +No other zedlets may run in parallel with a synchronous zedlet. +Users should be careful to only use synchronous zedlets when needed, since +they decrease parallelism. +.Pp +To make a zedlet synchronous, simply add a "-sync-" immediately following the +event name in the zedlet's file name: +.Pp +.Sy EVENT_NAME-sync-ZEDLETNAME.sh +.Pp +For example, if you wanted a synchronous statechange script: +.Pp +.Sy statechange-sync-myzedlet.sh +. .Sh FILES .Bl -tag -width "-c" .It Pa @sysconfdir@/zfs/zed.d diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 8d3cb06076a3..a930fb07f5cb 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -109,7 +109,8 @@ tags = ['functional', 'direct'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', - 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple'] + 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple', + 'zed_synchronous_zedlet'] tags = ['functional', 'events'] [tests/functional/fadvise:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index acff5e57db93..08ff5a95a7b9 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1518,6 +1518,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/zed_rc_filter.ksh \ functional/events/zed_slow_io.ksh \ functional/events/zed_slow_io_many_vdevs.ksh \ + functional/events/zed_synchronous_zedlet.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh b/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh new file mode 100755 index 000000000000..6b732ea96d0c --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh @@ -0,0 +1,149 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. +# + +# DESCRIPTION: +# Verify ZED synchronous zedlets work as expected +# +# STRATEGY: +# 1. Create a scrub_start zedlet that runs quickly +# 2. Create a scrub_start zedlet that runs slowly (takes seconds) +# 3. Create a scrub_finish zedlet that is synchronous and runs slowly +# 4. Create a trim_start zedlet that runs quickly +# 4. Scrub the pool +# 5. Trim the pool +# 6. Verify the synchronous scrub_finish zedlet waited for the scrub_start +# zedlets to finish (including the slow one). If the scrub_finish zedlet +# was not synchronous, it would have completed before the slow scrub_start +# zedlet. +# 7. Verify the trim_start zedlet waited for the slow synchronous scrub_finish +# zedlet to complete. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/events/events_common.kshlib + +verify_runnable "both" + +OUR_ZEDLETS="scrub_start-async.sh scrub_start-slow.sh scrub_finish-sync-slow.sh trim_start-async.sh" + +OUTFILE="$TEST_BASE_DIR/zed_synchronous_zedlet_lines" +TESTPOOL2=testpool2 + +function cleanup +{ + zed_stop + + for i in $OUR_ZEDLETS ; do + log_must rm -f $ZEDLET_DIR/$i + done + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev-file-sync-zedlet + log_must rm -f $OUTFILE +} + +log_assert "Verify ZED synchronous zedlets work as expected" + +log_onexit cleanup + +# Make a pool +log_must truncate -s 100M $TEST_BASE_DIR/vdev-file-sync-zedlet +log_must zpool create $TESTPOOL2 $TEST_BASE_DIR/vdev-file-sync-zedlet + +# Do an initial scrub +log_must zpool scrub -w $TESTPOOL2 + +log_must zpool events -c + +mkdir -p $ZEDLET_DIR + +# Create zedlets +cat << EOF > $ZEDLET_DIR/scrub_start-async.sh +#!/bin/ksh -p +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/scrub_start-slow.sh +#!/bin/ksh -p +sleep 3 +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/scrub_finish-sync-slow.sh +#!/bin/ksh -p +sleep 3 +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/trim_start-async.sh +#!/bin/ksh -p +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +for i in $OUR_ZEDLETS ; do + log_must chmod +x $ZEDLET_DIR/$i +done + +log_must zed_start + +# Do a scrub - it should be instantaneous. +log_must zpool scrub -w $TESTPOOL2 + +# Start off a trim immediately after scrubiung. The trim should be +# instantaneous and generate a trimp_start event. This will happen in parallel +# with the slow 'scrub_finish-sync-slow.sh' zedlet still running. +log_must zpool trim -w $TESTPOOL2 + +# Wait for scrub_finish event to happen for sanity. This is the *event*, not +# the completion of zedlets for the event. +log_must file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.trim_finish' 10 + +# At a minimum, scrub_start-slow.sh + scrub_finish-sync-slow.sh will take a +# total of 6 seconds to run, so wait 7 sec to be sure. +sleep 7 + +# If our zedlets were run in the right order, with sync correctly honored, you +# will see this ordering in $OUTFILE: +# +# Fri May 16 12:04:23 PDT 2025 scrub_start-async.sh +# Fri May 16 12:04:26 PDT 2025 scrub_start-slow.sh +# Fri May 16 12:04:31 PDT 2025 scrub_finish-sync-slow.sh +# Fri May 16 12:04:31 PDT 2025 trim_start-async.sh +# +# Check for this ordering + +# Get a list of just the script names in the order they were executed +# from OUTFILE +lines="$(echo $(grep -Eo '(scrub|trim)_.+\.sh$' $OUTFILE))" + +# Compare it to the ordering we expect +expected="\ +scrub_start-async.sh \ +scrub_start-slow.sh \ +scrub_finish-sync-slow.sh \ +trim_start-async.sh" +log_must test "$lines" == "$expected" + +log_pass "Verified synchronous zedlets"