diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 7eed5f48b989..955995a12c4a 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -281,9 +281,30 @@ typedef enum dmu_object_type { * the transaction is full. See the comment above dmu_tx_assign() for more * details on the meaning of these flags. */ -#define DMU_TX_NOWAIT (0ULL) -#define DMU_TX_WAIT (1ULL<<0) -#define DMU_TX_NOTHROTTLE (1ULL<<1) +typedef enum { + /* + * If the tx cannot be assigned to a transaction for any reason, do + * not block but return immediately. + */ + DMU_TX_NOWAIT = 0, + + /* + * Assign the tx to the open transaction. If the open transaction is + * full, or the write throttle is active, block until the next + * transaction and try again. If the pool suspends while waiting + * and failmode=continue, return an error. + */ + DMU_TX_WAIT = (1 << 0), + + /* If the write throttle would prevent the assignment, ignore it. */ + DMU_TX_NOTHROTTLE = (1 << 1), + + /* + * With DMU_TX_WAIT, always block if the pool suspends during + * assignment, regardless of the value of the failmode= property. + */ + DMU_TX_SUSPEND = (1 << 2), +} dmu_tx_flag_t; void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); @@ -849,7 +870,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t flags); +int dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index b87836ecc2d9..ce49a0c49044 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -25,6 +25,7 @@ */ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara, Inc. 
*/ #ifndef _SYS_DMU_TX_H @@ -80,6 +81,9 @@ struct dmu_tx { /* has this transaction already been delayed? */ boolean_t tx_dirty_delayed; + /* whether dmu_tx_wait() should return on suspend */ + boolean_t tx_break_on_suspend; + int tx_err; }; @@ -143,7 +147,7 @@ extern dmu_tx_stats_t dmu_tx_stats; * These routines are defined in dmu.h, and are called by the user. */ dmu_tx_t *dmu_tx_create(objset_t *dd); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +int dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); diff --git a/include/sys/txg.h b/include/sys/txg.h index 5ce727a279de..eabb6f7aab4e 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -25,6 +25,7 @@ */ /* * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #ifndef _SYS_TXG_H @@ -78,6 +79,9 @@ typedef enum { /* If a signal arrives while waiting, abort and return EINTR */ TXG_WAIT_SIGNAL = (1 << 0), + + /* If the pool suspends while waiting, abort and return ESHUTDOWN. */ + TXG_WAIT_SUSPEND = (1 << 1), } txg_wait_flag_t; struct dsl_pool; @@ -111,6 +115,11 @@ extern int txg_wait_synced_flags(struct dsl_pool *dp, uint64_t txg, */ extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); +/* + * Wake all threads waiting in txg_wait_synced_flags() so they can reevaluate. + */ +extern void txg_wait_kick(struct dsl_pool *dp); + /* * Wait until the given transaction group, or one after it, is * the open transaction group. Try to make this happen as soon diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index b75d1ccea685..c8ab7cc7cf8e 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1241,7 +1241,16 @@ vdev_geom_io_done(zio_t *zio) } if (bp == NULL) { - ASSERT3S(zio->io_error, ==, ENXIO); + if (zio_injection_enabled && zio->io_error == EIO) + /* + * Convert an injected EIO to ENXIO. 
This is needed to + * work around zio_handle_device_injection_impl() not + * currently being able to inject ENXIO directly, while + * the assertion below only allows ENXIO here. + */ + zio->io_error = SET_ERROR(ENXIO); + else + ASSERT3S(zio->io_error, ==, ENXIO); return; } diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index cde47de6b2bf..65443d112f27 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -568,7 +568,7 @@ commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object, { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node)); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); uint64_t txg = dmu_tx_get_txg(tx); if (!md->md_synctask_txg[txg & TXG_MASK]) { dsl_sync_task_nowait(dmu_tx_pool(tx), diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index f2bd6a5e3c3c..c2e6c749fbaa 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ #include @@ -1017,7 +1017,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) * decreasing performance. */ static int -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t flags) +dmu_tx_try_assign(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; @@ -1032,19 +1032,10 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t flags) DMU_TX_STAT_BUMP(dmu_tx_suspended); /* - * If the user has indicated a blocking failure mode - * then return ERESTART which will block in dmu_tx_wait(). - * Otherwise, return EIO so that an error can get - * propagated back to the VOP calls. - * - * Note that we always honor the `flags` flag regardless - * of the failuremode setting. 
+ * Let dmu_tx_assign() know specifically what happened, so + * it can make the right choice based on the caller flags. */ - if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && - !(flags & DMU_TX_WAIT)) - return (SET_ERROR(EIO)); - - return (SET_ERROR(ERESTART)); + return (SET_ERROR(ESHUTDOWN)); } if (!tx->tx_dirty_delayed && @@ -1184,6 +1175,12 @@ dmu_tx_unassign(dmu_tx_t *tx) * they have already called dmu_tx_wait() (though most likely on a * different tx). * + * If DMU_TX_SUSPEND is set, this indicates that this tx should ignore + * the pool being or becoming suspended while it is in progress. This will + * cause dmu_tx_assign() (and dmu_tx_wait()) to block until the pool resumes. + * If this flag is not set and the pool suspends, the return will be either + * ERESTART or EIO, depending on the value of the pool's failmode= property. + * * It is guaranteed that subsequent successful calls to dmu_tx_assign() * will assign the tx to monotonically increasing txgs. Of course this is * not strong monotonicity, because the same txg can be returned multiple @@ -1201,12 +1198,13 @@ dmu_tx_unassign(dmu_tx_t *tx) * 1 <- dmu_tx_get_txg(T3) */ int -dmu_tx_assign(dmu_tx_t *tx, uint64_t flags) +dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags) { int err; ASSERT(tx->tx_txg == 0); - ASSERT0(flags & ~(DMU_TX_WAIT | DMU_TX_NOTHROTTLE)); + ASSERT0(flags & ~(DMU_TX_WAIT | DMU_TX_NOTHROTTLE | DMU_TX_SUSPEND)); + IMPLY(flags & DMU_TX_SUSPEND, flags & DMU_TX_WAIT); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. */ @@ -1215,13 +1213,74 @@ if ((flags & DMU_TX_NOTHROTTLE)) tx->tx_dirty_delayed = B_TRUE; - while ((err = dmu_tx_try_assign(tx, flags)) != 0) { + if (!(flags & DMU_TX_SUSPEND)) + tx->tx_break_on_suspend = B_TRUE; + + while ((err = dmu_tx_try_assign(tx)) != 0) { dmu_tx_unassign(tx); + boolean_t suspended = (err == ESHUTDOWN); + if (suspended) { + /* + * Pool suspended. 
We need to decide whether to block + * and retry, or return error, depending on the + * caller's flags and the pool config. + */ + if (flags & DMU_TX_SUSPEND) + /* + * The caller expressly does not care about + * suspend, so treat it as a normal retry. + */ + err = SET_ERROR(ERESTART); + else if ((flags & DMU_TX_WAIT) && + spa_get_failmode(tx->tx_pool->dp_spa) == + ZIO_FAILURE_MODE_CONTINUE) + /* + * Caller wants to wait, but pool config is + * overriding that, so return EIO to be + * propagated back to userspace. + */ + err = SET_ERROR(EIO); + else + /* Anything else, we should just block. */ + err = SET_ERROR(ERESTART); + } + + /* + * Return unless we decided to retry, or the caller does not + * want to block. + */ if (err != ERESTART || !(flags & DMU_TX_WAIT)) return (err); + /* + * Wait until there's room in this txg, or until it's been + * synced out and a new one is available. + * + * If we're here because the pool suspended above, then we + * unset tx_break_on_suspend to make sure that if dmu_tx_wait() + * has to fall back to a txg_wait_synced_flags(), it doesn't + * immediately return because the pool is suspended. That would + * then immediately return here, and we'd end up in a busy loop + * until the pool resumes. + * + * On the other hand, if the pool hasn't suspended yet, then it + * should be allowed to break a txg wait if the pool does + * suspend, so we can loop and reassess it in + * dmu_tx_try_assign(). + */ + if (suspended) + tx->tx_break_on_suspend = B_FALSE; + dmu_tx_wait(tx); + + /* + * Reset tx_break_on_suspend for DMU_TX_SUSPEND. We do this + * here so that it's available if we return for some other + * reason, and then the caller calls dmu_tx_wait(). 
+ */ + if (!(flags & DMU_TX_SUSPEND)) + tx->tx_break_on_suspend = B_TRUE; } txg_rele_to_quiesce(&tx->tx_txgh); @@ -1239,6 +1298,16 @@ dmu_tx_wait(dmu_tx_t *tx) ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); + /* + * Break on suspend according to whether or not DMU_TX_SUSPEND was + * supplied to the previous dmu_tx_assign() call. For clients, this + * ensures that after dmu_tx_assign() fails, the followup dmu_tx_wait() + * gets the same behaviour wrt suspend. See also the comments in + * dmu_tx_assign(). + */ + txg_wait_flag_t flags = + (tx->tx_break_on_suspend ? TXG_WAIT_SUSPEND : TXG_WAIT_NONE); + before = gethrtime(); if (tx->tx_wait_dirty) { @@ -1276,7 +1345,7 @@ dmu_tx_wait(dmu_tx_t *tx) * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. */ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + txg_wait_synced_flags(dp, spa_last_synced_txg(spa) + 1, flags); } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; @@ -1291,7 +1360,7 @@ dmu_tx_wait(dmu_tx_t *tx) * out a TXG at which point we'll hopefully have synced * a portion of the changes. 
*/ - txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); + txg_wait_synced_flags(dp, spa_last_synced_txg(spa) + 1, flags); } spa_tx_assign_add_nsecs(spa, gethrtime() - before); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index e10b1a879204..cf04bb866683 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1437,7 +1437,7 @@ dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index d538aa75ed98..1097a72d3211 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -58,7 +58,7 @@ dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, top: tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); dst.dst_pool = dp; dst.dst_txg = dmu_tx_get_txg(tx); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 4d36a04498a7..bc2d1e1c47db 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1984,7 +1984,7 @@ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index dbeacc4abe4a..46a1d06a7fdb 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara, Inc. 
*/ #include @@ -705,7 +706,7 @@ txg_wait_synced_flags(dsl_pool_t *dp, uint64_t txg, txg_wait_flag_t flags) int error = 0; tx_state_t *tx = &dp->dp_tx; - ASSERT0(flags & ~TXG_WAIT_SIGNAL); + ASSERT0(flags & ~(TXG_WAIT_SIGNAL | TXG_WAIT_SUSPEND)); ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); @@ -723,6 +724,15 @@ txg_wait_synced_flags(dsl_pool_t *dp, uint64_t txg, txg_wait_flag_t flags) * else interesting happens, we'll set an error and break out. */ while (tx->tx_synced_txg < txg) { + if ((flags & TXG_WAIT_SUSPEND) && spa_suspended(dp->dp_spa)) { + /* + * Pool suspended and the caller does not want to + * block; inform them immediately. + */ + error = SET_ERROR(ESHUTDOWN); + break; + } + dprintf("broadcasting sync more " "tx_synced=%llu waiting=%llu dp=%px\n", (u_longlong_t)tx->tx_synced_txg, @@ -756,6 +766,15 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) VERIFY0(txg_wait_synced_flags(dp, txg, TXG_WAIT_NONE)); } +void +txg_wait_kick(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + mutex_enter(&tx->tx_sync_lock); + cv_broadcast(&tx->tx_sync_done_cv); + mutex_exit(&tx->tx_sync_lock); +} + /* * Wait for the specified open transaction group. Set should_quiesce * when the current open txg should be quiesced immediately. 
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index b58b87d1fcc7..fac2c3a5f154 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -569,7 +569,7 @@ spa_condense_indirect_commit_entry(spa_t *spa, dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 8ff38889b797..4274728578ad 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -158,7 +158,7 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) vd->vdev_initialize_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); if (new_state != VDEV_INITIALIZE_NONE) { dsl_sync_task_nowait(spa_get_dsl(spa), @@ -250,7 +250,7 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) mutex_exit(&vd->vdev_initialize_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 62d9c9909bd1..a9b12471cbc1 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4652,7 +4652,8 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, + DMU_TX_WAIT | DMU_TX_SUSPEND)); uint64_t txg = dmu_tx_get_txg(tx); /* diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 
21cb57e38b12..0e296606d037 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -287,7 +287,7 @@ vdev_rebuild_initiate(vdev_t *vd) ASSERT(!vd->vdev_rebuilding); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); vd->vdev_rebuilding = B_TRUE; @@ -592,7 +592,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) mutex_exit(&vr->vr_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); @@ -932,7 +932,7 @@ vdev_rebuild_thread(void *arg) dsl_pool_t *dp = spa_get_dsl(spa); dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); mutex_enter(&vd->vdev_rebuild_lock); if (error == 0) { diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 28aae9a31856..d8de5a747f04 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1724,7 +1724,8 @@ spa_vdev_remove_thread(void *arg) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | + DMU_TX_SUSPEND)); uint64_t txg = dmu_tx_get_txg(tx); /* diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 43998577c0ad..842bb3e690d4 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -342,7 +342,7 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync, guid, tx); @@ 
-527,7 +527,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) mutex_exit(&vd->vdev_trim_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ac271d398155..644a00aaeb40 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -957,7 +957,7 @@ zil_commit_activate_saxattr_feature(zilog_t *zilog) dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); dsl_dataset_dirty(ds, tx); txg = dmu_tx_get_txg(tx); @@ -1003,7 +1003,7 @@ zil_create(zilog_t *zilog) */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); @@ -1093,7 +1093,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); @@ -1977,7 +1977,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) * Open transaction to allocate the next block pointer. 
*/ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_NOTHROTTLE)); + VERIFY0(dmu_tx_assign(tx, + DMU_TX_WAIT | DMU_TX_NOTHROTTLE | DMU_TX_SUSPEND)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); uint64_t txg = dmu_tx_get_txg(tx); @@ -3456,7 +3457,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * should not be subject to the dirty data based delays. We * use DMU_TX_NOTHROTTLE to bypass the delay mechanism. */ - VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_NOTHROTTLE)); + VERIFY0(dmu_tx_assign(tx, + DMU_TX_WAIT | DMU_TX_NOTHROTTLE | DMU_TX_SUSPEND)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1769606ebb8a..c24faeada7fe 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2714,6 +2714,8 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) } mutex_exit(&spa->spa_suspend_lock); + + txg_wait_kick(spa->spa_dsl_pool); } int diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 9ea511dec0eb..246f02e9db6d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -716,6 +716,10 @@ tags = ['functional', 'direct'] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] +[tests/functional/failmode] +tests = ['failmode_dmu_tx_wait', 'failmode_dmu_tx_continue'] +tags = ['functional', 'failmode'] + [tests/functional/fallocate] tests = ['fallocate_punch-hole'] tags = ['functional', 'fallocate'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index acff5e57db93..4345b2e930d0 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1525,6 +1525,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/fadvise/cleanup.ksh \ functional/fadvise/fadvise_sequential.ksh \ functional/fadvise/setup.ksh \ + functional/failmode/cleanup.ksh \ + functional/failmode/failmode_dmu_tx_wait.ksh \ + 
functional/failmode/failmode_dmu_tx_continue.ksh \ + functional/failmode/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ functional/fallocate/fallocate_punch-hole.ksh \ diff --git a/tests/zfs-tests/tests/functional/failmode/cleanup.ksh b/tests/zfs-tests/tests/functional/failmode/cleanup.ksh new file mode 100755 index 000000000000..59d225388f47 --- /dev/null +++ b/tests/zfs-tests/tests/functional/failmode/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/failmode/failmode_dmu_tx_continue.ksh b/tests/zfs-tests/tests/functional/failmode/failmode_dmu_tx_continue.ksh new file mode 100755 index 000000000000..f5f37b3f51bb --- /dev/null +++ b/tests/zfs-tests/tests/functional/failmode/failmode_dmu_tx_continue.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +log_assert "dmu_tx_assign() returns when pool suspends with failmode=continue" + +typeset -i dd_pid=0 + +function cleanup +{ + zinject -c all || true + zpool clear $TESTPOOL || true + test $dd_pid -gt 0 && kill -9 $dd_pid || true + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +DISK=${DISKS%% *} + +# create a single-disk pool, set failmode=continue +log_must zpool create -o failmode=continue -f $TESTPOOL $DISK +log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS + +# start writing to a file in the background. these args to dd will make it +# keep writing until it fills the pool, but we abort before that happens +dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file bs=128k & +dd_pid=$! 
+ +# give it a moment for the write throttle to start pushing back +sleep 2 + +# force the pool to suspend by inducing the writes and followup probe to fail +log_must zinject -d $DISK -e io -T write $TESTPOOL +log_must zinject -d $DISK -e nxio -T probe $TESTPOOL + +# should only take a moment, but give it a chance +log_note "waiting for pool to suspend" +typeset -i tries=10 +until [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool didn't suspend" + fi + sleep 1 +done + +# dmu_tx_try_assign() should have noticed the suspend by now +typeset -i suspended=$(kstat dmu_tx.dmu_tx_suspended) + +# dd should have exited with failure +typeset -i blocked +if kill -0 $dd_pid ; then + blocked=1 + log_note "dd is blocked in the kernel!" +else + blocked=0 + log_note "dd exited while pool suspended" +fi + +# bring the pool back online +log_must zinject -c all +log_must zpool clear $TESTPOOL + +# kill dd if it's still running, then get its return code +# (it will be a failure if it was still running and we kill it, but we don't +# care about that, because we already know it blocked) +test $blocked -eq 1 && kill -9 $dd_pid +wait $dd_pid +typeset -i rc=$? +dd_pid=0 + +# confirm that dd failed when the pool suspended +log_must test $suspended -ne 0 +log_must test $blocked -eq 0 +log_must test $rc -ne 0 + +log_pass "dmu_tx_assign() returns when pool suspends with failmode=continue" diff --git a/tests/zfs-tests/tests/functional/failmode/failmode_dmu_tx_wait.ksh b/tests/zfs-tests/tests/functional/failmode/failmode_dmu_tx_wait.ksh new file mode 100755 index 000000000000..88284942603e --- /dev/null +++ b/tests/zfs-tests/tests/functional/failmode/failmode_dmu_tx_wait.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +log_assert "dmu_tx_assign() blocks when pool suspends with failmode=wait" + +typeset -i dd_pid=0 + +function cleanup +{ + zinject -c all || true + zpool clear $TESTPOOL || true + test $dd_pid -gt 0 && kill -9 $dd_pid || true + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +DISK=${DISKS%% *} + +# create a single-disk pool, set failmode=wait +log_must zpool create -o failmode=wait -f $TESTPOOL $DISK +log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS + +# start writing to a file in the background. these args to dd will make it +# keep writing until it fills the pool, but we will kill it before that happens. +dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file bs=128k & +dd_pid=$! 
+ +# give it a moment for the write throttle to start pushing back +sleep 2 + +# force the pool to suspend by inducing the writes and followup probe to fail +log_must zinject -d $DISK -e io -T write $TESTPOOL +log_must zinject -d $DISK -e nxio -T probe $TESTPOOL + +# should only take a moment, but give it a chance +log_note "waiting for pool to suspend" +typeset -i tries=10 +until [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool didn't suspend" + fi + sleep 1 +done + +# dmu_tx_try_assign() should have noticed the suspend by now +typeset -i suspended=$(kstat dmu_tx.dmu_tx_suspended) + +# dd should still be running, blocked in the kernel +typeset -i blocked +if kill -0 $dd_pid ; then + blocked=1 + log_note "dd is blocked as expected" +else + blocked=0 + log_note "dd exited while pool suspended!" +fi + +# bring the pool back online +log_must zinject -c all +log_must zpool clear $TESTPOOL + +# kill dd, we're done with it +kill -9 $dd_pid +wait $dd_pid +dd_pid=0 + +# confirm that dd was blocked in dmu_tx assign/wait +log_must test $suspended -ne 0 +log_must test $blocked -eq 1 + +log_pass "dmu_tx_assign() blocks when pool suspends with failmode=wait" diff --git a/tests/zfs-tests/tests/functional/failmode/setup.ksh b/tests/zfs-tests/tests/functional/failmode/setup.ksh new file mode 100755 index 000000000000..099c6306dbd5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/failmode/setup.ksh @@ -0,0 +1,28 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib