diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 45eb9c783659..9a3e45d29ac0 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -7723,7 +7723,8 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) if (cfg == NULL) { zdb_set_skip_mmp(poolname); - error = spa_get_stats(poolname, &cfg, NULL, 0); + error = spa_get_stats(poolname, &cfg, NULL, 0, + ZPOOL_LOCK_BEHAVIOR_DEFAULT); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 2ec189b98653..fa4f5769b730 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -118,6 +118,7 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, boolean_t literal, int *err) { zpool_list_t *zlp; + int rc; zlp = safe_malloc(sizeof (zpool_list_t)); @@ -137,7 +138,11 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type, zlp->zl_literal = literal; if (argc == 0) { - (void) zpool_iter(g_zfs, add_pool, zlp); + rc = zpool_iter(g_zfs, add_pool, zlp); + if (rc != 0) { + free(zlp); + return (NULL); + } zlp->zl_findall = B_TRUE; } else { int i; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 8eb7e35cf2d0..a355fa750797 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -10979,6 +10979,32 @@ status_callback(zpool_handle_t *zhp, void *data) return (0); } +/* + * Set the zpool status lock behavior based off of the ZPOOL_LOCK_BEHAVIOR + * envvar. If the var is not set, or an unknown value, then set the lock + * behavior to ZPOOL_LOCK_BEHAVIOR_DEFAULT. 
+ */ +static void +zpool_set_lock_behavior(void) +{ + char *str; + zpool_lock_behavior_t zpool_lock_behavior; + + str = getenv("ZPOOL_LOCK_BEHAVIOR"); + if (str == NULL) + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + else if (strcmp(str, "wait") == 0) + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_WAIT; + else if (strcmp(str, "trylock") == 0) + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_TRYLOCK; + else if (strcmp(str, "lockless") == 0) + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_LOCKLESS; + else + zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + + libzfs_set_lock_behavior(g_zfs, zpool_lock_behavior); +} + /* * zpool status [-c [script1,script2,...]] [-dDegiLpPstvx] [--power] ... * [-T d|u] [pool] [interval [count]] @@ -11152,6 +11178,8 @@ zpool_do_status(int argc, char **argv) usage(B_FALSE); } + zpool_set_lock_behavior(); + for (;;) { if (cb.cb_json) { cb.cb_jsobj = zpool_json_schema(0, 1); diff --git a/include/libzfs.h b/include/libzfs.h index c3b3542bcf6c..107541abee90 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -264,6 +264,8 @@ _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); _LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift); +_LIBZFS_H void libzfs_set_lock_behavior(libzfs_handle_t *, + zpool_lock_behavior_t); typedef struct splitflags { /* do not split, but return the config that would be split off */ diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h index 546a4681a291..f91831fcefde 100644 --- a/include/os/freebsd/spl/sys/mutex.h +++ b/include/os/freebsd/spl/sys/mutex.h @@ -72,4 +72,19 @@ typedef enum { #define mutex_owned(lock) sx_xlocked(lock) #define mutex_owner(lock) sx_xholder(lock) +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ETIME + * if timeout occurred. 
+ */ +static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns) +{ + hrtime_t end = gethrtime() + ns; + while (gethrtime() < end) { + if (mutex_tryenter(mutex)) + return (0); /* success */ + } + return (ETIME); +} + #endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/include/os/linux/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h index f000f53ab9b6..5f1faa6a95ce 100644 --- a/include/os/linux/spl/sys/mutex.h +++ b/include/os/linux/spl/sys/mutex.h @@ -26,6 +26,7 @@ #define _SPL_MUTEX_H #include +#include #include #include #include @@ -187,4 +188,19 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ /* NOTE: do not dereference mp after this point */ \ } +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ETIME + * if timeout occurred. + */ +static inline int mutex_enter_timeout(kmutex_t *mutex, uint64_t ns) +{ + hrtime_t end = gethrtime() + ns; + while (gethrtime() < end) { + if (mutex_tryenter(mutex)) + return (0); /* success */ + } + return (ETIME); +} + #endif /* _SPL_MUTEX_H */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 44d63e8708cb..ac9642f5433b 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -872,6 +872,9 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" #define ZPOOL_CONFIG_COMPATIBILITY "compatibility" +/* ZFS_IOC_POOL_STATS argument for spa_namespace locking behavior */ +#define ZPOOL_CONFIG_LOCK_BEHAVIOR "lock_behavior" /* not stored on disk */ + /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such @@ -1984,6 +1987,30 @@ enum zio_encrypt { ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name)) +/* + * Set locking behavior for zpool commands. 
+ */ +typedef enum { + /* Wait to acquire the lock on the zpool config */ + ZPOOL_LOCK_BEHAVIOR_WAIT = 0, + ZPOOL_LOCK_BEHAVIOR_DEFAULT = ZPOOL_LOCK_BEHAVIOR_WAIT, + /* + * Return an error if it's taking an unnecessarily long time to + * acquire the lock on the pool config (default 100ms) + */ + ZPOOL_LOCK_BEHAVIOR_TRYLOCK = 1, + + /* + * DANGER: THIS CAN CRASH YOUR SYSTEM + * + * If you can't acquire the pool config lock after 100ms then do a + * a lockless lookup. This should only be done in emergencies, as it + * can crash the kernel module! + */ + ZPOOL_LOCK_BEHAVIOR_LOCKLESS = 2, + ZPOOL_LOCK_BEHAVIOR_END = 3 /* last entry marker */ +} zpool_lock_behavior_t; + #ifdef __cplusplus } #endif diff --git a/include/sys/spa.h b/include/sys/spa.h index 880f60471b65..4cb642bfc701 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -742,10 +742,13 @@ typedef enum trim_type { /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, const void *tag); +extern int spa_open_common_lock_behavior(const char *pool, spa_t **spapp, + const void *tag, nvlist_t *nvpolicy, nvlist_t **config, + zpool_lock_behavior_t zpool_lock_behavior); extern int spa_open_rewind(const char *pool, spa_t **, const void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, - size_t buflen); + size_t buflen, zpool_lock_behavior_t zpool_lock_behavior); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, @@ -851,10 +854,13 @@ extern kcondvar_t spa_namespace_cv; extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); -extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); +extern int spa_all_configs(uint64_t *generation, nvlist_t **pools, + zpool_lock_behavior_t zpool_lock_behavior); extern void 
spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); +extern nvlist_t *spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd, + uint64_t txg, int getstats, zpool_lock_behavior_t zpool_lock_behavior); extern void spa_config_update(spa_t *spa, int what); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); @@ -866,9 +872,11 @@ extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_lookup_lockless(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); +extern spa_t *spa_next_lockless(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, const void *tag); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 8c52f751a819..87a93f5f9d90 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -491,6 +491,7 @@ extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); extern const char *zfs_active_allocator; +extern unsigned int spa_namespace_trylock_ms; extern int param_set_active_allocator_common(const char *val); #ifdef __cplusplus diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index b3d48e257538..09a5b41c0c5d 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -277,6 +277,7 @@ extern void mutex_enter(kmutex_t *mp); extern int mutex_enter_check_return(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); +extern int mutex_enter_timeout(kmutex_t *mp, uint64_t ns); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 
26ae5d2f1396..4fe026b70289 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -218,6 +218,7 @@ + @@ -2073,6 +2074,15 @@ + + + + + + + + + @@ -2212,7 +2222,7 @@ - + @@ -2270,6 +2280,9 @@ + + + @@ -2949,6 +2962,12 @@ + + + + + + @@ -2959,6 +2978,12 @@ + + + + + + @@ -3054,6 +3079,12 @@ + + + + + + @@ -3563,12 +3594,6 @@ - - - - - - @@ -4324,12 +4349,6 @@ - - - - - - @@ -4948,12 +4967,6 @@ - - - - - - @@ -8550,6 +8563,11 @@ + + + + + diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c index 0d2102191389..9fb77b397528 100644 --- a/lib/libzfs/libzfs_config.c +++ b/lib/libzfs/libzfs_config.c @@ -109,6 +109,7 @@ namespace_reload(libzfs_handle_t *hdl) nvpair_t *elem; zfs_cmd_t zc = {"\0"}; void *cookie; + nvlist_t *nv_args = NULL; if (hdl->libzfs_ns_gen == 0) { /* @@ -129,21 +130,35 @@ namespace_reload(libzfs_handle_t *hdl) zcmd_alloc_dst_nvlist(hdl, &zc, 0); + + VERIFY0(nvlist_alloc(&nv_args, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_uint64(nv_args, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, nv_args); + for (;;) { zc.zc_cookie = hdl->libzfs_ns_gen; if (zfs_ioctl(hdl, ZFS_IOC_POOL_CONFIGS, &zc) != 0) { switch (errno) { + case EAGAIN: + return (zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, "Unable to get the " + "pool config lock in a reasonable amount of" + " time"))); case EEXIST: /* * The namespace hasn't changed. 
*/ zcmd_free_nvlists(&zc); return (0); - case ENOMEM: zcmd_expand_dst_nvlist(hdl, &zc); break; - + case ENOTSUP: + return (zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, + "ZPOOL_LOCK_BEHAVIOR=lockless requires" + " root"))); default: zcmd_free_nvlists(&zc); return (zfs_standard_error(hdl, errno, @@ -253,6 +268,7 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) int error; nvlist_t *config; libzfs_handle_t *hdl = zhp->zpool_hdl; + nvlist_t *nv_args = NULL; *missing = B_FALSE; (void) strcpy(zc.zc_name, zhp->zpool_name); @@ -262,9 +278,15 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size); + VERIFY0(nvlist_alloc(&nv_args, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_uint64(nv_args, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, nv_args); + for (;;) { - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, - &zc) == 0) { + int rc; + rc = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, &zc); + if (rc == 0) { /* * The real error is returned in the zc_cookie field. 
*/ @@ -276,7 +298,8 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); - if (errno == ENOENT || errno == EINVAL) + + if (errno == ENOENT || errno == EINVAL || rc) *missing = B_TRUE; zhp->zpool_state = POOL_STATE_UNAVAIL; return (0); diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h index 9053740ec2f0..cbc36eb871ec 100644 --- a/lib/libzfs/libzfs_impl.h +++ b/lib/libzfs/libzfs_impl.h @@ -73,6 +73,8 @@ struct libzfs_handle { uint64_t libzfs_max_nvlist; void *libfetch; char *libfetch_load_error; + + zpool_lock_behavior_t zpool_lock_behavior; }; struct zfs_handle { diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 17677d0f8608..bc8adaf05877 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -80,16 +80,15 @@ zpool_get_all_props(zpool_handle_t *zhp) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + nvlist_t *innvl = fnvlist_alloc(); if (zhp->zpool_n_propnames > 0) { - nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_string_array(innvl, ZPOOL_GET_PROPS_NAMES, zhp->zpool_propnames, zhp->zpool_n_propnames); - zcmd_write_src_nvlist(hdl, &zc, innvl); - fnvlist_free(innvl); } - - zcmd_alloc_dst_nvlist(hdl, &zc, 0); - + VERIFY0(nvlist_add_uint64(innvl, ZPOOL_CONFIG_LOCK_BEHAVIOR, + hdl->zpool_lock_behavior)); + zcmd_write_src_nvlist(hdl, &zc, innvl); + fnvlist_free(innvl); while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index e450d94add87..06682bd220b1 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1026,6 +1026,18 @@ libzfs_free_str_array(char **strs, int count) free(strs); } +/* + * Set spa_namespace locking behavior for zpools. For example you may want to + * error out if you can't acquire the lock, or skip the lock entirely (which + * can be dangerous). 
+ */ +void +libzfs_set_lock_behavior(libzfs_handle_t *hdl, + zpool_lock_behavior_t zpool_lock_behavior) +{ + hdl->zpool_lock_behavior = zpool_lock_behavior; +} + /* * Returns 1 if environment variable is set to "YES", "yes", "ON", "on", or * a non-zero number. @@ -1111,6 +1123,7 @@ libzfs_init(void) ftbl[SPA_FEATURE_LARGE_BLOCKS].fi_zfs_mod_supported = B_FALSE; } + libzfs_set_lock_behavior(hdl, ZPOOL_LOCK_BEHAVIOR_DEFAULT); return (hdl); } diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index e397fc851cc1..0389089621a1 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -238,6 +238,22 @@ mutex_exit(kmutex_t *mp) VERIFY0(pthread_mutex_unlock(&mp->m_lock)); } +/* + * Poor-man's version of Linux kernel's down_timeout(). Try to acquire a mutex + * for 'ns' number of nanoseconds. Returns 0 if mutex was acquired or ENOLCK + * if timeout occurred. + */ +int +mutex_enter_timeout(kmutex_t *mutex, uint64_t ns) +{ + hrtime_t end = gethrtime() + ns; + while (gethrtime() < end) { + if (mutex_tryenter(mutex)) + return (0); /* success */ + } + return (ENOLCK); +} + /* * ========================================================================= * rwlocks diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c index a297daedbd4d..b2b11f32af67 100644 --- a/lib/libzpool/util.c +++ b/lib/libzpool/util.c @@ -137,7 +137,8 @@ show_pool_stats(spa_t *spa) nvlist_t *config, *nvroot; const char *name; - VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0); + VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0, + ZPOOL_LOCK_BEHAVIOR_DEFAULT) == 0); VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index f5881d7faf59..54d64605b60b 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2673,8 +2673,35 @@ Defines zvol block devices behavior when .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint Enable strict ZVOL quota enforcement. 
The strict quota enforcement may have a performance impact. +.It Sy zfs_debug_namespace_lock_delay +WARNING, DEBUG USE ONLY! +If set, add an artificial +.Sy zfs_debug_namespace_lock_delay +milliseconds delay after taking the spa_namespace lock. +This is used to simulate a hung pool. +If +.Sy zfs_debug_namespace_lock_delay +is set to -1 then crash the kernel on the next spa_namespace lock hold. +If set to 0 (default) then take no action. +This option is used by the test suite. +.It Sy spa_namespace_trylock_ms +If the user has set +.Sy ZPOOL_LOCK_BEHAVIOR=lockless|trylock +then try for this many milliseconds to get the spa_namespace lock before +moving on. +.Sy zpool status +This is the internal lock that synchronizes changes to the pool configuration. +Note that +.Sy zpool status +will try to get the lock in multiple places (like getting the pool config, +stats, props) before returning. +That means the total wall clock time for +.Sy zpool status +could be a multiple of +.Sy spa_namespace_trylock_ms +if the lock is already held. +Default is 100 (milliseconds). .El -. .Sh ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations. The scheduler determines when and in what order those operations are issued. diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index b96944050594..fa8143315fdd 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -576,6 +576,49 @@ or by setting .Sy ZFS_VDEV_DEVID_OPT_OUT . .Pp +.It Sy ZPOOL_LOCK_BEHAVIOR +Change the +.Sy zpool status +pool lock acquisition behavior. +This can be useful for running +.Sy zpool status +even when the pool is hung. +Valid +.Sy ZPOOL_LOCK_BEHAVIOR +values are: +.Bl -tag -compact -offset 4n -width "lockless" +.It Sy trylock +Try for a short amount of time to get the pool config lock (spa_namespace lock). +If that doesn't work then simply abort +.Sy zpool status . +The amount of time to wait for the lock is controlled by the +.Sy spa_namespace_trylock_ms +module parameter. 
+.It Sy lockless +Same as +.Sy trylock +but if +.Sy zpool status +cannot get the lock, then read the status information locklessly. +.Sy WARNING: this is dangerous and can crash your system if pool configs are being modified while zpool status is running. +As a safeguard, +.Sy zpool status +with +.Sy ZPOOL_LOCK_BEHAVIOR=lockless +can only be run by root. +.Sy lockless +should only be used in emergency/debug situations. +.It Sy wait +Wait forever for the lock. +This is the default behavior. +.El +.Pp +If +.Sy ZPOOL_LOCK_BEHAVIOR +is set to any other value, or not set, then +.Sy zpool status +will wait forever for the pool config lock. +.Pp .It Sy ZPOOL_SCRIPTS_AS_ROOT Allow a privileged user to run .Nm zpool Cm status Ns / Ns Cm iostat Fl c . diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 66dfe629dc96..86862a2ded82 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -310,6 +310,27 @@ static int zfs_livelist_condense_zthr_cancel = 0; */ static int zfs_livelist_condense_new_alloc = 0; + +/* + * WARNING, FOR TEST USE ONLY! + * + * Wait zfs_debug_namespace_lock_delay milliseconds after taking the + * spa_namespace_lock to simulate lock contention. + * If set to -1, then *panic* to simulate a pool failure. + * If set to 0 then take no action (default) + */ +static int zfs_debug_namespace_lock_delay = 0; + +/* + * When zpool_lock_behavior is set to ZPOOL_LOCK_BEHAVIOR_TRYLOCK or + * ZPOOL_LOCK_BEHAVIOR_LOCKLESS, then try to acquire the spa_namespace_lock + * for this many milliseconds before moving on to the next step. + * + * Note that 'zpool status' will try to get the lock in multiple places (like + * getting the pool config, stats, props) before returning. + */ +unsigned int spa_namespace_trylock_ms = 100; + /* * ========================================================================== * SPA properties routines @@ -5794,8 +5815,9 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, * ambiguous state. 
*/ static int -spa_open_common(const char *pool, spa_t **spapp, const void *tag, - nvlist_t *nvpolicy, nvlist_t **config) +spa_open_common_impl(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config, + zpool_lock_behavior_t zpool_lock_behavior) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; @@ -5812,8 +5834,45 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { - mutex_enter(&spa_namespace_lock); - locked = B_TRUE; + + /* + * Special debug case: we've selected a pool for lockless + * operation + */ + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) || + (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK)) { + /* Make a cursory effort to acquire the lock in 100ms */ + if (mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)) != 0) { + if (zpool_lock_behavior == + ZPOOL_LOCK_BEHAVIOR_LOCKLESS) { + + /* + * Danger: we're doing spa_lookup() + * without holding the lock + */ + spa = spa_lookup_lockless(pool); + if (spa == NULL) { + return (SET_ERROR(ENOENT)); + } + + goto skip_the_locks; + } else { + /* + * We had ZPOOL_LOCK_BEHAVIOR_TRYLOCK + * set and cant get the lock in 100ms. + */ + return (SET_ERROR(EAGAIN)); + } + } else { + /* We got the lock from mutex_enter_timeout() */ + locked = B_TRUE; + } + } else { + /* Normal case of getting the lock */ + mutex_enter(&spa_namespace_lock); + locked = B_TRUE; + } } if ((spa = spa_lookup(pool)) == NULL) { @@ -5822,6 +5881,19 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (SET_ERROR(ENOENT)); } + /* + * Simulate holding the spa_namespace lock for a long time, or forcing + * a crash with the lock held. This is for testing purposes only. 
+ */ + if (zfs_debug_namespace_lock_delay == -1) { +#if _KERNEL && defined(__linux__) + BUG(); +#endif + } else if (zfs_debug_namespace_lock_delay > 0) { + zfs_sleep_until(gethrtime() + + MSEC2NSEC(zfs_debug_namespace_lock_delay)); + } + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; @@ -5881,10 +5953,12 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, } } +skip_the_locks: spa_open_ref(spa, tag); if (config != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + *config = spa_config_generate_lock_behavior(spa, NULL, -1ULL, + B_TRUE, zpool_lock_behavior); /* * If we've recovered the pool, pass back any information we @@ -5910,6 +5984,23 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, return (0); } +static int +spa_open_common(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config) +{ + return (spa_open_common_impl(pool, spapp, tag, nvpolicy, config, + ZPOOL_LOCK_BEHAVIOR_DEFAULT)); +} + +int +spa_open_common_lock_behavior(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config, zpool_lock_behavior_t + zpool_lock_behavior) +{ + return (spa_open_common_impl(pool, spapp, tag, nvpolicy, config, + zpool_lock_behavior)); +} + int spa_open_rewind(const char *name, spa_t **spapp, const void *tag, nvlist_t *policy, nvlist_t **config) @@ -6143,13 +6234,18 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config) int spa_get_stats(const char *name, nvlist_t **config, - char *altroot, size_t buflen) + char *altroot, size_t buflen, zpool_lock_behavior_t zpool_lock_behavior) { int error; spa_t *spa; *config = NULL; - error = spa_open_common(name, &spa, FTAG, NULL, config); + + error = spa_open_common_lock_behavior(name, &spa, FTAG, NULL, config, + zpool_lock_behavior); + if (error != 0) { + return (error); + } if (spa != NULL) { /* @@ -11080,6 +11176,13 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow 
importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); +ZFS_MODULE_PARAM(zfs, zfs_, debug_namespace_lock_delay, INT, ZMOD_RW, + "Do not use - for ZFS test suite only."); + +ZFS_MODULE_PARAM(zfs_spa, spa_, namespace_trylock_ms, UINT, ZMOD_RW, + "Try for this many milliseconds to get the spa_namespace lock when " + "running 'zpool status --trylock|--lockless'."); + ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index de53de114979..6f5045e5cf52 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -69,6 +69,7 @@ static uint64_t spa_config_generation = 1; * userland pools when doing testing. */ char *spa_config_path = (char *)ZPOOL_CACHE; + #ifdef _KERNEL static int zfs_autoimport_disable = B_TRUE; #endif @@ -369,19 +370,42 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, * information for all pool visible within the zone. 
*/ int -spa_all_configs(uint64_t *generation, nvlist_t **pools) +spa_all_configs(uint64_t *generation, nvlist_t **pools, zpool_lock_behavior_t + zpool_lock_behavior) { + int error = 0; spa_t *spa = NULL; + boolean_t use_lock = B_TRUE; if (*generation == spa_config_generation) return (SET_ERROR(EEXIST)); - int error = mutex_enter_interruptible(&spa_namespace_lock); - if (error) - return (SET_ERROR(EINTR)); + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_WAIT) { + error = mutex_enter_interruptible(&spa_namespace_lock); + } else { + error = mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)); + } + if (error) { + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK) + return (SET_ERROR(EAGAIN)); + else if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) + use_lock = B_FALSE; + else + return (SET_ERROR(EINTR)); + } *pools = fnvlist_alloc(); - while ((spa = spa_next(spa)) != NULL) { + + do { + if (use_lock) + spa = spa_next(spa); + else + spa = spa_next_lockless(spa); + + if (spa == NULL) + break; + if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); @@ -389,9 +413,11 @@ spa_all_configs(uint64_t *generation, nvlist_t **pools) spa->spa_config); mutex_exit(&spa->spa_props_lock); } - } + } while (1); *generation = spa_config_generation; - mutex_exit(&spa_namespace_lock); + + if (use_lock) + mutex_exit(&spa_namespace_lock); return (0); } @@ -412,8 +438,9 @@ spa_config_set(spa_t *spa, nvlist_t *config) * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. 
*/ -nvlist_t * -spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +static nvlist_t * +spa_config_generate_impl(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats, + zpool_lock_behavior_t zpool_lock_behavior) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; @@ -421,15 +448,26 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) boolean_t locked = B_FALSE; uint64_t split_guid; const char *pool_name; + boolean_t skip_locks = B_FALSE; + + /* Special debug case: we've selected a pool for lockless operation */ + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) { + skip_locks = B_TRUE; + } if (vd == NULL) { vd = rvd; locked = B_TRUE; - spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + + if (!skip_locks) + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, + RW_READER); } - ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == - (SCL_CONFIG | SCL_STATE)); + if (!skip_locks) + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, + RW_READER) == (SCL_CONFIG | SCL_STATE)); + /* * If txg is -1, report the current value of spa->spa_config_txg. 
@@ -552,12 +590,27 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) kmem_free(dds, sizeof (ddt_stat_t)); } - if (locked) + if (locked && !skip_locks) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } +nvlist_t * +spa_config_generate_lock_behavior(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats, zpool_lock_behavior_t zpool_lock_behavior) +{ + return (spa_config_generate_impl(spa, vd, txg, getstats, + zpool_lock_behavior)); +} + +nvlist_t * +spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +{ + return (spa_config_generate_impl(spa, vd, txg, getstats, + ZPOOL_LOCK_BEHAVIOR_DEFAULT)); +} + /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index f2b0297787f4..9847ce86f67c 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -608,21 +608,14 @@ spa_config_held(spa_t *spa, int locks, krw_t rw) * ========================================================================== */ -/* - * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. - * Returns NULL if no matching spa_t is found. - */ spa_t * -spa_lookup(const char *name) +spa_lookup_lockless(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ - spa_t *spa; avl_index_t where; + spa_t *spa; char *cp; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - -retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -634,7 +627,24 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); - if (spa == NULL) + + return (spa); +} + +/* + * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. + * Returns NULL if no matching spa_t is found. 
+ */ +spa_t * +spa_lookup(const char *name) +{ + spa_t *spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + +retry: + spa = spa_lookup_lockless(name); + if (!spa) return (NULL); /* @@ -842,7 +852,6 @@ spa_remove(spa_t *spa) ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); - avl_remove(&spa_namespace_avl, spa); if (spa->spa_root) @@ -907,6 +916,15 @@ spa_remove(spa_t *spa) kmem_free(spa, sizeof (spa_t)); } +spa_t * +spa_next_lockless(spa_t *prev) +{ + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. @@ -915,11 +933,7 @@ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - if (prev) - return (AVL_NEXT(&spa_namespace_avl, prev)); - else - return (avl_first(&spa_namespace_avl)); + return (spa_next_lockless(prev)); } /* @@ -1583,8 +1597,9 @@ spa_load_guid_exists(uint64_t guid) ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { - if (spa_load_guid(spa) == guid) + if (spa_load_guid(spa) == guid) { return (B_TRUE); + } } return (arc_async_flush_guid_inuse(guid)); @@ -3033,6 +3048,7 @@ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); +EXPORT_SYMBOL(spa_next_lockless); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); @@ -3160,6 +3176,3 @@ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, "Number of allocators per spa"); - -ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, - "Minimum number of CPUs per allocators"); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index c37d8a1c07e1..1eb5ca20d23c 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1585,13 +1585,82 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) return 
(error); } +/* + * Given a zc with only ZPOOL_CONFIG_LOCK_BEHAVIOR set in its ioctl source + * nvlist, OR the source nvlist itself (innvl), extract and return the + * zpool_lock_behavior_t value. Additionally, if the user is trying to use + * lockless behavior, verify that the user has the correct permissions. + * + * NOTE: Only one of 'zc' or 'innvl' should be set, not both. + * + * Return value: + * + * If zc and innvl are NULL, then return ZPOOL_LOCK_BEHAVIOR_DEFAULT. + * + * If the user has permission to do the ZPOOL_CONFIG_LOCK_BEHAVIOR specified + * then return the correct zpool_lock_behavior_t. + * + * If ZPOOL_CONFIG_LOCK_BEHAVIOR is not specified then return + * ZPOOL_LOCK_BEHAVIOR_DEFAULT. + * + * If the user is has specified ZPOOL_LOCK_BEHAVIOR_LOCKLESS and does not have + * permission to do so, then return -EPERM. + */ +static int +zfs_get_zpool_lock_behavior_helper(zfs_cmd_t *zc, nvlist_t *innvl) +{ + int error = 0; + uint64_t zpool_lock_behavior = ZPOOL_LOCK_BEHAVIOR_DEFAULT; + nvlist_t *nvl = NULL; /* args being passed in */ + + /* Only 'zc' or 'innvl' should be set (or neither), not both */ + ASSERT(!(zc != NULL && innvl != NULL)); + if (zc) { + /* + * Our zfs_cmd_t can include an nvlist with optional input + * arguments. 
+ */ + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl); + } else { + if (nvlist_dup(innvl, &nvl, KM_SLEEP) != 0) + return (ZPOOL_LOCK_BEHAVIOR_DEFAULT); + } + + if (error == 0) { + if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_LOCK_BEHAVIOR, + &zpool_lock_behavior) == 0) { + if (zpool_lock_behavior >= ZPOOL_LOCK_BEHAVIOR_END) { + nvlist_free(nvl); + /* Unrecognized zpool_lock_behavior value */ + return (ZPOOL_LOCK_BEHAVIOR_DEFAULT); + } + } + nvlist_free(nvl); + } + + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS) + if (secpolicy_sys_config(CRED(), B_FALSE) != 0) + return (-EPERM); + + return ((int)zpool_lock_behavior); +} + static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; + int zpool_lock_behavior; + + zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(zc, NULL); + if (zpool_lock_behavior == -EPERM) { + zc->zc_cookie = EPERM; + return (EPERM); + } + + error = spa_all_configs(&zc->zc_cookie, &configs, zpool_lock_behavior); - error = spa_all_configs(&zc->zc_cookie, &configs); if (error) return (error); @@ -1617,9 +1686,16 @@ zfs_ioc_pool_stats(zfs_cmd_t *zc) nvlist_t *config; int error; int ret = 0; + zpool_lock_behavior_t zpool_lock_behavior; + + zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(zc, NULL); + if (zpool_lock_behavior == -EPERM) { + zc->zc_cookie = EPERM; + return (EPERM); + } error = spa_get_stats(zc->zc_name, &config, zc->zc_value, - sizeof (zc->zc_value)); + sizeof (zc->zc_value), zpool_lock_behavior); if (config != NULL) { ret = put_nvlist(zc, config); @@ -3118,6 +3194,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) static const zfs_ioc_key_t zfs_keys_get_props[] = { { ZPOOL_GET_PROPS_NAMES, DATA_TYPE_STRING_ARRAY, ZK_OPTIONAL }, + { ZPOOL_CONFIG_LOCK_BEHAVIOR, DATA_TYPE_UINT64, ZK_OPTIONAL }, }; static int @@ -3127,26 +3204,59 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) char **props = NULL; unsigned int n_props = 0; int error; + 
int rc; + uint64_t zpool_lock_behavior; + boolean_t is_locked; if (nvlist_lookup_string_array(innvl, ZPOOL_GET_PROPS_NAMES, &props, &n_props) != 0) { props = NULL; } - if ((error = spa_open(pool, &spa, FTAG)) != 0) { + zpool_lock_behavior = zfs_get_zpool_lock_behavior_helper(NULL, innvl); + if (zpool_lock_behavior == -EPERM) { + return (EPERM); + } + + if ((error = spa_open_common_lock_behavior(pool, &spa, FTAG, NULL, NULL, + zpool_lock_behavior)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { + + is_locked = B_FALSE; + if ((zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK) || + (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_LOCKLESS)) { + rc = mutex_enter_timeout(&spa_namespace_lock, + MSEC2NSEC(spa_namespace_trylock_ms)); + if (rc == 0) + is_locked = B_TRUE; + } else { + mutex_enter(&spa_namespace_lock); + is_locked = B_TRUE; + } + + if (zpool_lock_behavior == ZPOOL_LOCK_BEHAVIOR_TRYLOCK && + !is_locked) { + return (EAGAIN); + } + + if (is_locked) + spa = spa_lookup(pool); + else + spa = spa_lookup_lockless(pool); + + if (spa != NULL) { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, outnvl); } - mutex_exit(&spa_namespace_lock); + + if (is_locked) + mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 9373b39a184a..bf5781d2fc6a 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -560,7 +560,7 @@ tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', 'zpool_status_007_pos', 'zpool_status_008_pos', - 'zpool_status_features_001_pos'] + 'zpool_status_features_001_pos', 
'zpool_status_lockless'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 8bffe9d8240c..3898ad80494a 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3885,4 +3885,14 @@ function pop_coredump_pattern esac } +# Get the Unix timestamp as expressed in milliseconds. This can be useful for +# measuring how long something takes. This works on both Linux and +# FreeBSD. +function get_unix_timestamp_in_ms +{ + # We can't just use 'date +%s%N' here since FreeBSD 'date' only supports + # one-second resolution. + python3 -c 'import time; print(round(time.time() * 1000))' +} + . ${STF_SUITE}/include/kstat.shlib diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 79dc64ad9350..f18bc9c0f23f 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -39,6 +39,7 @@ DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_secon DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms +DEBUG_NAMESPACE_LOCK_DELAY debug_namespace_lock_delay zfs_debug_namespace_lock_delay DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms @@ -87,6 +88,7 @@ SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata +SPA_NAMESPACE_TRYLOCK_MS spa.namespace_trylock_ms spa_namespace_trylock_ms TRIM_EXTENT_BYTES_MIN 
trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index db1ef0d03aaf..e6071f50d409 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1273,6 +1273,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_lockless.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh new file mode 100755 index 000000000000..1971432ee511 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_lockless.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'ZPOOL_LOCK_BEHAVIOR=lockless|trylock|wait' works correctly with +# zpool status. 
+ +DISK=$TEST_BASE_DIR/file-vdev-lockless +ZFS_USER=zfs_user +ZFS_GROUP=zfs_group + +function cleanup +{ + unset ZPOOL_LOCK_BEHAVIOR + log_must restore_tunable DEBUG_NAMESPACE_LOCK_DELAY + log_must restore_tunable SPA_NAMESPACE_TRYLOCK_MS + log_must del_user $ZFS_USER + log_must del_group $ZFS_GROUP + log_must zpool destroy testpool2 + log_must rm -f "$DISK" +} + +log_assert "Verify ZPOOL_LOCK_BEHAVIOR values with zpool status" + +log_onexit cleanup + +log_must save_tunable DEBUG_NAMESPACE_LOCK_DELAY +log_must save_tunable SPA_NAMESPACE_TRYLOCK_MS + +log_must add_group $ZFS_GROUP +log_must add_user $ZFS_GROUP $ZFS_USER + +log_mustnot test -e $DISK +log_must truncate -s 100M $DISK +log_must zpool create testpool2 $DISK + +# Normal, safe, zpool status variants. These should all work. +log_must user_run $ZFS_USER zpool status +export ZPOOL_LOCK_BEHAVIOR="" +log_must user_run $ZFS_USER zpool status +export ZPOOL_LOCK_BEHAVIOR=wait +log_must user_run $ZFS_USER "zpool status" +export ZPOOL_LOCK_BEHAVIOR=trylock +log_must user_run $ZFS_USER "zpool status" + +# We should not have permission as an ordinary user to run lockless +export ZPOOL_LOCK_BEHAVIOR=lockless +log_mustnot user_run $ZFS_USER "zpool status" +log_must zpool status +export ZPOOL_LOCK_BEHAVIOR="" + +# Add an artificial 500ms delay after taking the spa_namespace lock +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 500 +zpool status & +sleep 0.1 + +# This should fail since the previously spawned off 'zpool status' is holding +# the lock for at least 500ms, and "trylock" only tries for 100ms. +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 100 +export ZPOOL_LOCK_BEHAVIOR=trylock +log_mustnot user_run $ZFS_USER "zpool status" +wait + +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 100 +zpool status & +sleep 0.05 + +# This should succeed since the previously spawned off 'zpool status' is holding +# the lock for only 100ms, and "trylock" tries for 300ms. 
+set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 300 +export ZPOOL_LOCK_BEHAVIOR=trylock +log_must user_run $ZFS_USER "zpool status" +wait + +# Now we artificially hold the lock for 500ms, and check that "lockless" +# returns in a shorter amount of time, and without error. +set_tunable32 DEBUG_NAMESPACE_LOCK_DELAY 500 +set_tunable32 SPA_NAMESPACE_TRYLOCK_MS 10 +zpool status & +sleep 0.05 +before=$(get_unix_timestamp_in_ms) +export ZPOOL_LOCK_BEHAVIOR=lockless +log_must zpool status +after=$(get_unix_timestamp_in_ms) +d=$(($after - $before)) +wait +log_note "'lockless' zpool status took $d milliseconds" +log_must test $d -le 300 + +log_pass "ZPOOL_LOCK_BEHAVIOR=lockless|trylock|wait zpool status works"