diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index c8deb5be419e..4c34599369b4 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -385,9 +385,23 @@ typedef enum {
 	VDEV_PROP_TRIM_SUPPORT,
 	VDEV_PROP_TRIM_ERRORS,
 	VDEV_PROP_SLOW_IOS,
+	VDEV_PROP_SCHEDULER,
 	VDEV_NUM_PROPS
 } vdev_prop_t;
 
+/*
+ * Different scheduling behaviors for vdev prop io_scheduler.
+ * VDEV_SCHEDULER_AUTO = Don't queue if vdev is nonrot and backed by blkdev,
+ * queue otherwise.
+ * VDEV_SCHEDULER_CLASSIC = Always queue.
+ * VDEV_SCHEDULER_NONE = Never queue.
+ */
+typedef enum {
+	VDEV_SCHEDULER_AUTO,
+	VDEV_SCHEDULER_CLASSIC,
+	VDEV_SCHEDULER_NONE
+} vdev_scheduler_type_t;
+
 /*
  * Dataset property functions shared between libzfs and kernel.
  */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index c925eb490cd3..3b699f9622cb 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -423,6 +423,7 @@ struct vdev {
 	boolean_t	vdev_resilver_deferred;	/* resilver deferred */
 	boolean_t	vdev_kobj_flag;		/* kobj event record */
 	boolean_t	vdev_attaching;		/* vdev attach ashift handling */
+	boolean_t	vdev_is_blkdev;		/* vdev is backed by block device */
 	vdev_queue_t	vdev_queue;		/* I/O deadline schedule queue */
 	spa_aux_vdev_t	*vdev_aux;		/* for l2cache and spares vdevs */
 	zio_t		*vdev_probe_zio;	/* root of current probe */
@@ -466,6 +467,7 @@ struct vdev {
 	uint64_t	vdev_io_t;
 	uint64_t	vdev_slow_io_n;
 	uint64_t	vdev_slow_io_t;
+	uint64_t	vdev_scheduler;	/* control how I/Os are submitted */
 };
 
 #define	VDEV_PAD_SIZE		(8 << 10)
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 0c3e8106ca6d..01dde456ae2b 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -6116,7 +6116,8 @@
-
+
+
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index acabe6b6613a..975aeb33d9ae 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -157,6 +157,22 @@ If this device should perform new allocations, used to disable a device
 when it is scheduled for later removal.
 See
 .Xr zpool-remove 8 .
+.It Sy io_scheduler Ns = Ns Sy auto Ns | Ns Sy classic Ns | Ns Sy none
+Controls how I/O requests are added to the vdev queue when reading from or
+writing to this vdev.
+.It Sy auto
+Does not add I/O requests to the vdev queue if the vdev is backed by a
+non-rotational block device.
+This can sometimes improve performance for Direct I/O.
+I/O requests will be added to the vdev queue if the vdev is backed by a
+rotational block device or file.
+This is the default behavior.
+.It Sy classic
+Always adds I/O requests to the vdev queue.
+.It Sy none
+Never adds I/O requests to the vdev queue.
+This is not recommended for vdevs backed by spinning disks as it could
+result in starvation.
 .El
 .Ss User Properties
 In addition to the standard native properties, ZFS supports arbitrary user
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index c8ab7cc7cf8e..a3efe344cbfa 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -968,6 +968,9 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
 	else
 		vd->vdev_nonrot = B_FALSE;
 
+	/* Is backed by a block device. */
+	vd->vdev_is_blkdev = B_TRUE;
+
 	/* Set when device reports it supports TRIM. */
 	error = g_getattr("GEOM::candelete", cp, &has_trim);
 	vd->vdev_has_trim = (error == 0 && has_trim);
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 154ca22d9513..0b7296f80b8e 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -447,6 +447,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	/* Inform the ZIO pipeline that we are non-rotational */
 	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
 
+	/* Is backed by a block device. */
+	v->vdev_is_blkdev = B_TRUE;
+
 	/* Physical volume size in bytes for the partition */
 	*psize = bdev_capacity(bdev);
 
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index 04ae9f986d8f..9badd2bf77b9 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -326,6 +326,13 @@ vdev_prop_init(void)
 		{ NULL }
 	};
 
+	static const zprop_index_t vdevschedulertype_table[] = {
+		{ "auto",	VDEV_SCHEDULER_AUTO },
+		{ "classic",	VDEV_SCHEDULER_CLASSIC },
+		{ "none",	VDEV_SCHEDULER_NONE },
+		{ NULL }
+	};
+
 	struct zfs_mod_supported_features *sfeatures =
 	    zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);
 
@@ -470,6 +477,10 @@ vdev_prop_init(void)
 	zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
 	    PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
 	    boolean_table, sfeatures);
+	zprop_register_index(VDEV_PROP_SCHEDULER, "scheduler",
+	    VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV,
+	    "auto | classic | none", "IO_SCHEDULER", vdevschedulertype_table,
+	    sfeatures);
 
 	/* default index properties */
 	zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index aa4038a7526f..c1d2a05ccd8d 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -722,6 +722,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
 	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
 
+	vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER);
+
 	list_link_init(&vd->vdev_config_dirty_node);
 	list_link_init(&vd->vdev_state_dirty_node);
 	list_link_init(&vd->vdev_initialize_node);
@@ -3890,6 +3892,12 @@ vdev_load(vdev_t *vd)
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+		error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER,
+		    &vd->vdev_scheduler);
+		if (error && error != ENOENT)
+			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 	}
 
 	/*
@@ -6125,6 +6133,15 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 			}
 			vd->vdev_slow_io_t = intval;
 			break;
+		case VDEV_PROP_SCHEDULER:
+			if (nvpair_value_uint64(elem, &intval) != 0) {
+				error = EINVAL;
+				break;
+			}
+			if (vd->vdev_ops->vdev_op_leaf) {
+				vd->vdev_scheduler = intval;
+			}
+			break;
 		default:
 			/* Most processing is done in vdev_props_set_sync */
 			break;
@@ -6488,6 +6505,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 		case VDEV_PROP_IO_T:
 		case VDEV_PROP_SLOW_IO_N:
 		case VDEV_PROP_SLOW_IO_T:
+		case VDEV_PROP_SCHEDULER:
 			err = vdev_prop_get_int(vd, prop, &intval);
 			if (err && err != ENOENT)
 				break;
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
index f457669bc809..79eb62a0e34c 100644
--- a/module/zfs/vdev_file.c
+++ b/module/zfs/vdev_file.c
@@ -109,6 +109,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
 	 */
 	vd->vdev_nonrot = B_TRUE;
 
+	/* Is not backed by a block device. */
+	vd->vdev_is_blkdev = B_FALSE;
+
 	/*
 	 * Allow TRIM on file based vdevs.  This may not always be supported,
 	 * since it depends on your kernel version and underlying filesystem
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index aa41f7066036..3b7617567351 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -879,6 +879,38 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
 	return (zio);
 }
 
+static boolean_t
+vdev_should_queue_zio(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	boolean_t should_queue = B_TRUE;
+
+	/*
+	 * Add zio with ZIO_FLAG_NODATA to the queue as the bypass code
+	 * currently does not handle certain cases (gang abd, raidz
+	 * write aggregation).
+	 */
+	if (zio->io_flags & ZIO_FLAG_NODATA)
+		return (B_TRUE);
+
+	switch (vd->vdev_scheduler) {
+	case VDEV_SCHEDULER_AUTO:
+		if (vd->vdev_nonrot && vd->vdev_is_blkdev)
+			should_queue = B_FALSE;
+		break;
+	case VDEV_SCHEDULER_CLASSIC:
+		should_queue = B_TRUE;
+		break;
+	case VDEV_SCHEDULER_NONE:
+		should_queue = B_FALSE;
+		break;
+	default:
+		should_queue = B_TRUE;
+		break;
+	}
+	return (should_queue);
+}
+
 zio_t *
 vdev_queue_io(zio_t *zio)
 {
@@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio)
 		zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
 	zio->io_timestamp = gethrtime();
 
+	if (!vdev_should_queue_zio(zio)) {
+		zio->io_queue_state = ZIO_QS_NONE;
+		return (zio);
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_io_add(vq, zio);
 	nio = vdev_queue_io_to_issue(vq);
@@ -954,6 +991,10 @@ vdev_queue_io_done(zio_t *zio)
 	vq->vq_io_complete_ts = now;
 	vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
 
+	if (zio->io_queue_state == ZIO_QS_NONE) {
+		return;
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_pending_remove(vq, zio);
 
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 7cc7a3cf94f4..7d9fd722def3 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -552,7 +552,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
 tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
     'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
     'user_property_001_pos', 'user_property_002_neg',
-    'zpool_set_clear_userprop']
+    'zpool_set_clear_userprop', 'vdev_set_io_scheduler']
 tags = ['functional', 'cli_root', 'zpool_set']
 
 [tests/functional/cli_root/zpool_split]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 388a4160736a..5263cd5dbc31 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1252,6 +1252,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/cli_root/zpool_set/setup.ksh \
 	functional/cli_root/zpool/setup.ksh \
 	functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
+	functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh \
 	functional/cli_root/zpool_set/zpool_set_common.kshlib \
 	functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
 	functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
index ccb5e9c15809..f71be2dc66f8 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
@@ -76,4 +76,5 @@ typeset -a properties=(
     trim_support
     trim_errors
     slow_ios
+    scheduler
 )
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh
new file mode 100755
index 000000000000..d3ef66d56d01
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh
@@ -0,0 +1,96 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 by Triad National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Setting vdev io_scheduler property while reading from vdev should not cause panic.
+#
+# STRATEGY:
+# 1. Create a zpool
+# 2. Write a file to the pool.
+# 3. Start reading from file, while also setting the io_scheduler property.
+#
+
+verify_runnable "global"
+
+command -v fio > /dev/null || log_unsupported "fio missing"
+log_must save_tunable DIO_ENABLED
+log_must set_tunable32 DIO_ENABLED 1
+
+function set_io_scheduler
+{
+	zpool set scheduler=none $TESTPOOL1 $FILEDEV
+	sleep 0.1
+	zpool set scheduler=classic $TESTPOOL1 $FILEDEV
+	sleep 0.1
+}
+
+function cleanup
+{
+	destroy_pool $TESTPOOL1
+	log_must rm -f $FILEDEV
+	log_must restore_tunable DIO_ENABLED
+}
+
+log_assert "Toggling vdev scheduler property while reading from vdev should not cause panic"
+log_onexit cleanup
+
+# 1. Create a pool
+
+FILEDEV="$TEST_BASE_DIR/filedev.$$"
+log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV
+create_pool $TESTPOOL1 $FILEDEV
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1)
+
+# 2. Write a file to the pool, while also setting the io_scheduler property.
+
+log_must eval "fio --filename=$mntpnt/foobar --name=write-file \
+	--rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
+	--ioengine=sync --time_based --runtime=10 &"
+
+ITERATIONS=30
+
+for i in $(seq $ITERATIONS); do
+	log_must set_io_scheduler
+done;
+wait
+
+# 3. Start reading from file, while also setting the io_scheduler property.
+
+log_must eval "fio --filename=$mntpnt/foobar --name=read-file \
+	--rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
+	--ioengine=sync --time_based --runtime=10 &"
+
+for i in $(seq $ITERATIONS); do
+	log_must set_io_scheduler
+done;
+wait
+
+log_pass "Setting vdev scheduler property while reading from vdev does not cause panic"
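
For illustration only (not part of the patch): a minimal usage sketch of the new vdev property registered above as "scheduler". The pool name "tank" and leaf vdev name "sda" are placeholders; the commands follow the same zpool set/get syntax used by the test script.

	# Never queue I/O on this vdev (the behavior "auto" already selects
	# when the vdev is a non-rotational block device).
	zpool set scheduler=none tank sda

	# Always queue, i.e. the pre-patch behavior.
	zpool set scheduler=classic tank sda

	# Read the current value back; the default is "auto".
	zpool get scheduler tank sda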