diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index c8deb5be419e..4c34599369b4 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -385,9 +385,23 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
+ VDEV_PROP_SCHEDULER,
VDEV_NUM_PROPS
} vdev_prop_t;
+/*
+ * Scheduling behaviors for the vdev property "scheduler".
+ * VDEV_SCHEDULER_AUTO = Don't queue if the vdev is non-rotational and backed
+ * by a block device; queue otherwise.
+ * VDEV_SCHEDULER_CLASSIC = Always queue.
+ * VDEV_SCHEDULER_NONE = Never queue.
+ */
+typedef enum {
+ VDEV_SCHEDULER_AUTO,
+ VDEV_SCHEDULER_CLASSIC,
+ VDEV_SCHEDULER_NONE
+} vdev_scheduler_type_t;
+
/*
* Dataset property functions shared between libzfs and kernel.
*/
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index c925eb490cd3..3b699f9622cb 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -423,6 +423,7 @@ struct vdev {
boolean_t vdev_resilver_deferred; /* resilver deferred */
boolean_t vdev_kobj_flag; /* kobj event record */
boolean_t vdev_attaching; /* vdev attach ashift handling */
+ boolean_t vdev_is_blkdev; /* vdev is backed by block device */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
@@ -466,6 +467,7 @@ struct vdev {
uint64_t vdev_io_t;
uint64_t vdev_slow_io_n;
uint64_t vdev_slow_io_t;
+ uint64_t vdev_scheduler; /* how I/Os are submitted */
};
#define VDEV_PAD_SIZE (8 << 10)
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 0c3e8106ca6d..01dde456ae2b 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -6116,7 +6116,8 @@
-
+
+
diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7
index acabe6b6613a..975aeb33d9ae 100644
--- a/man/man7/vdevprops.7
+++ b/man/man7/vdevprops.7
@@ -157,6 +157,22 @@ If this device should perform new allocations, used to disable a device
when it is scheduled for later removal.
See
.Xr zpool-remove 8 .
+.It Sy scheduler Ns = Ns Sy auto Ns | Ns Sy classic Ns | Ns Sy none
+Controls how I/O requests are added to the vdev queue when reading from or
+writing to this vdev.
+.Bl -tag -compact -width classic
+.It Sy auto
+I/O requests are not added to the vdev queue if the vdev is backed by a
+non-rotational block device, which can improve performance for direct I/O.
+Requests are still queued if the vdev is backed by a rotational block device
+or by a file.
+This is the default behavior.
+.It Sy classic
+I/O requests are always added to the vdev queue.
+.It Sy none
+I/O requests are never added to the vdev queue.
+This is not recommended for vdevs backed by rotational disks, as it can lead
+to starvation.
+.El
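+For example, to bypass the queue on a leaf vdev (using a hypothetical pool and
+disk name) and later restore the default behavior:
+.Dl # zpool set scheduler=none tank sda
+.Dl # zpool set scheduler=auto tank sda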
.El
.Ss User Properties
In addition to the standard native properties, ZFS supports arbitrary user
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index c8ab7cc7cf8e..a3efe344cbfa 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -968,6 +968,9 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
else
vd->vdev_nonrot = B_FALSE;
+ /* Is backed by a block device. */
+ vd->vdev_is_blkdev = B_TRUE;
+
/* Set when device reports it supports TRIM. */
error = g_getattr("GEOM::candelete", cp, &has_trim);
vd->vdev_has_trim = (error == 0 && has_trim);
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 154ca22d9513..0b7296f80b8e 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -447,6 +447,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
/* Inform the ZIO pipeline that we are non-rotational */
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
+ /* Is backed by a block device. */
+ v->vdev_is_blkdev = B_TRUE;
+
/* Physical volume size in bytes for the partition */
*psize = bdev_capacity(bdev);
diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c
index 04ae9f986d8f..9badd2bf77b9 100644
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@@ -326,6 +326,13 @@ vdev_prop_init(void)
{ NULL }
};
+ static const zprop_index_t vdevschedulertype_table[] = {
+ { "auto", VDEV_SCHEDULER_AUTO },
+ { "classic", VDEV_SCHEDULER_CLASSIC },
+ { "none", VDEV_SCHEDULER_NONE },
+ { NULL }
+ };
+
struct zfs_mod_supported_features *sfeatures =
zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);
@@ -470,6 +477,10 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
+ zprop_register_index(VDEV_PROP_SCHEDULER, "scheduler",
+ VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV,
+ "auto | classic | none", "IO_SCHEDULER", vdevschedulertype_table,
+ sfeatures);
/* default index properties */
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index aa4038a7526f..c1d2a05ccd8d 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -722,6 +722,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
+ vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER);
+
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
@@ -3890,6 +3892,12 @@ vdev_load(vdev_t *vd)
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER,
+ &vd->vdev_scheduler);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
}
/*
@@ -6125,6 +6133,15 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_slow_io_t = intval;
break;
+ case VDEV_PROP_SCHEDULER:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
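+ /* Only update the in-core value on leaf vdevs. */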
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vd->vdev_scheduler = intval;
+ }
+ break;
default:
/* Most processing is done in vdev_props_set_sync */
break;
@@ -6488,6 +6505,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_IO_T:
case VDEV_PROP_SLOW_IO_N:
case VDEV_PROP_SLOW_IO_T:
+ case VDEV_PROP_SCHEDULER:
err = vdev_prop_get_int(vd, prop, &intval);
if (err && err != ENOENT)
break;
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
index f457669bc809..79eb62a0e34c 100644
--- a/module/zfs/vdev_file.c
+++ b/module/zfs/vdev_file.c
@@ -109,6 +109,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
*/
vd->vdev_nonrot = B_TRUE;
+ /* Is not backed by a block device. */
+ vd->vdev_is_blkdev = B_FALSE;
+
/*
* Allow TRIM on file based vdevs. This may not always be supported,
* since it depends on your kernel version and underlying filesystem
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index aa41f7066036..3b7617567351 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -879,6 +879,38 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
return (zio);
}
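+/*
+ * Decide whether a zio should go through the vdev queue or be submitted
+ * directly, based on the vdev's "scheduler" property.
+ */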
+static boolean_t
+vdev_should_queue_zio(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ boolean_t should_queue = B_TRUE;
+
+ /*
+ * Always queue zios with ZIO_FLAG_NODATA, since the bypass path does
+ * not currently handle certain cases (gang ABDs, raidz write
+ * aggregation).
+ */
+ if (zio->io_flags & ZIO_FLAG_NODATA)
+ return (B_TRUE);
+
+ switch (vd->vdev_scheduler) {
+ case VDEV_SCHEDULER_AUTO:
+ if (vd->vdev_nonrot && vd->vdev_is_blkdev)
+ should_queue = B_FALSE;
+ break;
+ case VDEV_SCHEDULER_CLASSIC:
+ should_queue = B_TRUE;
+ break;
+ case VDEV_SCHEDULER_NONE:
+ should_queue = B_FALSE;
+ break;
+ default:
+ should_queue = B_TRUE;
+ break;
+ }
+ return (should_queue);
+}
+
zio_t *
vdev_queue_io(zio_t *zio)
{
@@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio)
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();
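+ /*
+ * If this vdev bypasses the queue, mark the zio as unqueued so that
+ * vdev_queue_io_done() knows there is nothing to remove.
+ */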
+ if (!vdev_should_queue_zio(zio)) {
+ zio->io_queue_state = ZIO_QS_NONE;
+ return (zio);
+ }
+
mutex_enter(&vq->vq_lock);
vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq);
@@ -954,6 +991,10 @@ vdev_queue_io_done(zio_t *zio)
vq->vq_io_complete_ts = now;
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
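+ /*
+ * Zios that bypassed the queue were never added to it, so there is no
+ * pending entry to remove and no queued I/O to issue here.
+ */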
+ if (zio->io_queue_state == ZIO_QS_NONE) {
+ return;
+ }
+
mutex_enter(&vq->vq_lock);
vdev_queue_pending_remove(vq, zio);
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 7cc7a3cf94f4..7d9fd722def3 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -552,7 +552,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
'user_property_001_pos', 'user_property_002_neg',
- 'zpool_set_clear_userprop']
+ 'zpool_set_clear_userprop', 'vdev_set_io_scheduler']
tags = ['functional', 'cli_root', 'zpool_set']
[tests/functional/cli_root/zpool_split]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 388a4160736a..5263cd5dbc31 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1252,6 +1252,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_set/setup.ksh \
functional/cli_root/zpool/setup.ksh \
functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
+ functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh \
functional/cli_root/zpool_set/zpool_set_common.kshlib \
functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
index ccb5e9c15809..f71be2dc66f8 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg
@@ -76,4 +76,5 @@ typeset -a properties=(
trim_support
trim_errors
slow_ios
+ scheduler
)
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh
new file mode 100755
index 000000000000..d3ef66d56d01
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_io_scheduler.ksh
@@ -0,0 +1,96 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 by Triad National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Setting the vdev scheduler property while reading from the vdev should not
+# cause a panic.
+#
+# STRATEGY:
+# 1. Create a zpool backed by a file vdev.
+# 2. Write a file to the pool while toggling the scheduler property.
+# 3. Read the file back while toggling the scheduler property.
+#
+
+verify_runnable "global"
+
+command -v fio > /dev/null || log_unsupported "fio missing"
+log_must save_tunable DIO_ENABLED
+log_must set_tunable32 DIO_ENABLED 1
+
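+# Toggle the vdev scheduler property between "none" and "classic" while I/O
+# is in flight.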
+function set_io_scheduler
+{
+ zpool set scheduler=none $TESTPOOL1 $FILEDEV
+ sleep 0.1
+ zpool set scheduler=classic $TESTPOOL1 $FILEDEV
+ sleep 0.1
+}
+
+function cleanup
+{
+ destroy_pool $TESTPOOL1
+ log_must rm -f $FILEDEV
+ log_must restore_tunable DIO_ENABLED
+}
+
+log_assert "Toggling vdev scheduler property while reading from vdev should not cause panic"
+log_onexit cleanup
+
+# 1. Create a pool
+
+FILEDEV="$TEST_BASE_DIR/filedev.$$"
+log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV
+create_pool $TESTPOOL1 $FILEDEV
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1)
+
+# 2. Write a file to the pool while toggling the scheduler property.
+
+log_must eval "fio --filename=$mntpnt/foobar --name=write-file \
+ --rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
+ --ioengine=sync --time_based --runtime=10 &"
+
+ITERATIONS=30
+
+for i in $(seq $ITERATIONS); do
+ log_must set_io_scheduler
+done;
+wait
+
+# 3. Read the file back while toggling the scheduler property.
+
+log_must eval "fio --filename=$mntpnt/foobar --name=read-file \
+ --rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
+ --ioengine=sync --time_based --runtime=10 &"
+
+for i in $(seq $ITERATIONS); do
+ log_must set_io_scheduler
+done;
+wait
+
+log_pass "Setting vdev scheduler property while reading from vdev does not cause panic"