-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Add vdev property to toggle adding io to vdev queue #17358
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -385,9 +385,23 @@ typedef enum { | |||||
VDEV_PROP_TRIM_SUPPORT, | ||||||
VDEV_PROP_TRIM_ERRORS, | ||||||
VDEV_PROP_SLOW_IOS, | ||||||
VDEV_PROP_SCHEDULER, | ||||||
VDEV_NUM_PROPS | ||||||
} vdev_prop_t; | ||||||
|
||||||
/* | ||||||
* Different scheduling behaviors for vdev prop io_scheduler. | ||||||
* VDEV_SCHEDULER_AUTO = Don't queue if vdev is nonrot and backed by blkdev, | ||||||
* queue otherwise. | ||||||
* VDEV_SCHEDULER_CLASSIC = Always queue. | ||||||
* VDEV_SCHEDULER_NONE = Never Queue. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
*/ | ||||||
typedef enum { | ||||||
VDEV_SCHEDULER_AUTO, | ||||||
VDEV_SCHEDULER_CLASSIC, | ||||||
VDEV_SCHEDULER_NONE | ||||||
} vdev_scheduler_type_t; | ||||||
|
||||||
/* | ||||||
* Dataset property functions shared between libzfs and kernel. | ||||||
*/ | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -157,6 +157,22 @@ If this device should perform new allocations, used to disable a device | |||||
when it is scheduled for later removal. | ||||||
See | ||||||
.Xr zpool-remove 8 . | ||||||
.It Sy io_scheduler Ns = Ns Sy auto Ns | Ns Sy classic Ns | Ns Sy none | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
Controls how I/O requests are added to the vdev queue when reading or | ||||||
writing to this vdev. | ||||||
.It Sy auto | ||||||
Does not add I/O requests to the vdev queue if the vdev is backed by a | ||||||
non-rotational block device. | ||||||
This can sometimes improve performance for direct IOs. | ||||||
I/O requests will be added to the vdev queue if the vdev is backed by a | ||||||
rotational block device or file. | ||||||
This is the default behavior. | ||||||
.It Sy classic | ||||||
Always adds I/O requests to the vdev queue. | ||||||
.It Sy none | ||||||
Never adds I/O requests to the vdev queue. | ||||||
This is not recommended for vdevs backed by spinning disks as it could | ||||||
result in starvation. | ||||||
.El | ||||||
.Ss User Properties | ||||||
In addition to the standard native properties, ZFS supports arbitrary user | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -326,6 +326,13 @@ vdev_prop_init(void) | |
{ NULL } | ||
}; | ||
|
||
static const zprop_index_t vdevschedulertype_table[] = { | ||
{ "auto", VDEV_SCHEDULER_AUTO }, | ||
{ "classic", VDEV_SCHEDULER_CLASSIC }, | ||
{ "none", VDEV_SCHEDULER_NONE }, | ||
{ NULL } | ||
}; | ||
|
||
struct zfs_mod_supported_features *sfeatures = | ||
zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES); | ||
|
||
|
@@ -470,6 +477,10 @@ vdev_prop_init(void) | |
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, | ||
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", | ||
boolean_table, sfeatures); | ||
zprop_register_index(VDEV_PROP_SCHEDULER, "scheduler", | ||
VDEV_SCHEDULER_AUTO, PROP_DEFAULT, ZFS_TYPE_VDEV, | ||
"auto | classic | none", "IO_SCHEDULER", vdevschedulertype_table, | ||
sfeatures); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should be moved in to the "default index properties" section below. It looks like failfast somehow managed to end up in the wrong section. We should fix that up too, but that can be done in it's own PR. |
||
|
||
/* default index properties */ | ||
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -722,6 +722,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) | |
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); | ||
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); | ||
|
||
vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER); | ||
|
||
list_link_init(&vd->vdev_config_dirty_node); | ||
list_link_init(&vd->vdev_state_dirty_node); | ||
list_link_init(&vd->vdev_initialize_node); | ||
|
@@ -3890,6 +3892,12 @@ vdev_load(vdev_t *vd) | |
if (error && error != ENOENT) | ||
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " | ||
"failed [error=%d]", (u_longlong_t)zapobj, error); | ||
|
||
error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER, | ||
&vd->vdev_scheduler); | ||
if (error && error != ENOENT) | ||
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " | ||
"failed [error=%d]", (u_longlong_t)zapobj, error); | ||
} | ||
|
||
/* | ||
|
@@ -6125,6 +6133,15 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) | |
} | ||
vd->vdev_slow_io_t = intval; | ||
break; | ||
case VDEV_PROP_SCHEDULER: | ||
if (nvpair_value_uint64(elem, &intval) != 0) { | ||
error = EINVAL; | ||
break; | ||
} | ||
if (vd->vdev_ops->vdev_op_leaf) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Only allowing this to be set on leaves has the complication that when the drive is replaced it will revert back to the default value for the property. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like |
||
vd->vdev_scheduler = intval; | ||
} | ||
break; | ||
default: | ||
/* Most processing is done in vdev_props_set_sync */ | ||
break; | ||
|
@@ -6488,6 +6505,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) | |
case VDEV_PROP_IO_T: | ||
case VDEV_PROP_SLOW_IO_N: | ||
case VDEV_PROP_SLOW_IO_T: | ||
case VDEV_PROP_SCHEDULER: | ||
err = vdev_prop_get_int(vd, prop, &intval); | ||
if (err && err != ENOENT) | ||
break; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -879,6 +879,38 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) | |
return (zio); | ||
} | ||
|
||
static boolean_t | ||
vdev_should_queue_zio(zio_t *zio) | ||
{ | ||
vdev_t *vd = zio->io_vd; | ||
boolean_t should_queue = B_TRUE; | ||
|
||
/* | ||
* Add zio with ZIO_FLAG_NODATA to queue as bypass code | ||
* currently does not handle certain cases (gang abd, raidz | ||
* write aggregation). | ||
*/ | ||
if (zio->io_flags & ZIO_FLAG_NODATA) | ||
return (B_TRUE); | ||
|
||
switch (vd->vdev_scheduler) { | ||
case VDEV_SCHEDULER_AUTO: | ||
if (vd->vdev_nonrot && vd->vdev_is_blkdev) | ||
should_queue = B_FALSE; | ||
break; | ||
case VDEV_SCHEDULER_CLASSIC: | ||
should_queue = B_TRUE; | ||
break; | ||
case VDEV_SCHEDULER_NONE: | ||
should_queue = B_FALSE; | ||
break; | ||
default: | ||
should_queue = B_TRUE; | ||
break; | ||
} | ||
return (should_queue); | ||
} | ||
|
||
zio_t * | ||
vdev_queue_io(zio_t *zio) | ||
{ | ||
|
@@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio) | |
zio->io_flags |= ZIO_FLAG_DONT_QUEUE; | ||
zio->io_timestamp = gethrtime(); | ||
|
||
if (!vdev_should_queue_zio(zio)) { | ||
zio->io_queue_state = ZIO_QS_NONE; | ||
return (zio); | ||
} | ||
|
||
mutex_enter(&vq->vq_lock); | ||
vdev_queue_io_add(vq, zio); | ||
nio = vdev_queue_io_to_issue(vq); | ||
|
@@ -954,6 +991,10 @@ vdev_queue_io_done(zio_t *zio) | |
vq->vq_io_complete_ts = now; | ||
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp; | ||
|
||
if (zio->io_queue_state == ZIO_QS_NONE) { | ||
return; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: let's drop the parenthesis here. |
||
|
||
mutex_enter(&vq->vq_lock); | ||
vdev_queue_pending_remove(vq, zio); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,4 +76,5 @@ typeset -a properties=( | |
trim_support | ||
trim_errors | ||
slow_ios | ||
scheduler | ||
) |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,96 @@ | ||||||
#!/bin/ksh -p | ||||||
# SPDX-License-Identifier: CDDL-1.0 | ||||||
# | ||||||
# CDDL HEADER START | ||||||
# | ||||||
# The contents of this file are subject to the terms of the | ||||||
# Common Development and Distribution License (the "License"). | ||||||
# You may not use this file except in compliance with the License. | ||||||
# | ||||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | ||||||
# or https://opensource.org/licenses/CDDL-1.0. | ||||||
# See the License for the specific language governing permissions | ||||||
# and limitations under the License. | ||||||
# | ||||||
# When distributing Covered Code, include this CDDL HEADER in each | ||||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE. | ||||||
# If applicable, add the following below this CDDL HEADER, with the | ||||||
# fields enclosed by brackets "[]" replaced with your own identifying | ||||||
# information: Portions Copyright [yyyy] [name of copyright owner] | ||||||
# | ||||||
# CDDL HEADER END | ||||||
# | ||||||
|
||||||
# | ||||||
# Copyright (c) 2025 by Triad National Security, LLC. | ||||||
# | ||||||
|
||||||
. $STF_SUITE/include/libtest.shlib | ||||||
|
||||||
# | ||||||
# DESCRIPTION: | ||||||
# Setting vdev io_scheduler property while reading from vdev should not cause panic. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
# | ||||||
# STRATEGY: | ||||||
# 1. Create a zpool | ||||||
# 2. Write a file to the pool. | ||||||
# 3. Start reading from file, while also setting the io_scheduler property. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
# | ||||||
|
||||||
verify_runnable "global" | ||||||
|
||||||
command -v fio > /dev/null || log_unsupported "fio missing" | ||||||
log_must save_tunable DIO_ENABLED | ||||||
log_must set_tunable32 DIO_ENABLED 1 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get that this change was motivated by Direct IO, but for the purposes on this test setting |
||||||
|
||||||
function set_io_scheduler | ||||||
{ | ||||||
zpool set scheduler=none $TESTPOOL1 $FILEDEV | ||||||
sleep 0.1 | ||||||
zpool set scheduler=classic $TESTPOOL1 $FILEDEV | ||||||
sleep 0.1 | ||||||
} | ||||||
|
||||||
function cleanup | ||||||
{ | ||||||
destroy_pool $TESTPOOL1 | ||||||
log_must rm -f $FILEDEV | ||||||
log_must restore_tunable DIO_ENABLED | ||||||
} | ||||||
|
||||||
log_assert "Toggling vdev scheduler property while reading from vdev should not cause panic" | ||||||
log_onexit cleanup | ||||||
|
||||||
# 1. Create a pool | ||||||
|
||||||
FILEDEV="$TEST_BASE_DIR/filedev.$$" | ||||||
log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV | ||||||
create_pool $TESTPOOL1 $FILEDEV | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should use the existing |
||||||
|
||||||
mntpnt=$(get_prop mountpoint $TESTPOOL1) | ||||||
|
||||||
# 2. Write a file to the pool, while also setting the io_scheduler property. | ||||||
|
||||||
log_must eval "fio --filename=$mntpnt/foobar --name=write-file \ | ||||||
--rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \ | ||||||
--ioengine=sync --time_based --runtime=10 &" | ||||||
|
||||||
ITERATIONS=30 | ||||||
|
||||||
for i in $(seq $ITERATIONS); do | ||||||
log_must set_io_scheduler | ||||||
done; | ||||||
wait | ||||||
|
||||||
# 3. Starting reading from file, while also setting the io_scheduler property. | ||||||
|
||||||
log_must eval "fio --filename=$mntpnt/foobar --name=read-file \ | ||||||
--rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \ | ||||||
--ioengine=sync --time_based --runtime=10 &" | ||||||
|
||||||
for i in $(seq $ITERATIONS); do | ||||||
log_must set_io_scheduler | ||||||
done; | ||||||
wait | ||||||
|
||||||
log_pass "Setting vdev scheduler property while reading from vdev does not cause panic" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.