From c02eea7eeaebd7270cb8ff09049cc7e0fc9bc8da Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 25 Feb 2025 20:17:20 +0000 Subject: [PATCH 01/30] rbd: convert timeouts to secs_to_jiffies() Commit b35108a51cf7 ("jiffies: Define secs_to_jiffies()") introduced secs_to_jiffies(). As the value here is a multiple of 1000, use secs_to_jiffies() instead of msecs_to_jiffies() to avoid the multiplication This is converted using scripts/coccinelle/misc/secs_to_jiffies.cocci with the following Coccinelle rules: @depends on patch@ expression E; @@ -msecs_to_jiffies(E * 1000) +secs_to_jiffies(E) @depends on patch@ expression E; @@ -msecs_to_jiffies(E * MSEC_PER_SEC) +secs_to_jiffies(E) While here, remove the no-longer necessary check for range since there's no multiplication involved. Acked-by: Ilya Dryomov Signed-off-by: Easwar Hariharan Link: https://lore.kernel.org/r/20250225-converge-secs-to-jiffies-part-two-v3-6-a43967e36c88@linux.microsoft.com Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index faafd7ff43d6ef..41207133e21e92 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -108,7 +108,7 @@ static int atomic_dec_return_safe(atomic_t *v) #define RBD_OBJ_PREFIX_LEN_MAX 64 #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ -#define RBD_RETRY_DELAY msecs_to_jiffies(1000) +#define RBD_RETRY_DELAY secs_to_jiffies(1) /* Feature bits */ @@ -4162,7 +4162,7 @@ static void rbd_acquire_lock(struct work_struct *work) dout("%s rbd_dev %p requeuing lock_dwork\n", __func__, rbd_dev); mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, - msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); + secs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT)); } } @@ -6283,9 +6283,7 @@ static int rbd_parse_param(struct fs_parameter *param, break; case Opt_lock_timeout: /* 0 is "wait forever" (i.e. infinite timeout) */ - if (result.uint_32 > INT_MAX / 1000) - goto out_of_range; - opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000); + opt->lock_timeout = secs_to_jiffies(result.uint_32); break; case Opt_pool_ns: kfree(pctx->spec->pool_ns); From 6376ef2b6af3bbcb7c50dc657bdfb83aba467aef Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 25 Feb 2025 14:24:55 -0700 Subject: [PATCH 02/30] ublk: complete command synchronously on error In case of an error, ublk's ->uring_cmd() functions currently return -EIOCBQUEUED and immediately call io_uring_cmd_done(). -EIOCBQUEUED and io_uring_cmd_done() are intended for asynchronous completions. For synchronous completions, the ->uring_cmd() function can just return the negative return code directly. This skips io_uring_cmd_del_cancelable(), and deferring the completion to task work. So return the error code directly from __ublk_ch_uring_cmd() and ublk_ctrl_uring_cmd(). Update ublk_ch_uring_cmd_cb(), which currently ignores the return value from __ublk_ch_uring_cmd(), to call io_uring_cmd_done() for synchronous completions. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20250225212456.2902549-1-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 529085181f3550..ff648c6839c154 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1866,10 +1866,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; out: - io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", __func__, cmd_op, tag, ret, io->flags); - return -EIOCBQUEUED; + return ret; } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, @@ -1925,7 +1924,10 @@ static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, unsigned int issue_flags) { - ublk_ch_uring_cmd_local(cmd, issue_flags); + int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(cmd, ret, 0, issue_flags); } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -3056,10 +3058,9 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (ub) ublk_put_device(ub); out: - io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); - return -EIOCBQUEUED; + return ret; } static const struct file_operations ublk_ctl_fops = { From d8ae0061afb8bbdb0cf6b2cd4b5be5c54e42b228 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Wed, 26 Feb 2025 19:06:09 +0900 Subject: [PATCH 03/30] null_blk: generate null_blk configfs features string The null_blk configfs file 'features' provides a string that lists available null_blk features for userspace programs to reference. The string is defined as a long constant in the code, which tends to be forgotten for updates. It also causes checkpatch.pl to report "WARNING: quoted string split across lines". To avoid these drawbacks, generate the feature string on the fly. Refer to the ca_name field of each element in the nullb_device_attrs table and concatenate them in the given buffer. Also, sorted nullb_device_attrs table elements in alphabetical order. Of note is that the feature "index" was missing before this commit. This commit adds it to the generated string. Suggested-by: Bart Van Assche Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20250226100613.1622564-2-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 86 ++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d94ef37480bd45..0725d221cff49e 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -592,41 +592,41 @@ static ssize_t nullb_device_zone_offline_store(struct config_item *item, CONFIGFS_ATTR_WO(nullb_device_, zone_offline); static struct configfs_attribute *nullb_device_attrs[] = { - &nullb_device_attr_size, + &nullb_device_attr_badblocks, + &nullb_device_attr_blocking, + &nullb_device_attr_blocksize, + &nullb_device_attr_cache_size, &nullb_device_attr_completion_nsec, - &nullb_device_attr_submit_queues, - &nullb_device_attr_poll_queues, + &nullb_device_attr_discard, + &nullb_device_attr_fua, &nullb_device_attr_home_node, - &nullb_device_attr_queue_mode, - &nullb_device_attr_blocksize, - &nullb_device_attr_max_sectors, - &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, - &nullb_device_attr_blocking, - &nullb_device_attr_use_per_node_hctx, - &nullb_device_attr_power, - &nullb_device_attr_memory_backed, - &nullb_device_attr_discard, + &nullb_device_attr_irqmode, + &nullb_device_attr_max_sectors, &nullb_device_attr_mbps, - &nullb_device_attr_cache_size, - &nullb_device_attr_badblocks, - &nullb_device_attr_zoned, - &nullb_device_attr_zone_size, + &nullb_device_attr_memory_backed, + &nullb_device_attr_no_sched, + &nullb_device_attr_poll_queues, + &nullb_device_attr_power, + &nullb_device_attr_queue_mode, + &nullb_device_attr_rotational, + &nullb_device_attr_shared_tag_bitmap, + &nullb_device_attr_shared_tags, + &nullb_device_attr_size, + &nullb_device_attr_submit_queues, + &nullb_device_attr_use_per_node_hctx, + &nullb_device_attr_virt_boundary, + &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_capacity, - &nullb_device_attr_zone_nr_conv, - &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_full, &nullb_device_attr_zone_max_active, - &nullb_device_attr_zone_append_max_sectors, - &nullb_device_attr_zone_readonly, + &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_offline, - &nullb_device_attr_zone_full, - &nullb_device_attr_virt_boundary, - &nullb_device_attr_no_sched, - &nullb_device_attr_shared_tags, - &nullb_device_attr_shared_tag_bitmap, - &nullb_device_attr_fua, - &nullb_device_attr_rotational, + &nullb_device_attr_zone_readonly, + &nullb_device_attr_zone_size, + &nullb_device_attr_zoned, NULL, }; @@ -704,16 +704,28 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, - "badblocks,blocking,blocksize,cache_size,fua," - "completion_nsec,discard,home_node,hw_queue_depth," - "irqmode,max_sectors,mbps,memory_backed,no_sched," - "poll_queues,power,queue_mode,shared_tag_bitmap," - "shared_tags,size,submit_queues,use_per_node_hctx," - "virt_boundary,zoned,zone_capacity,zone_max_active," - "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size,zone_append_max_sectors,zone_full," - "rotational\n"); + + struct configfs_attribute **entry; + char delimiter = ','; + size_t left = PAGE_SIZE; + size_t written = 0; + int ret; + + for (entry = &nullb_device_attrs[0]; *entry && left > 0; entry++) { + if (!*(entry + 1)) + delimiter = '\n'; + ret = snprintf(page + written, left, "%s%c", (*entry)->ca_name, + delimiter); + if (ret >= left) { + WARN_ONCE(1, "Too many null_blk features to print\n"); + memzero_explicit(page, PAGE_SIZE); + return -ENOBUFS; + } + left -= ret; + written += ret; + } + + return written; } CONFIGFS_ATTR_RO(memb_group_, features); From 6b87fa3245a93913efb3d8b858f6750d655d5db9 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Wed, 26 Feb 2025 19:06:10 +0900 Subject: [PATCH 04/30] null_blk: introduce badblocks_once parameter When IO errors happen on real storage devices, the IOs repeated to the same target range can success by virtue of recovery features by devices, such as reserved block assignment. To simulate such IO errors and recoveries, introduce the new parameter badblocks_once parameter. When this parameter is set to 1, the specified badblocks are cleared after the first IO error, so that the next IO to the blocks succeed. Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20250226100613.1622564-3-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 11 ++++++++--- drivers/block/null_blk/null_blk.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 0725d221cff49e..2a060a6ea8c0b1 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -473,6 +473,7 @@ NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(fua, bool, NULL); NULLB_DEVICE_ATTR(rotational, bool, NULL); +NULLB_DEVICE_ATTR(badblocks_once, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -593,6 +594,7 @@ CONFIGFS_ATTR_WO(nullb_device_, zone_offline); static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, + &nullb_device_attr_badblocks_once, &nullb_device_attr_blocking, &nullb_device_attr_blocksize, &nullb_device_attr_cache_size, @@ -1315,10 +1317,13 @@ static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t first_bad; int bad_sectors; - if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) - return BLK_STS_IOERR; + if (!badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) + return BLK_STS_OK; - return BLK_STS_OK; + if (cmd->nq->dev->badblocks_once) + badblocks_clear(bb, first_bad, bad_sectors); + + return BLK_STS_IOERR; } static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 6f9fe617108704..3c4c07f0418b06 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -63,6 +63,7 @@ struct nullb_device { unsigned long flags; /* device flags */ unsigned int curr_cache; struct badblocks badblocks; + bool badblocks_once; unsigned int nr_zones; unsigned int nr_zones_imp_open; From 7859d042b0954f843d2e97c1324bb04bf28df2f6 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Wed, 26 Feb 2025 19:06:11 +0900 Subject: [PATCH 05/30] null_blk: replace null_process_cmd() call in null_zone_write() As a preparation to support partial data transfer due to badblocks, replace the null_process_cmd() call in null_zone_write() with equivalent calls to null_handle_badblocks() and null_handle_memory_backed(). This commit does not change behavior. It will enable null_handle_badblocks() to return the size of partial data transfer in the following commit, allowing null_zone_write() to move write pointers appropriately. Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20250226100613.1622564-4-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 11 ++++------- drivers/block/null_blk/null_blk.h | 5 +++++ drivers/block/null_blk/zoned.c | 15 ++++++++++++--- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 2a060a6ea8c0b1..87037cb375c94e 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1309,9 +1309,8 @@ static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) return sts; } -static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, - sector_t sector, - sector_t nr_sectors) +blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors) { struct badblocks *bb = &cmd->nq->dev->badblocks; sector_t first_bad; @@ -1326,10 +1325,8 @@ static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, return BLK_STS_IOERR; } -static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_op op, - sector_t sector, - sector_t nr_sectors) +blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 3c4c07f0418b06..ee60f3a8879671 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -132,6 +132,11 @@ blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, sector_t nr_sectors); blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors); +blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors); +blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, sector_t nr_sectors); + #ifdef CONFIG_BLK_DEV_ZONED int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 0d5f9bf952292e..7677f6cf23f466 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -412,9 +412,18 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, zone->cond = BLK_ZONE_COND_IMP_OPEN; } - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (ret != BLK_STS_OK) - goto unlock_zone; + if (dev->badblocks.shift != -1) { + ret = null_handle_badblocks(cmd, sector, nr_sectors); + if (ret != BLK_STS_OK) + goto unlock_zone; + } + + if (dev->memory_backed) { + ret = null_handle_memory_backed(cmd, REQ_OP_WRITE, sector, + nr_sectors); + if (ret != BLK_STS_OK) + goto unlock_zone; + } zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { From 6d9725d1000a0bc4e41fbe2db51181e80e4260eb Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Wed, 26 Feb 2025 19:06:12 +0900 Subject: [PATCH 06/30] null_blk: pass transfer size to null_handle_rq() As preparation to support partial data transfer, add a new argument to null_handle_rq() to pass the number of sectors to transfer. While at it, rename the function from null_handle_rq to null_handle_data_transfer. This commit does not change the behavior. Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20250226100613.1622564-5-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 87037cb375c94e..8025766988124b 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1263,25 +1263,37 @@ static int null_transfer(struct nullb *nullb, struct page *page, return err; } -static blk_status_t null_handle_rq(struct nullb_cmd *cmd) +/* + * Transfer data for the given request. The transfer size is capped with the + * nr_sectors argument. + */ +static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, + sector_t nr_sectors) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); + unsigned int max_bytes = nr_sectors << SECTOR_SHIFT; + unsigned int transferred_bytes = 0; struct req_iterator iter; struct bio_vec bvec; spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; + if (transferred_bytes + len > max_bytes) + len = max_bytes - transferred_bytes; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); if (err) break; sector += len >> SECTOR_SHIFT; + transferred_bytes += len; + if (transferred_bytes >= max_bytes) + break; } spin_unlock_irq(&nullb->lock); @@ -1333,7 +1345,7 @@ blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); - return null_handle_rq(cmd); + return null_handle_data_transfer(cmd, nr_sectors); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) From 386d7f4be4cdfb0b3a937f26fe674818394f795e Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Wed, 26 Feb 2025 19:06:13 +0900 Subject: [PATCH 07/30] null_blk: do partial IO for bad blocks The current null_blk implementation checks if any bad blocks exist in the target blocks of each IO. If so, the IO fails and data is not transferred for all of the IO target blocks. However, when real storage devices have bad blocks, the devices may transfer data partially up to the first bad blocks (e.g., SAS drives). Especially, when the IO is a write operation, such partial IO leaves partially written data on the device. To simulate such partial IO using null_blk, introduce the new parameter 'badblocks_partial_io'. When this parameter is set, null_handle_badblocks() returns the number of the sectors for the partial IO as its third pointer argument. Pass the returned number of sectors to the following calls to null_handle_memory_backend() in null_process_cmd() and null_zone_write(). Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20250226100613.1622564-6-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 40 ++++++++++++++++++++++++------- drivers/block/null_blk/null_blk.h | 4 ++-- drivers/block/null_blk/zoned.c | 9 ++++--- 3 files changed, 40 insertions(+), 13 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 8025766988124b..31d44cef684145 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -474,6 +474,7 @@ NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(fua, bool, NULL); NULLB_DEVICE_ATTR(rotational, bool, NULL); NULLB_DEVICE_ATTR(badblocks_once, bool, NULL); +NULLB_DEVICE_ATTR(badblocks_partial_io, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -595,6 +596,7 @@ CONFIGFS_ATTR_WO(nullb_device_, zone_offline); static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, &nullb_device_attr_badblocks_once, + &nullb_device_attr_badblocks_partial_io, &nullb_device_attr_blocking, &nullb_device_attr_blocksize, &nullb_device_attr_cache_size, @@ -1321,19 +1323,40 @@ static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) return sts; } +/* + * Check if the command should fail for the badblocks. If so, return + * BLK_STS_IOERR and return number of partial I/O sectors to be written or read, + * which may be less than the requested number of sectors. + * + * @cmd: The command to handle. + * @sector: The start sector for I/O. + * @nr_sectors: Specifies number of sectors to write or read, and returns the + * number of sectors to be written or read. + */ blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors) + unsigned int *nr_sectors) { struct badblocks *bb = &cmd->nq->dev->badblocks; + struct nullb_device *dev = cmd->nq->dev; + unsigned int block_sectors = dev->blocksize >> SECTOR_SHIFT; sector_t first_bad; int bad_sectors; + unsigned int partial_io_sectors = 0; - if (!badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) + if (!badblocks_check(bb, sector, *nr_sectors, &first_bad, &bad_sectors)) return BLK_STS_OK; if (cmd->nq->dev->badblocks_once) badblocks_clear(bb, first_bad, bad_sectors); + if (cmd->nq->dev->badblocks_partial_io) { + if (!IS_ALIGNED(first_bad, block_sectors)) + first_bad = ALIGN_DOWN(first_bad, block_sectors); + if (sector < first_bad) + partial_io_sectors = first_bad - sector; + } + *nr_sectors = partial_io_sectors; + return BLK_STS_IOERR; } @@ -1392,18 +1415,19 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; + blk_status_t badblocks_ret = BLK_STS_OK; blk_status_t ret; - if (dev->badblocks.shift != -1) { - ret = null_handle_badblocks(cmd, sector, nr_sectors); + if (dev->badblocks.shift != -1) + badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors); + + if (dev->memory_backed && nr_sectors) { + ret = null_handle_memory_backed(cmd, op, sector, nr_sectors); if (ret != BLK_STS_OK) return ret; } - if (dev->memory_backed) - return null_handle_memory_backed(cmd, op, sector, nr_sectors); - - return BLK_STS_OK; + return badblocks_ret; } static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index ee60f3a8879671..7bb6128dbaafb7 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -64,6 +64,7 @@ struct nullb_device { unsigned int curr_cache; struct badblocks badblocks; bool badblocks_once; + bool badblocks_partial_io; unsigned int nr_zones; unsigned int nr_zones_imp_open; @@ -133,11 +134,10 @@ blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors); blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors); + unsigned int *nr_sectors); blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors); - #ifdef CONFIG_BLK_DEV_ZONED int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); int null_register_zoned_dev(struct nullb *nullb); diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 7677f6cf23f466..4e5728f459899f 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -353,6 +353,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, struct nullb_device *dev = cmd->nq->dev; unsigned int zno = null_zone_no(dev, sector); struct nullb_zone *zone = &dev->zones[zno]; + blk_status_t badblocks_ret = BLK_STS_OK; blk_status_t ret; trace_nullb_zone_op(cmd, zno, zone->cond); @@ -413,9 +414,11 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, } if (dev->badblocks.shift != -1) { - ret = null_handle_badblocks(cmd, sector, nr_sectors); - if (ret != BLK_STS_OK) + badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors); + if (badblocks_ret != BLK_STS_OK && !nr_sectors) { + ret = badblocks_ret; goto unlock_zone; + } } if (dev->memory_backed) { @@ -438,7 +441,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, zone->cond = BLK_ZONE_COND_FULL; } - ret = BLK_STS_OK; + ret = badblocks_ret; unlock_zone: null_unlock_zone(dev, zone); From e9945facd48d2d5da87fa247f5d6a19c23d935fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Feb 2025 07:44:31 -0800 Subject: [PATCH 08/30] block: mark bounce buffering as incompatible with integrity None of the few drivers still using the legacy block layer bounce buffering support integrity metadata. Explicitly mark the features as incompatible and stop creating the slab and mempool for integrity buffers for the bounce bio_set. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250225154449.422989-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 5 +++++ block/bounce.c | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index c44dadc35e1ece..2763a34a9d569f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -117,6 +117,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) return 0; } + if (lim->features & BLK_FEAT_BOUNCE_HIGH) { + pr_warn("no bounce buffer support for integrity metadata\n"); + return -EINVAL; + } + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { pr_warn("integrity support disabled.\n"); return -EINVAL; diff --git a/block/bounce.c b/block/bounce.c index 0d898cd5ec497f..09a9616cf20944 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -41,8 +41,6 @@ static void init_bounce_bioset(void) ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); BUG_ON(ret); - if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) - BUG_ON(1); ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); BUG_ON(ret); From d2cfe5ceca59b74ef96bfe00632e3927d00c9918 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Feb 2025 07:44:32 -0800 Subject: [PATCH 09/30] block: move the block layer auto-integrity code into a new file The code that automatically creates a integrity payload and generates and verifies the checksums for bios that don't have submitter-provided integrity payload currently sits right in the middle of the block integrity metadata infrastructure. Split it into a separate file to make the different layers clear. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250225154449.422989-3-hch@lst.de Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/bio-integrity-auto.c | 162 +++++++++++++++++++++++++++++++++++++ block/bio-integrity.c | 159 ------------------------------------ 3 files changed, 164 insertions(+), 160 deletions(-) create mode 100644 block/bio-integrity-auto.c diff --git a/block/Makefile b/block/Makefile index 33748123710b36..3a941dc0d27fb2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,7 +26,8 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \ + bio-integrity-auto.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c new file mode 100644 index 00000000000000..357241fa0f20bf --- /dev/null +++ b/block/bio-integrity-auto.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen + * + * Automatically generate and verify integrity data on PI capable devices if the + * bio submitter didn't provide PI itself. This ensures that kernel verifies + * data integrity even if the file system (or other user of the block device) is + * not aware of PI. + */ +#include +#include +#include "blk.h" + +static struct workqueue_struct *kintegrityd_wq; + +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + + blk_integrity_verify(bio); + + kfree(bvec_virt(bip->bip_vec)); + bio_integrity_free(bio); + bio_endio(bio); +} + +/** + * __bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * + * Normally I/O completion is done in interrupt context. However, verifying I/O + * integrity is a time-consuming task which must be run in process context. + * + * This function postpones completion accordingly. + */ +bool __bio_integrity_endio(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); + return false; + } + + kfree(bvec_virt(bip->bip_vec)); + bio_integrity_free(bio); + return true; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Checks if the bio already has an integrity payload attached. If it does, the + * payload has been generated by another kernel subsystem, and we just pass it + * through. + * Otherwise allocates integrity payload and for writes the integrity metadata + * will be generated. For reads, the completion handler will verify the + * metadata. + */ +bool bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + gfp_t gfp = GFP_NOIO; + unsigned int len; + void *buf; + + if (!bi) + return true; + + if (!bio_sectors(bio)) + return true; + + /* Already protected? */ + if (bio_integrity(bio)) + return true; + + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bi->flags & BLK_INTEGRITY_NOVERIFY) + return true; + break; + case REQ_OP_WRITE: + if (bi->flags & BLK_INTEGRITY_NOGENERATE) + return true; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. + */ + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) + gfp |= __GFP_ZERO; + break; + default: + return true; + } + + /* Allocate kernel buffer for protection data */ + len = bio_integrity_bytes(bi, bio_sectors(bio)); + buf = kmalloc(len, gfp); + if (!buf) + goto err_end_io; + + bip = bio_integrity_alloc(bio, GFP_NOIO, 1); + if (IS_ERR(bip)) { + kfree(buf); + goto err_end_io; + } + + bip->bip_flags |= BIP_BLOCK_INTEGRITY; + bip_set_seed(bip, bio->bi_iter.bi_sector); + + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) + bip->bip_flags |= BIP_IP_CHECKSUM; + if (bi->csum_type) + bip->bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bip->bip_flags |= BIP_CHECK_REFTAG; + + if (bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)) < len) + goto err_end_io; + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + blk_integrity_generate(bio); + else + bip->bio_iter = bio->bi_iter; + return true; + +err_end_io: + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return false; +} +EXPORT_SYMBOL(bio_integrity_prep); + +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + +static int __init blk_integrity_auto_init(void) +{ + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + return 0; +} +subsys_initcall(blk_integrity_auto_init); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5d81ad9a3d20a7..aa9f966123194a 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -10,17 +10,10 @@ #include #include #include -#include #include #include "blk.h" static struct kmem_cache *bip_slab; -static struct workqueue_struct *kintegrityd_wq; - -void blk_flush_integrity(void) -{ - flush_workqueue(kintegrityd_wq); -} /** * bio_integrity_free - Free bio integrity payload @@ -413,149 +406,6 @@ int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) return ret; } -/** - * bio_integrity_prep - Prepare bio for integrity I/O - * @bio: bio to prepare - * - * Description: Checks if the bio already has an integrity payload attached. - * If it does, the payload has been generated by another kernel subsystem, - * and we just pass it through. Otherwise allocates integrity payload. - * The bio must have data direction, target device and start sector set priot - * to calling. In the WRITE case, integrity metadata will be generated using - * the block device's integrity function. In the READ case, the buffer - * will be prepared for DMA and a suitable end_io handler set up. - */ -bool bio_integrity_prep(struct bio *bio) -{ - struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - unsigned int len; - void *buf; - gfp_t gfp = GFP_NOIO; - - if (!bi) - return true; - - if (!bio_sectors(bio)) - return true; - - /* Already protected? */ - if (bio_integrity(bio)) - return true; - - switch (bio_op(bio)) { - case REQ_OP_READ: - if (bi->flags & BLK_INTEGRITY_NOVERIFY) - return true; - break; - case REQ_OP_WRITE: - if (bi->flags & BLK_INTEGRITY_NOGENERATE) - return true; - - /* - * Zero the memory allocated to not leak uninitialized kernel - * memory to disk for non-integrity metadata where nothing else - * initializes the memory. - */ - if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) - gfp |= __GFP_ZERO; - break; - default: - return true; - } - - /* Allocate kernel buffer for protection data */ - len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, gfp); - if (unlikely(buf == NULL)) { - goto err_end_io; - } - - bip = bio_integrity_alloc(bio, GFP_NOIO, 1); - if (IS_ERR(bip)) { - kfree(buf); - goto err_end_io; - } - - bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip_set_seed(bip, bio->bi_iter.bi_sector); - - if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bip->bip_flags |= BIP_IP_CHECKSUM; - - /* describe what tags to check in payload */ - if (bi->csum_type) - bip->bip_flags |= BIP_CHECK_GUARD; - if (bi->flags & BLK_INTEGRITY_REF_TAG) - bip->bip_flags |= BIP_CHECK_REFTAG; - if (bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)) < len) { - printk(KERN_ERR "could not attach integrity payload\n"); - goto err_end_io; - } - - /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) - blk_integrity_generate(bio); - else - bip->bio_iter = bio->bi_iter; - return true; - -err_end_io: - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return false; -} -EXPORT_SYMBOL(bio_integrity_prep); - -/** - * bio_integrity_verify_fn - Integrity I/O completion worker - * @work: Work struct stored in bio to be verified - * - * Description: This workqueue function is called to complete a READ - * request. The function verifies the transferred integrity metadata - * and then calls the original bio end_io function. - */ -static void bio_integrity_verify_fn(struct work_struct *work) -{ - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; - - blk_integrity_verify(bio); - - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); - bio_endio(bio); -} - -/** - * __bio_integrity_endio - Integrity I/O completion function - * @bio: Protected bio - * - * Description: Completion for integrity I/O - * - * Normally I/O completion is done in interrupt context. However, - * verifying I/O integrity is a time-consuming task which must be run - * in process context. This function postpones completion - * accordingly. - */ -bool __bio_integrity_endio(struct bio *bio) -{ - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); - - if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); - return false; - } - - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); - return true; -} - /** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update @@ -644,15 +494,6 @@ void bioset_integrity_free(struct bio_set *bs) void __init bio_integrity_init(void) { - /* - * kintegrityd won't block much but may burn a lot of CPU cycles. - * Make it highpri CPU intensive wq with max concurrency of 1. - */ - kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); - bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIO_INLINE_VECS, From 1972a1faaa026eb34322a2b4fcbb3c28d68cc5e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Feb 2025 07:44:33 -0800 Subject: [PATCH 10/30] block: split struct bio_integrity_payload Many of the fields in struct bio_integrity_payload are only needed for the default integrity buffer in the block layer, and the variable sized array at the end of the structure makes it very hard to embed into caller allocated structures. Reduce struct bio_integrity_payload to the minimal structure needed in common code and create two separate containing structures for the automatically generated payload and the caller allocated payload. The latter is a simple wrapper for struct bio_integrity_payload and the bvecs, while the former contains the additional fields moved out of struct bio_integrity_payload. Always use a dedicated mempool for automatic integrity metadata instead of depending on bio_set that is submitter controlled and thus often doesn't have the mempool initialized and stop using mempools for the submitter buffers as they aren't in the NOIO I/O submission path where we need to guarantee forward progress. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Tested-by: Anuj Gupta Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20250225154449.422989-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 75 +++++++++++++------ block/bio-integrity.c | 107 ++++++---------------------- block/bio.c | 6 -- block/blk.h | 2 +- block/t10-pi.c | 6 +- drivers/md/dm-integrity.c | 12 ---- drivers/md/dm-table.c | 6 -- drivers/md/md.c | 13 ---- drivers/target/target_core_iblock.c | 12 ---- include/linux/bio-integrity.h | 25 +------ include/linux/bio.h | 4 -- 11 files changed, 80 insertions(+), 188 deletions(-) diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 357241fa0f20bf..e524c609be5066 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -12,18 +12,34 @@ #include #include "blk.h" +struct bio_integrity_data { + struct bio *bio; + struct bvec_iter saved_bio_iter; + struct work_struct work; + struct bio_integrity_payload bip; + struct bio_vec bvec; +}; + +static struct kmem_cache *bid_slab; +static mempool_t bid_pool; static struct workqueue_struct *kintegrityd_wq; -static void bio_integrity_verify_fn(struct work_struct *work) +static void bio_integrity_finish(struct bio_integrity_data *bid) { - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; + bid->bio->bi_integrity = NULL; + bid->bio->bi_opf &= ~REQ_INTEGRITY; + kfree(bvec_virt(bid->bip.bip_vec)); + mempool_free(bid, &bid_pool); +} - blk_integrity_verify(bio); +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_data *bid = + container_of(work, struct bio_integrity_data, work); + struct bio *bio = bid->bio; - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); + blk_integrity_verify_iter(bio, &bid->saved_bio_iter); + bio_integrity_finish(bid); bio_endio(bio); } @@ -40,15 +56,16 @@ bool __bio_integrity_endio(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_data *bid = + container_of(bip, struct bio_integrity_data, bip); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) { - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); + INIT_WORK(&bid->work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bid->work); return false; } - kfree(bvec_virt(bip->bip_vec)); - bio_integrity_free(bio); + bio_integrity_finish(bid); return true; } @@ -65,8 +82,8 @@ bool __bio_integrity_endio(struct bio *bio) */ bool bio_integrity_prep(struct bio *bio) { - struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_data *bid; gfp_t gfp = GFP_NOIO; unsigned int len; void *buf; @@ -102,27 +119,30 @@ bool bio_integrity_prep(struct bio *bio) return true; } + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return true; + /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (!buf) goto err_end_io; + bid = mempool_alloc(&bid_pool, GFP_NOIO); + if (!bid) + goto err_free_buf; + bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); - bip = bio_integrity_alloc(bio, GFP_NOIO, 1); - if (IS_ERR(bip)) { - kfree(buf); - goto err_end_io; - } + bid->bio = bio; - bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip_set_seed(bip, bio->bi_iter.bi_sector); + bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bip->bip_flags |= BIP_IP_CHECKSUM; + bid->bip.bip_flags |= BIP_IP_CHECKSUM; if (bi->csum_type) - bip->bip_flags |= BIP_CHECK_GUARD; + bid->bip.bip_flags |= BIP_CHECK_GUARD; if (bi->flags & BLK_INTEGRITY_REF_TAG) - bip->bip_flags |= BIP_CHECK_REFTAG; + bid->bip.bip_flags |= BIP_CHECK_REFTAG; if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) @@ -132,9 +152,11 @@ bool bio_integrity_prep(struct bio *bio) if (bio_data_dir(bio) == WRITE) blk_integrity_generate(bio); else - bip->bio_iter = bio->bi_iter; + bid->saved_bio_iter = bio->bi_iter; return true; +err_free_buf: + kfree(buf); err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); @@ -149,6 +171,13 @@ void blk_flush_integrity(void) static int __init blk_integrity_auto_init(void) { + bid_slab = kmem_cache_create("bio_integrity_data", + sizeof(struct bio_integrity_data), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab)) + panic("bio: can't create integrity pool\n"); + /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. diff --git a/block/bio-integrity.c b/block/bio-integrity.c index aa9f966123194a..608594a154a5b9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -7,13 +7,12 @@ */ #include -#include -#include -#include -#include #include "blk.h" -static struct kmem_cache *bip_slab; +struct bio_integrity_alloc { + struct bio_integrity_payload bip; + struct bio_vec bvecs[]; +}; /** * bio_integrity_free - Free bio integrity payload @@ -23,21 +22,23 @@ static struct kmem_cache *bip_slab; */ void bio_integrity_free(struct bio *bio) { - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_set *bs = bio->bi_pool; - - if (bs && mempool_initialized(&bs->bio_integrity_pool)) { - if (bip->bip_vec) - bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_max_vcnt); - mempool_free(bip, &bs->bio_integrity_pool); - } else { - kfree(bip); - } + kfree(bio_integrity(bio)); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs) +{ + memset(bip, 0, sizeof(*bip)); + bip->bip_max_vcnt = nr_vecs; + if (nr_vecs) + bip->bip_vec = bvecs; + + bio->bi_integrity = bip; + bio->bi_opf |= REQ_INTEGRITY; +} + /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -52,48 +53,16 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned inline_vecs; + struct bio_integrity_alloc *bia; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); - if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { - bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!bip)) + bia = kmalloc(struct_size(bia, bvecs, nr_vecs), gfp_mask); + if (unlikely(!bia)) return ERR_PTR(-ENOMEM); - - memset(bip, 0, sizeof(*bip)); - - /* always report as many vecs as asked explicitly, not inline vecs */ - bip->bip_max_vcnt = nr_vecs; - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, - &bip->bip_max_vcnt, gfp_mask); - if (!bip->bip_vec) - goto err; - } else if (nr_vecs) { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_bio = bio; - bio->bi_integrity = bip; - bio->bi_opf |= REQ_INTEGRITY; - - return bip; -err: - if (bs && mempool_initialized(&bs->bio_integrity_pool)) - mempool_free(bip, &bs->bio_integrity_pool); - else - kfree(bip); - return ERR_PTR(-ENOMEM); + bio_integrity_init(bio, &bia->bip, bia->bvecs, nr_vecs); + return &bia->bip; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -467,35 +436,3 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (mempool_initialized(&bs->bio_integrity_pool)) - return 0; - - if (mempool_init_slab_pool(&bs->bio_integrity_pool, - pool_size, bip_slab)) - return -1; - - if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { - mempool_exit(&bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - mempool_exit(&bs->bio_integrity_pool); - mempool_exit(&bs->bvec_integrity_pool); -} - -void __init bio_integrity_init(void) -{ - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIO_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); -} diff --git a/block/bio.c b/block/bio.c index f0c416e5931d97..dabc1a6c41b1fc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1657,7 +1657,6 @@ void bioset_exit(struct bio_set *bs) mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); - bioset_integrity_free(bs); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; @@ -1737,8 +1736,6 @@ static int __init init_bio(void) BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); - bio_integrity_init(); - for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; @@ -1754,9 +1751,6 @@ static int __init init_bio(void) BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); - if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - return 0; } subsys_initcall(init_bio); diff --git a/block/blk.h b/block/blk.h index 90fa5f28ccabfd..8f5554a6989e58 100644 --- a/block/blk.h +++ b/block/blk.h @@ -710,7 +710,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); void blk_integrity_generate(struct bio *bio); -void blk_integrity_verify(struct bio *bio); +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); diff --git a/block/t10-pi.c b/block/t10-pi.c index 2d05421f0fa566..de172d56b1f3b5 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -404,7 +404,7 @@ void blk_integrity_generate(struct bio *bio) } } -void blk_integrity_verify(struct bio *bio) +void blk_integrity_verify_iter(struct bio *bio, struct bvec_iter *saved_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); @@ -418,9 +418,9 @@ void blk_integrity_verify(struct bio *bio) */ iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; - iter.seed = bip->bio_iter.bi_sector; + iter.seed = saved_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, bip->bio_iter) { + __bio_for_each_segment(bv, bio, bviter, *saved_iter) { void *kaddr = bvec_kmap_local(&bv); blk_status_t ret = BLK_STS_OK; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ee9f7cecd78e0e..e743657379f7fc 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4808,23 +4808,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS); if (r) { ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recalc_bios, 1); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } } ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bf9a61191e9a2f..453803f1edf546 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1081,15 +1081,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->io_bs, pool_size)) - goto out_free_pools; init_bs: if (bioset_init(&pools->bs, pool_size, front_pad, 0)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->bs, pool_size)) - goto out_free_pools; t->mempools = pools; return 0; diff --git a/drivers/md/md.c b/drivers/md/md.c index 30b3dbbce2d2df..79cabe4be77dd3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2359,19 +2359,6 @@ int md_integrity_register(struct mddev *mddev) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. - */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index c8dc92a7d63e64..73564efd11d299 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -167,18 +167,6 @@ static int iblock_configure_device(struct se_device *dev) break; } - if (dev->dev_attrib.pi_prot_type) { - struct bio_set *bs = &ib_dev->ibd_bio_set; - - if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { - pr_err("Unable to allocate bioset for PI\n"); - ret = -ENOMEM; - goto out_blkdev_put; - } - pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", - &bs->bio_integrity_pool); - } - dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; return 0; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 802f52e38efd5b..0a25716820fe07 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -16,8 +16,6 @@ enum bip_flags { }; struct bio_integrity_payload { - struct bio *bip_bio; /* parent bio */ - struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ @@ -25,12 +23,7 @@ struct bio_integrity_payload { unsigned short bip_flags; /* control flags */ u16 app_tag; /* application tag value */ - struct bvec_iter bio_iter; /* for rewinding parent bio */ - - struct work_struct bip_work; /* I/O completion */ - struct bio_vec *bip_vec; - struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ @@ -74,6 +67,8 @@ static inline void bip_set_seed(struct bio_integrity_payload *bip, bip->bip_iter.bi_sector = seed; } +void bio_integrity_init(struct bio *bio, struct bio_integrity_payload *bip, + struct bio_vec *bvecs, unsigned int nr_vecs); struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, @@ -85,9 +80,6 @@ bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); -int bioset_integrity_create(struct bio_set *bs, int pool_size); -void bioset_integrity_free(struct bio_set *bs); -void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ @@ -96,15 +88,6 @@ static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) return NULL; } -static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - return 0; -} - -static inline void bioset_integrity_free(struct bio_set *bs) -{ -} - static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; @@ -139,10 +122,6 @@ static inline void bio_integrity_trim(struct bio *bio) { } -static inline void bio_integrity_init(void) -{ -} - static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; diff --git a/include/linux/bio.h b/include/linux/bio.h index 4b79bf50f4f0e5..cafc7c215de8be 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -625,10 +625,6 @@ struct bio_set { mempool_t bio_pool; mempool_t bvec_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t bio_integrity_pool; - mempool_t bvec_integrity_pool; -#endif unsigned int back_pad; /* From 9fcc4add3c0d6e43788d55bf7a957c1d4de8584a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Feb 2025 18:37:07 +0800 Subject: [PATCH 11/30] ublk: add DMA alignment limit The in-tree ublk driver doesn't need DMA alignment limit because there is one data copy between request pages and the userspace buffer. However, ublk is going to support zero copy, then DMA alignment limit is required, because same IO buffer is forwarded to backend which may have specific buffer DMA alignment limit, so the limit has to be exposed from the frontend driver to client application. Cc: Keith Busch Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250227103707.2640014-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 16 +++++++++++++++- include/uapi/linux/ublk_cmd.h | 7 +++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ff648c6839c154..e8f52d8341fba8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -70,7 +70,8 @@ /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ - UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ + UBLK_PARAM_TYPE_DMA_ALIGN) struct ublk_rq_data { struct llist_node node; @@ -568,6 +569,16 @@ static int ublk_validate_params(const struct ublk_device *ub) else if (ublk_dev_is_zoned(ub)) return -EINVAL; + if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) { + const struct ublk_param_dma_align *p = &ub->params.dma; + + if (p->alignment >= PAGE_SIZE) + return -EINVAL; + + if (!is_power_of_2(p->alignment + 1)) + return -EINVAL; + } + return 0; } @@ -2300,6 +2311,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) lim.features |= BLK_FEAT_ROTATIONAL; + if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) + lim.dma_alignment = ub->params.dma.alignment; + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a8bc98bb69fce3..8093acdeaa1141 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -401,6 +401,11 @@ struct ublk_param_zoned { __u8 reserved[20]; }; +struct ublk_param_dma_align { + __u32 alignment; + __u8 pad[4]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -413,12 +418,14 @@ struct ublk_params { #define UBLK_PARAM_TYPE_DISCARD (1 << 1) #define UBLK_PARAM_TYPE_DEVT (1 << 2) #define UBLK_PARAM_TYPE_ZONED (1 << 3) +#define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) __u32 types; /* types of parameter included */ struct ublk_param_basic basic; struct ublk_param_discard discard; struct ublk_param_devt devt; struct ublk_param_zoned zoned; + struct ublk_param_dma_align dma; }; #endif From ad4801dbfb1ac2144b05d4e443a0010c20236676 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:09 +0100 Subject: [PATCH 12/30] crypto,fs: Separate out hkdf_extract() and hkdf_expand() Separate out the HKDF functions into a separate module to to make them available to other callers. And add a testsuite to the module with test vectors from RFC 5869 (and additional vectors for SHA384 and SHA512) to ensure the integrity of the algorithm. Signed-off-by: Hannes Reinecke Acked-by: Eric Biggers Acked-by: Herbert Xu Signed-off-by: Keith Busch --- crypto/Kconfig | 6 + crypto/Makefile | 1 + crypto/hkdf.c | 573 ++++++++++++++++++++++++++++++++++++++++++ fs/crypto/Kconfig | 1 + fs/crypto/hkdf.c | 85 ++----- include/crypto/hkdf.h | 20 ++ 6 files changed, 616 insertions(+), 70 deletions(-) create mode 100644 crypto/hkdf.c create mode 100644 include/crypto/hkdf.h diff --git a/crypto/Kconfig b/crypto/Kconfig index 74ae5f52b78405..90c17fb5498bec 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -141,6 +141,12 @@ config CRYPTO_ACOMP select CRYPTO_ALGAPI select CRYPTO_ACOMP2 +config CRYPTO_HKDF + tristate + select CRYPTO_SHA256 if !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS + select CRYPTO_SHA512 if !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS + select CRYPTO_HASH2 + config CRYPTO_MANAGER tristate "Cryptographic algorithm manager" select CRYPTO_MANAGER2 diff --git a/crypto/Makefile b/crypto/Makefile index f67e853c469021..71344adb358274 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o obj-$(CONFIG_CRYPTO_AKCIPHER2) += akcipher.o obj-$(CONFIG_CRYPTO_SIG2) += sig.o obj-$(CONFIG_CRYPTO_KPP2) += kpp.o +obj-$(CONFIG_CRYPTO_HKDF) += hkdf.o dh_generic-y := dh.o dh_generic-y += dh_helper.o diff --git a/crypto/hkdf.c b/crypto/hkdf.c new file mode 100644 index 00000000000000..2434c5c425456c --- /dev/null +++ b/crypto/hkdf.c @@ -0,0 +1,573 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation + * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): + * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". + * + * Copyright 2019 Google LLC + */ + +#include +#include +#include +#include + +/* + * HKDF consists of two steps: + * + * 1. HKDF-Extract: extract a pseudorandom key from the input keying material + * and optional salt. + * 2. HKDF-Expand: expand the pseudorandom key into output keying material of + * any length, parameterized by an application-specific info string. + * + */ + +/** + * hkdf_extract - HKDF-Extract (RFC 5869 section 2.2) + * @hmac_tfm: an HMAC transform using the hash function desired for HKDF. The + * caller is responsible for setting the @prk afterwards. + * @ikm: input keying material + * @ikmlen: length of @ikm + * @salt: input salt value + * @saltlen: length of @salt + * @prk: resulting pseudorandom key + * + * Extracts a pseudorandom key @prk from the input keying material + * @ikm with length @ikmlen and salt @salt with length @saltlen. + * The length of @prk is given by the digest size of @hmac_tfm. + * For an 'unsalted' version of HKDF-Extract @salt must be set + * to all zeroes and @saltlen must be set to the length of @prk. + * + * Returns 0 on success with the pseudorandom key stored in @prk, + * or a negative errno value otherwise. + */ +int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, + unsigned int ikmlen, const u8 *salt, unsigned int saltlen, + u8 *prk) +{ + int err; + + err = crypto_shash_setkey(hmac_tfm, salt, saltlen); + if (!err) + err = crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); + + return err; +} +EXPORT_SYMBOL_GPL(hkdf_extract); + +/** + * hkdf_expand - HKDF-Expand (RFC 5869 section 2.3) + * @hmac_tfm: hash context keyed with pseudorandom key + * @info: application-specific information + * @infolen: length of @info + * @okm: output keying material + * @okmlen: length of @okm + * + * This expands the pseudorandom key, which was already keyed into @hmac_tfm, + * into @okmlen bytes of output keying material parameterized by the + * application-specific @info of length @infolen bytes. + * This is thread-safe and may be called by multiple threads in parallel. + * + * Returns 0 on success with output keying material stored in @okm, + * or a negative errno value otherwise. + */ +int hkdf_expand(struct crypto_shash *hmac_tfm, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen) +{ + SHASH_DESC_ON_STACK(desc, hmac_tfm); + unsigned int i, hashlen = crypto_shash_digestsize(hmac_tfm); + int err; + const u8 *prev = NULL; + u8 counter = 1; + u8 tmp[HASH_MAX_DIGESTSIZE] = {}; + + if (WARN_ON(okmlen > 255 * hashlen)) + return -EINVAL; + + desc->tfm = hmac_tfm; + + for (i = 0; i < okmlen; i += hashlen) { + err = crypto_shash_init(desc); + if (err) + goto out; + + if (prev) { + err = crypto_shash_update(desc, prev, hashlen); + if (err) + goto out; + } + + if (infolen) { + err = crypto_shash_update(desc, info, infolen); + if (err) + goto out; + } + + BUILD_BUG_ON(sizeof(counter) != 1); + if (okmlen - i < hashlen) { + err = crypto_shash_finup(desc, &counter, 1, tmp); + if (err) + goto out; + memcpy(&okm[i], tmp, okmlen - i); + memzero_explicit(tmp, sizeof(tmp)); + } else { + err = crypto_shash_finup(desc, &counter, 1, &okm[i]); + if (err) + goto out; + } + counter++; + prev = &okm[i]; + } + err = 0; +out: + if (unlikely(err)) + memzero_explicit(okm, okmlen); /* so caller doesn't need to */ + shash_desc_zero(desc); + memzero_explicit(tmp, HASH_MAX_DIGESTSIZE); + return err; +} +EXPORT_SYMBOL_GPL(hkdf_expand); + +struct hkdf_testvec { + const char *test; + const u8 *ikm; + const u8 *salt; + const u8 *info; + const u8 *prk; + const u8 *okm; + u16 ikm_size; + u16 salt_size; + u16 info_size; + u16 prk_size; + u16 okm_size; +}; + +/* + * HKDF test vectors from RFC5869 + * + * Additional HKDF test vectors from + * https://github.com/brycx/Test-Vector-Generation/blob/master/HKDF/hkdf-hmac-sha2-test-vectors.md + */ +static const struct hkdf_testvec hkdf_sha256_tv[] = { + { + .test = "basic hdkf test", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x07\x77\x09\x36\x2c\x2e\x32\xdf\x0d\xdc\x3f\x0d\xc4\x7b\xba\x63" + "\x90\xb6\xc7\x3b\xb5\x0f\x9c\x31\x22\xec\x84\x4a\xd7\xc2\xb3\xe5", + .prk_size = 32, + .okm = "\x3c\xb2\x5f\x25\xfa\xac\xd5\x7a\x90\x43\x4f\x64\xd0\x36\x2f\x2a" + "\x2d\x2d\x0a\x90\xcf\x1a\x5a\x4c\x5d\xb0\x2d\x56\xec\xc4\xc5\xbf" + "\x34\x00\x72\x08\xd5\xb8\x87\x18\x58\x65", + .okm_size = 42, + }, { + .test = "hkdf test with long input", + .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", + .ikm_size = 80, + .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", + .salt_size = 80, + .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .info_size = 80, + .prk = "\x06\xa6\xb8\x8c\x58\x53\x36\x1a\x06\x10\x4c\x9c\xeb\x35\xb4\x5c" + "\xef\x76\x00\x14\x90\x46\x71\x01\x4a\x19\x3f\x40\xc1\x5f\xc2\x44", + .prk_size = 32, + .okm = "\xb1\x1e\x39\x8d\xc8\x03\x27\xa1\xc8\xe7\xf7\x8c\x59\x6a\x49\x34" + "\x4f\x01\x2e\xda\x2d\x4e\xfa\xd8\xa0\x50\xcc\x4c\x19\xaf\xa9\x7c" + "\x59\x04\x5a\x99\xca\xc7\x82\x72\x71\xcb\x41\xc6\x5e\x59\x0e\x09" + "\xda\x32\x75\x60\x0c\x2f\x09\xb8\x36\x77\x93\xa9\xac\xa3\xdb\x71" + "\xcc\x30\xc5\x81\x79\xec\x3e\x87\xc1\x4c\x01\xd5\xc1\xf3\x43\x4f" + "\x1d\x87", + .okm_size = 82, + }, { + .test = "hkdf test with zero salt and info", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = NULL, + .salt_size = 0, + .info = NULL, + .info_size = 0, + .prk = "\x19\xef\x24\xa3\x2c\x71\x7b\x16\x7f\x33\xa9\x1d\x6f\x64\x8b\xdf" + "\x96\x59\x67\x76\xaf\xdb\x63\x77\xac\x43\x4c\x1c\x29\x3c\xcb\x04", + .prk_size = 32, + .okm = "\x8d\xa4\xe7\x75\xa5\x63\xc1\x8f\x71\x5f\x80\x2a\x06\x3c\x5a\x31" + "\xb8\xa1\x1f\x5c\x5e\xe1\x87\x9e\xc3\x45\x4e\x5f\x3c\x73\x8d\x2d" + "\x9d\x20\x13\x95\xfa\xa4\xb6\x1a\x96\xc8", + .okm_size = 42, + }, { + .test = "hkdf test with short input", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 11, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x82\x65\xf6\x9d\x7f\xf7\xe5\x01\x37\x93\x01\x5c\xa0\xef\x92\x0c" + "\xb1\x68\x21\x99\xc8\xbc\x3a\x00\xda\x0c\xab\x47\xb7\xb0\x0f\xdf", + .prk_size = 32, + .okm = "\x58\xdc\xe1\x0d\x58\x01\xcd\xfd\xa8\x31\x72\x6b\xfe\xbc\xb7\x43" + "\xd1\x4a\x7e\xe8\x3a\xa0\x57\xa9\x3d\x59\xb0\xa1\x31\x7f\xf0\x9d" + "\x10\x5c\xce\xcf\x53\x56\x92\xb1\x4d\xd5", + .okm_size = 42, + }, { + .test = "unsalted hkdf test with zero info", + .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" + "\x0c\x0c\x0c\x0c\x0c\x0c", + .ikm_size = 22, + .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + .salt_size = 32, + .info = NULL, + .info_size = 0, + .prk = "\xaa\x84\x1e\x1f\x35\x74\xf3\x2d\x13\xfb\xa8\x00\x5f\xcd\x9b\x8d" + "\x77\x67\x82\xa5\xdf\xa1\x92\x38\x92\xfd\x8b\x63\x5d\x3a\x89\xdf", + .prk_size = 32, + .okm = "\x59\x68\x99\x17\x9a\xb1\xbc\x00\xa7\xc0\x37\x86\xff\x43\xee\x53" + "\x50\x04\xbe\x2b\xb9\xbe\x68\xbc\x14\x06\x63\x6f\x54\xbd\x33\x8a" + "\x66\xa2\x37\xba\x2a\xcb\xce\xe3\xc9\xa7", + .okm_size = 42, + } +}; + +static const struct hkdf_testvec hkdf_sha384_tv[] = { + { + .test = "basic hkdf test", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x70\x4b\x39\x99\x07\x79\xce\x1d\xc5\x48\x05\x2c\x7d\xc3\x9f\x30" + "\x35\x70\xdd\x13\xfb\x39\xf7\xac\xc5\x64\x68\x0b\xef\x80\xe8\xde" + "\xc7\x0e\xe9\xa7\xe1\xf3\xe2\x93\xef\x68\xec\xeb\x07\x2a\x5a\xde", + .prk_size = 48, + .okm = "\x9b\x50\x97\xa8\x60\x38\xb8\x05\x30\x90\x76\xa4\x4b\x3a\x9f\x38" + "\x06\x3e\x25\xb5\x16\xdc\xbf\x36\x9f\x39\x4c\xfa\xb4\x36\x85\xf7" + "\x48\xb6\x45\x77\x63\xe4\xf0\x20\x4f\xc5", + .okm_size = 42, + }, { + .test = "hkdf test with long input", + .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", + .ikm_size = 80, + .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", + .salt_size = 80, + .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .info_size = 80, + .prk = "\xb3\x19\xf6\x83\x1d\xff\x93\x14\xef\xb6\x43\xba\xa2\x92\x63\xb3" + "\x0e\x4a\x8d\x77\x9f\xe3\x1e\x9c\x90\x1e\xfd\x7d\xe7\x37\xc8\x5b" + "\x62\xe6\x76\xd4\xdc\x87\xb0\x89\x5c\x6a\x7d\xc9\x7b\x52\xce\xbb", + .prk_size = 48, + .okm = "\x48\x4c\xa0\x52\xb8\xcc\x72\x4f\xd1\xc4\xec\x64\xd5\x7b\x4e\x81" + "\x8c\x7e\x25\xa8\xe0\xf4\x56\x9e\xd7\x2a\x6a\x05\xfe\x06\x49\xee" + "\xbf\x69\xf8\xd5\xc8\x32\x85\x6b\xf4\xe4\xfb\xc1\x79\x67\xd5\x49" + "\x75\x32\x4a\x94\x98\x7f\x7f\x41\x83\x58\x17\xd8\x99\x4f\xdb\xd6" + "\xf4\xc0\x9c\x55\x00\xdc\xa2\x4a\x56\x22\x2f\xea\x53\xd8\x96\x7a" + "\x8b\x2e", + .okm_size = 82, + }, { + .test = "hkdf test with zero salt and info", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = NULL, + .salt_size = 0, + .info = NULL, + .info_size = 0, + .prk = "\x10\xe4\x0c\xf0\x72\xa4\xc5\x62\x6e\x43\xdd\x22\xc1\xcf\x72\x7d" + "\x4b\xb1\x40\x97\x5c\x9a\xd0\xcb\xc8\xe4\x5b\x40\x06\x8f\x8f\x0b" + "\xa5\x7c\xdb\x59\x8a\xf9\xdf\xa6\x96\x3a\x96\x89\x9a\xf0\x47\xe5", + .prk_size = 48, + .okm = "\xc8\xc9\x6e\x71\x0f\x89\xb0\xd7\x99\x0b\xca\x68\xbc\xde\xc8\xcf" + "\x85\x40\x62\xe5\x4c\x73\xa7\xab\xc7\x43\xfa\xde\x9b\x24\x2d\xaa" + "\xcc\x1c\xea\x56\x70\x41\x5b\x52\x84\x9c", + .okm_size = 42, + }, { + .test = "hkdf test with short input", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 11, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x6d\x31\x69\x98\x28\x79\x80\x88\xb3\x59\xda\xd5\x0b\x8f\x01\xb0" + "\x15\xf1\x7a\xa3\xbd\x4e\x27\xa6\xe9\xf8\x73\xb7\x15\x85\xca\x6a" + "\x00\xd1\xf0\x82\x12\x8a\xdb\x3c\xf0\x53\x0b\x57\xc0\xf9\xac\x72", + .prk_size = 48, + .okm = "\xfb\x7e\x67\x43\xeb\x42\xcd\xe9\x6f\x1b\x70\x77\x89\x52\xab\x75" + "\x48\xca\xfe\x53\x24\x9f\x7f\xfe\x14\x97\xa1\x63\x5b\x20\x1f\xf1" + "\x85\xb9\x3e\x95\x19\x92\xd8\x58\xf1\x1a", + .okm_size = 42, + }, { + .test = "unsalted hkdf test with zero info", + .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" + "\x0c\x0c\x0c\x0c\x0c\x0c", + .ikm_size = 22, + .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + .salt_size = 48, + .info = NULL, + .info_size = 0, + .prk = "\x9d\x2d\xa5\x06\x6f\x05\xd1\x6c\x59\xfe\xdf\x6c\x5f\x32\xc7\x5e" + "\xda\x9a\x47\xa7\x9c\x93\x6a\xa4\x4c\xb7\x63\xa8\xe2\x2f\xfb\xfc" + "\xd8\xfe\x55\x43\x58\x53\x47\x21\x90\x39\xd1\x68\x28\x36\x33\xf5", + .prk_size = 48, + .okm = "\x6a\xd7\xc7\x26\xc8\x40\x09\x54\x6a\x76\xe0\x54\x5d\xf2\x66\x78" + "\x7e\x2b\x2c\xd6\xca\x43\x73\xa1\xf3\x14\x50\xa7\xbd\xf9\x48\x2b" + "\xfa\xb8\x11\xf5\x54\x20\x0e\xad\x8f\x53", + .okm_size = 42, + } +}; + +static const struct hkdf_testvec hkdf_sha512_tv[] = { + { + .test = "basic hkdf test", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x66\x57\x99\x82\x37\x37\xde\xd0\x4a\x88\xe4\x7e\x54\xa5\x89\x0b" + "\xb2\xc3\xd2\x47\xc7\xa4\x25\x4a\x8e\x61\x35\x07\x23\x59\x0a\x26" + "\xc3\x62\x38\x12\x7d\x86\x61\xb8\x8c\xf8\x0e\xf8\x02\xd5\x7e\x2f" + "\x7c\xeb\xcf\x1e\x00\xe0\x83\x84\x8b\xe1\x99\x29\xc6\x1b\x42\x37", + .prk_size = 64, + .okm = "\x83\x23\x90\x08\x6c\xda\x71\xfb\x47\x62\x5b\xb5\xce\xb1\x68\xe4" + "\xc8\xe2\x6a\x1a\x16\xed\x34\xd9\xfc\x7f\xe9\x2c\x14\x81\x57\x93" + "\x38\xda\x36\x2c\xb8\xd9\xf9\x25\xd7\xcb", + .okm_size = 42, + }, { + .test = "hkdf test with long input", + .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", + .ikm_size = 80, + .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", + .salt_size = 80, + .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .info_size = 80, + .prk = "\x35\x67\x25\x42\x90\x7d\x4e\x14\x2c\x00\xe8\x44\x99\xe7\x4e\x1d" + "\xe0\x8b\xe8\x65\x35\xf9\x24\xe0\x22\x80\x4a\xd7\x75\xdd\xe2\x7e" + "\xc8\x6c\xd1\xe5\xb7\xd1\x78\xc7\x44\x89\xbd\xbe\xb3\x07\x12\xbe" + "\xb8\x2d\x4f\x97\x41\x6c\x5a\x94\xea\x81\xeb\xdf\x3e\x62\x9e\x4a", + .prk_size = 64, + .okm = "\xce\x6c\x97\x19\x28\x05\xb3\x46\xe6\x16\x1e\x82\x1e\xd1\x65\x67" + "\x3b\x84\xf4\x00\xa2\xb5\x14\xb2\xfe\x23\xd8\x4c\xd1\x89\xdd\xf1" + "\xb6\x95\xb4\x8c\xbd\x1c\x83\x88\x44\x11\x37\xb3\xce\x28\xf1\x6a" + "\xa6\x4b\xa3\x3b\xa4\x66\xb2\x4d\xf6\xcf\xcb\x02\x1e\xcf\xf2\x35" + "\xf6\xa2\x05\x6c\xe3\xaf\x1d\xe4\x4d\x57\x20\x97\xa8\x50\x5d\x9e" + "\x7a\x93", + .okm_size = 82, + }, { + .test = "hkdf test with zero salt and info", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" + "\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 22, + .salt = NULL, + .salt_size = 0, + .info = NULL, + .info_size = 0, + .prk = "\xfd\x20\x0c\x49\x87\xac\x49\x13\x13\xbd\x4a\x2a\x13\x28\x71\x21" + "\x24\x72\x39\xe1\x1c\x9e\xf8\x28\x02\x04\x4b\x66\xef\x35\x7e\x5b" + "\x19\x44\x98\xd0\x68\x26\x11\x38\x23\x48\x57\x2a\x7b\x16\x11\xde" + "\x54\x76\x40\x94\x28\x63\x20\x57\x8a\x86\x3f\x36\x56\x2b\x0d\xf6", + .prk_size = 64, + .okm = "\xf5\xfa\x02\xb1\x82\x98\xa7\x2a\x8c\x23\x89\x8a\x87\x03\x47\x2c" + "\x6e\xb1\x79\xdc\x20\x4c\x03\x42\x5c\x97\x0e\x3b\x16\x4b\xf9\x0f" + "\xff\x22\xd0\x48\x36\xd0\xe2\x34\x3b\xac", + .okm_size = 42, + }, { + .test = "hkdf test with short input", + .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", + .ikm_size = 11, + .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", + .salt_size = 13, + .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + .info_size = 10, + .prk = "\x67\x40\x9c\x9c\xac\x28\xb5\x2e\xe9\xfa\xd9\x1c\x2f\xda\x99\x9f" + "\x7c\xa2\x2e\x34\x34\xf0\xae\x77\x28\x63\x83\x65\x68\xad\x6a\x7f" + "\x10\xcf\x11\x3b\xfd\xdd\x56\x01\x29\xa5\x94\xa8\xf5\x23\x85\xc2" + "\xd6\x61\xd7\x85\xd2\x9c\xe9\x3a\x11\x40\x0c\x92\x06\x83\x18\x1d", + .prk_size = 64, + .okm = "\x74\x13\xe8\x99\x7e\x02\x06\x10\xfb\xf6\x82\x3f\x2c\xe1\x4b\xff" + "\x01\x87\x5d\xb1\xca\x55\xf6\x8c\xfc\xf3\x95\x4d\xc8\xaf\xf5\x35" + "\x59\xbd\x5e\x30\x28\xb0\x80\xf7\xc0\x68", + .okm_size = 42, + }, { + .test = "unsalted hkdf test with zero info", + .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" + "\x0c\x0c\x0c\x0c\x0c\x0c", + .ikm_size = 22, + .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + .salt_size = 64, + .info = NULL, + .info_size = 0, + .prk = "\x53\x46\xb3\x76\xbf\x3a\xa9\xf8\x4f\x8f\x6e\xd5\xb1\xc4\xf4\x89" + "\x17\x2e\x24\x4d\xac\x30\x3d\x12\xf6\x8e\xcc\x76\x6e\xa6\x00\xaa" + "\x88\x49\x5e\x7f\xb6\x05\x80\x31\x22\xfa\x13\x69\x24\xa8\x40\xb1" + "\xf0\x71\x9d\x2d\x5f\x68\xe2\x9b\x24\x22\x99\xd7\x58\xed\x68\x0c", + .prk_size = 64, + .okm = "\x14\x07\xd4\x60\x13\xd9\x8b\xc6\xde\xce\xfc\xfe\xe5\x5f\x0f\x90" + "\xb0\xc7\xf6\x3d\x68\xeb\x1a\x80\xea\xf0\x7e\x95\x3c\xfc\x0a\x3a" + "\x52\x40\xa1\x55\xd6\xe4\xda\xa9\x65\xbb", + .okm_size = 42, + } +}; + +static int hkdf_test(const char *shash, const struct hkdf_testvec *tv) +{ struct crypto_shash *tfm = NULL; + u8 *prk = NULL, *okm = NULL; + unsigned int prk_size; + const char *driver; + int err; + + tfm = crypto_alloc_shash(shash, 0, 0); + if (IS_ERR(tfm)) { + pr_err("%s(%s): failed to allocate transform: %ld\n", + tv->test, shash, PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + driver = crypto_shash_driver_name(tfm); + + prk_size = crypto_shash_digestsize(tfm); + prk = kzalloc(prk_size, GFP_KERNEL); + if (!prk) { + err = -ENOMEM; + goto out_free; + } + + if (tv->prk_size != prk_size) { + pr_err("%s(%s): prk size mismatch (vec %u, digest %u\n", + tv->test, driver, tv->prk_size, prk_size); + err = -EINVAL; + goto out_free; + } + + err = hkdf_extract(tfm, tv->ikm, tv->ikm_size, + tv->salt, tv->salt_size, prk); + if (err) { + pr_err("%s(%s): hkdf_extract failed with %d\n", + tv->test, driver, err); + goto out_free; + } + + if (memcmp(prk, tv->prk, tv->prk_size)) { + pr_err("%s(%s): hkdf_extract prk mismatch\n", + tv->test, driver); + print_hex_dump(KERN_ERR, "prk: ", DUMP_PREFIX_NONE, + 16, 1, prk, tv->prk_size, false); + err = -EINVAL; + goto out_free; + } + + okm = kzalloc(tv->okm_size, GFP_KERNEL); + if (!okm) { + err = -ENOMEM; + goto out_free; + } + + err = crypto_shash_setkey(tfm, tv->prk, tv->prk_size); + if (err) { + pr_err("%s(%s): failed to set prk, error %d\n", + tv->test, driver, err); + goto out_free; + } + + err = hkdf_expand(tfm, tv->info, tv->info_size, + okm, tv->okm_size); + if (err) { + pr_err("%s(%s): hkdf_expand() failed with %d\n", + tv->test, driver, err); + } else if (memcmp(okm, tv->okm, tv->okm_size)) { + pr_err("%s(%s): hkdf_expand() okm mismatch\n", + tv->test, driver); + print_hex_dump(KERN_ERR, "okm: ", DUMP_PREFIX_NONE, + 16, 1, okm, tv->okm_size, false); + err = -EINVAL; + } +out_free: + kfree(okm); + kfree(prk); + crypto_free_shash(tfm); + return err; +} + +static int __init crypto_hkdf_module_init(void) +{ + int ret = 0, i; + + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return 0; + + for (i = 0; i < ARRAY_SIZE(hkdf_sha256_tv); i++) { + ret = hkdf_test("hmac(sha256)", &hkdf_sha256_tv[i]); + if (ret) + return ret; + } + for (i = 0; i < ARRAY_SIZE(hkdf_sha384_tv); i++) { + ret = hkdf_test("hmac(sha384)", &hkdf_sha384_tv[i]); + if (ret) + return ret; + } + for (i = 0; i < ARRAY_SIZE(hkdf_sha512_tv); i++) { + ret = hkdf_test("hmac(sha512)", &hkdf_sha512_tv[i]); + if (ret) + return ret; + } + return 0; +} + +static void __exit crypto_hkdf_module_exit(void) {} + +module_init(crypto_hkdf_module_init); +module_exit(crypto_hkdf_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("HMAC-based Key Derivation Function (HKDF)"); diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 5aff5934baa121..0ea56f588a4ad1 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -3,6 +3,7 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO select CRYPTO_HASH + select CRYPTO_HKDF select CRYPTO_SKCIPHER select CRYPTO_LIB_SHA256 select KEYS diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 5a384dad2c72f3..855a0f4b7318b7 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation - * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): - * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". - * * This is used to derive keys from the fscrypt master keys. * * Copyright 2019 Google LLC @@ -11,6 +7,7 @@ #include #include +#include #include "fscrypt_private.h" @@ -44,20 +41,6 @@ * there's no way to persist a random salt per master key from kernel mode. */ -/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ -static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) -{ - static const u8 default_salt[HKDF_HASHLEN]; - int err; - - err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); - if (err) - return err; - - return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); -} - /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. @@ -69,6 +52,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; + static const u8 default_salt[HKDF_HASHLEN]; u8 prk[HKDF_HASHLEN]; int err; @@ -84,7 +68,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, goto err_free_tfm; } - err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + err = hkdf_extract(hmac_tfm, master_key, master_key_size, + default_salt, HKDF_HASHLEN, prk); if (err) goto err_free_tfm; @@ -118,61 +103,21 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); - u8 prefix[9]; - unsigned int i; + u8 *full_info; int err; - const u8 *prev = NULL; - u8 counter = 1; - u8 tmp[HKDF_HASHLEN]; - - if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) - return -EINVAL; + full_info = kzalloc(infolen + 9, GFP_KERNEL); + if (!full_info) + return -ENOMEM; desc->tfm = hkdf->hmac_tfm; - memcpy(prefix, "fscrypt\0", 8); - prefix[8] = context; - - for (i = 0; i < okmlen; i += HKDF_HASHLEN) { - - err = crypto_shash_init(desc); - if (err) - goto out; - - if (prev) { - err = crypto_shash_update(desc, prev, HKDF_HASHLEN); - if (err) - goto out; - } - - err = crypto_shash_update(desc, prefix, sizeof(prefix)); - if (err) - goto out; - - err = crypto_shash_update(desc, info, infolen); - if (err) - goto out; - - BUILD_BUG_ON(sizeof(counter) != 1); - if (okmlen - i < HKDF_HASHLEN) { - err = crypto_shash_finup(desc, &counter, 1, tmp); - if (err) - goto out; - memcpy(&okm[i], tmp, okmlen - i); - memzero_explicit(tmp, sizeof(tmp)); - } else { - err = crypto_shash_finup(desc, &counter, 1, &okm[i]); - if (err) - goto out; - } - counter++; - prev = &okm[i]; - } - err = 0; -out: - if (unlikely(err)) - memzero_explicit(okm, okmlen); /* so caller doesn't need to */ - shash_desc_zero(desc); + memcpy(full_info, "fscrypt\0", 8); + full_info[8] = context; + memcpy(full_info + 9, info, infolen); + + err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9, + okm, okmlen); + kfree_sensitive(full_info); return err; } diff --git a/include/crypto/hkdf.h b/include/crypto/hkdf.h new file mode 100644 index 00000000000000..6a9678f508f5d6 --- /dev/null +++ b/include/crypto/hkdf.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HKDF: HMAC-based Key Derivation Function (HKDF), RFC 5869 + * + * Extracted from fs/crypto/hkdf.c, which has + * Copyright 2019 Google LLC + */ + +#ifndef _CRYPTO_HKDF_H +#define _CRYPTO_HKDF_H + +#include + +int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, + unsigned int ikmlen, const u8 *salt, unsigned int saltlen, + u8 *prk); +int hkdf_expand(struct crypto_shash *hmac_tfm, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); +#endif From 894e2820c603f3a39bc6fa09b51638376be0bfbc Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:10 +0100 Subject: [PATCH 13/30] nvme: add nvme_auth_generate_psk() Add a function to generate a NVMe PSK from the shared credentials negotiated by DH-HMAC-CHAP. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 87 ++++++++++++++++++++++++++++++++++++++ include/linux/nvme-auth.h | 3 ++ 2 files changed, 90 insertions(+) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 9b7126e1a19d9e..76e4a12f005e82 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -471,5 +472,91 @@ int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) } EXPORT_SYMBOL_GPL(nvme_auth_generate_key); +/** + * nvme_auth_generate_psk - Generate a PSK for TLS + * @hmac_id: Hash function identifier + * @skey: Session key + * @skey_len: Length of @skey + * @c1: Value of challenge C1 + * @c2: Value of challenge C2 + * @hash_len: Hash length of the hash algorithm + * @ret_psk: Pointer too the resulting generated PSK + * @ret_len: length of @ret_psk + * + * Generate a PSK for TLS as specified in NVMe base specification, section + * 8.13.5.9: Generated PSK for TLS + * + * The generated PSK for TLS shall be computed applying the HMAC function + * using the hash function H( ) selected by the HashID parameter in the + * DH-HMAC-CHAP_Challenge message with the session key KS as key to the + * concatenation of the two challenges C1 and C2 (i.e., generated + * PSK = HMAC(KS, C1 || C2)). + * + * Returns 0 on success with a valid generated PSK pointer in @ret_psk and + * the length of @ret_psk in @ret_len, or a negative error number otherwise. + */ +int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len) +{ + struct crypto_shash *tfm; + SHASH_DESC_ON_STACK(shash, tfm); + u8 *psk; + const char *hmac_name; + int ret, psk_len; + + if (!c1 || !c2) + return -EINVAL; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + psk_len = crypto_shash_digestsize(tfm); + psk = kzalloc(psk_len, GFP_KERNEL); + if (!psk) { + ret = -ENOMEM; + goto out_free_tfm; + } + + shash->tfm = tfm; + ret = crypto_shash_setkey(tfm, skey, skey_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_init(shash); + if (ret) + goto out_free_psk; + + ret = crypto_shash_update(shash, c1, hash_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_update(shash, c2, hash_len); + if (ret) + goto out_free_psk; + + ret = crypto_shash_final(shash, psk); + if (!ret) { + *ret_psk = psk; + *ret_len = psk_len; + } + +out_free_psk: + if (ret) + kfree_sensitive(psk); +out_free_tfm: + crypto_free_shash(tfm); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); + MODULE_DESCRIPTION("NVMe Authentication framework"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index c1d0bc5d962446..b13884b04dfd8f 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -40,5 +40,8 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len); +int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *c1, u8 *c2, size_t hash_len, + u8 **ret_psk, size_t *ret_len); #endif /* _NVME_AUTH_H */ From cf38acda1fc1bf515c6eb43fb4716d4559b9e070 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:11 +0100 Subject: [PATCH 14/30] nvme: add nvme_auth_generate_digest() Add a function to calculate the PSK digest as specified in TP8018. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 134 +++++++++++++++++++++++++++++++++++++ include/linux/nvme-auth.h | 2 + 2 files changed, 136 insertions(+) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 76e4a12f005e82..54b75d17710169 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -558,5 +558,139 @@ int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, } EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); +/** + * nvme_auth_generate_digest - Generate TLS PSK digest + * @hmac_id: Hash function identifier + * @psk: Generated input PSK + * @psk_len: Length of @psk + * @subsysnqn: NQN of the subsystem + * @hostnqn: NQN of the host + * @ret_digest: Pointer to the returned digest + * + * Generate a TLS PSK digest as specified in TP8018 Section 3.6.1.3: + * TLS PSK and PSK identity Derivation + * + * The PSK digest shall be computed by encoding in Base64 (refer to RFC + * 4648) the result of the application of the HMAC function using the hash + * function specified in item 4 above (ie the hash function of the cipher + * suite associated with the PSK identity) with the PSK as HMAC key to the + * concatenation of: + * - the NQN of the host (i.e., NQNh) not including the null terminator; + * - a space character; + * - the NQN of the NVM subsystem (i.e., NQNc) not including the null + * terminator; + * - a space character; and + * - the seventeen ASCII characters "NVMe-over-Fabrics" + * (i.e., = Base64(HMAC(PSK, NQNh || " " || NQNc || " " || + * "NVMe-over-Fabrics"))). + * The length of the PSK digest depends on the hash function used to compute + * it as follows: + * - If the SHA-256 hash function is used, the resulting PSK digest is 44 + * characters long; or + * - If the SHA-384 hash function is used, the resulting PSK digest is 64 + * characters long. + * + * Returns 0 on success with a valid digest pointer in @ret_digest, or a + * negative error number on failure. + */ +int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, + char *subsysnqn, char *hostnqn, u8 **ret_digest) +{ + struct crypto_shash *tfm; + SHASH_DESC_ON_STACK(shash, tfm); + u8 *digest, *enc; + const char *hmac_name; + size_t digest_len, hmac_len; + int ret; + + if (WARN_ON(!subsysnqn || !hostnqn)) + return -EINVAL; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + + switch (nvme_auth_hmac_hash_len(hmac_id)) { + case 32: + hmac_len = 44; + break; + case 48: + hmac_len = 64; + break; + default: + pr_warn("%s: invalid hash algorithm '%s'\n", + __func__, hmac_name); + return -EINVAL; + } + + enc = kzalloc(hmac_len + 1, GFP_KERNEL); + if (!enc) + return -ENOMEM; + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out_free_enc; + } + + digest_len = crypto_shash_digestsize(tfm); + digest = kzalloc(digest_len, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_tfm; + } + + shash->tfm = tfm; + ret = crypto_shash_setkey(tfm, psk, psk_len); + if (ret) + goto out_free_digest; + + ret = crypto_shash_init(shash); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn)); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, " ", 1); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn)); + if (ret) + goto out_free_digest; + + ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18); + if (ret) + goto out_free_digest; + + ret = crypto_shash_final(shash, digest); + if (ret) + goto out_free_digest; + + ret = base64_encode(digest, digest_len, enc); + if (ret < hmac_len) { + ret = -ENOKEY; + goto out_free_digest; + } + *ret_digest = enc; + ret = 0; + +out_free_digest: + kfree_sensitive(digest); +out_free_tfm: + crypto_free_shash(tfm); +out_free_enc: + if (ret) + kfree_sensitive(enc); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); + MODULE_DESCRIPTION("NVMe Authentication framework"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index b13884b04dfd8f..998f06bf10fd3d 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -43,5 +43,7 @@ int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); +int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, + char *subsysnqn, char *hostnqn, u8 **ret_digest); #endif /* _NVME_AUTH_H */ From 423e1059ea629a957718879d537cf086f4fdf3cd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:12 +0100 Subject: [PATCH 15/30] nvme: add nvme_auth_derive_tls_psk() Add a function to derive the TLS PSK as specified TP8018. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/Kconfig | 1 + drivers/nvme/common/auth.c | 116 ++++++++++++++++++++++++++++++++++++ include/linux/nvme-auth.h | 2 + 3 files changed, 119 insertions(+) diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index 244432e0b73d8e..da963e4f3f1f82 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -12,3 +12,4 @@ config NVME_AUTH select CRYPTO_SHA512 select CRYPTO_DH select CRYPTO_DH_RFC7919_GROUPS + select CRYPTO_HKDF diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 54b75d17710169..2c092ec8c0a94c 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -15,6 +15,8 @@ #include #include +#define HKDF_MAX_HASHLEN 64 + static u32 nvme_dhchap_seqnum; static DEFINE_MUTEX(nvme_dhchap_mutex); @@ -692,5 +694,119 @@ int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, } EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); +/** + * nvme_auth_derive_tls_psk - Derive TLS PSK + * @hmac_id: Hash function identifier + * @psk: generated input PSK + * @psk_len: size of @psk + * @psk_digest: TLS PSK digest + * @ret_psk: Pointer to the resulting TLS PSK + * + * Derive a TLS PSK as specified in TP8018 Section 3.6.1.3: + * TLS PSK and PSK identity Derivation + * + * The TLS PSK shall be derived as follows from an input PSK + * (i.e., either a retained PSK or a generated PSK) and a PSK + * identity using the HKDF-Extract and HKDF-Expand-Label operations + * (refer to RFC 5869 and RFC 8446) where the hash function is the + * one specified by the hash specifier of the PSK identity: + * 1. PRK = HKDF-Extract(0, Input PSK); and + * 2. TLS PSK = HKDF-Expand-Label(PRK, "nvme-tls-psk", PskIdentityContext, L), + * where PskIdentityContext is the hash identifier indicated in + * the PSK identity concatenated to a space character and to the + * Base64 PSK digest (i.e., " ") and L is the + * output size in bytes of the hash function (i.e., 32 for SHA-256 + * and 48 for SHA-384). + * + * Returns 0 on success with a valid psk pointer in @ret_psk or a negative + * error number otherwise. + */ +int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, + u8 *psk_digest, u8 **ret_psk) +{ + struct crypto_shash *hmac_tfm; + const char *hmac_name; + const char *psk_prefix = "tls13 nvme-tls-psk"; + static const char default_salt[HKDF_MAX_HASHLEN]; + size_t info_len, prk_len; + char *info; + unsigned char *prk, *tls_key; + int ret; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algorithm %d\n", + __func__, hmac_id); + return -EINVAL; + } + if (hmac_id == NVME_AUTH_HASH_SHA512) { + pr_warn("%s: unsupported hash algorithm %s\n", + __func__, hmac_name); + return -EINVAL; + } + + hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(hmac_tfm)) + return PTR_ERR(hmac_tfm); + + prk_len = crypto_shash_digestsize(hmac_tfm); + prk = kzalloc(prk_len, GFP_KERNEL); + if (!prk) { + ret = -ENOMEM; + goto out_free_shash; + } + + if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) { + ret = -EINVAL; + goto out_free_prk; + } + ret = hkdf_extract(hmac_tfm, psk, psk_len, + default_salt, prk_len, prk); + if (ret) + goto out_free_prk; + + ret = crypto_shash_setkey(hmac_tfm, prk, prk_len); + if (ret) + goto out_free_prk; + + /* + * 2 addtional bytes for the length field from HDKF-Expand-Label, + * 2 addtional bytes for the HMAC ID, and one byte for the space + * separator. + */ + info_len = strlen(psk_digest) + strlen(psk_prefix) + 5; + info = kzalloc(info_len + 1, GFP_KERNEL); + if (!info) { + ret = -ENOMEM; + goto out_free_prk; + } + + put_unaligned_be16(psk_len, info); + memcpy(info + 2, psk_prefix, strlen(psk_prefix)); + sprintf(info + 2 + strlen(psk_prefix), "%02d %s", hmac_id, psk_digest); + + tls_key = kzalloc(psk_len, GFP_KERNEL); + if (!tls_key) { + ret = -ENOMEM; + goto out_free_info; + } + ret = hkdf_expand(hmac_tfm, info, info_len, tls_key, psk_len); + if (ret) { + kfree(tls_key); + goto out_free_info; + } + *ret_psk = tls_key; + +out_free_info: + kfree(info); +out_free_prk: + kfree(prk); +out_free_shash: + crypto_free_shash(hmac_tfm); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_derive_tls_psk); + MODULE_DESCRIPTION("NVMe Authentication framework"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 998f06bf10fd3d..60e069a6757ff6 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -45,5 +45,7 @@ int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, u8 **ret_psk, size_t *ret_len); int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, char *subsysnqn, char *hostnqn, u8 **ret_digest); +int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, + u8 *psk_digest, u8 **ret_psk); #endif /* _NVME_AUTH_H */ From cf1c1e40c4e71ffcde038bc0d874dbfb0c7446dd Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:13 +0100 Subject: [PATCH 16/30] nvme-keyring: add nvme_tls_psk_refresh() Add a function to refresh a generated PSK in the specified keyring. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/keyring.c | 65 ++++++++++++++++++++++++++++++++++- drivers/nvme/host/tcp.c | 1 - drivers/nvme/target/tcp.c | 1 - include/linux/nvme-keyring.h | 12 ++++++- 4 files changed, 75 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c index ed5167f942d89b..32d16c53133bea 100644 --- a/drivers/nvme/common/keyring.c +++ b/drivers/nvme/common/keyring.c @@ -5,7 +5,6 @@ #include #include -#include #include #include #include @@ -124,6 +123,70 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, return key_ref_to_ptr(keyref); } +/** + * nvme_tls_psk_refresh - Refresh TLS PSK + * @keyring: Keyring holding the TLS PSK + * @hostnqn: Host NQN to use + * @subnqn: Subsystem NQN to use + * @hmac_id: Hash function identifier + * @data: TLS PSK key material + * @data_len: Length of @data + * @digest: TLS PSK digest + * + * Refresh a generated version 1 TLS PSK with the identity generated + * from @hmac_id, @hostnqn, @subnqn, and @digest in the keyring given + * by @keyring. + * + * Returns the updated key success or an error pointer otherwise. + */ +struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, const char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest) +{ + key_perm_t keyperm = + KEY_POS_SEARCH | KEY_POS_VIEW | KEY_POS_READ | + KEY_POS_WRITE | KEY_POS_LINK | KEY_POS_SETATTR | + KEY_USR_SEARCH | KEY_USR_VIEW | KEY_USR_READ; + char *identity; + key_ref_t keyref; + key_serial_t keyring_id; + struct key *key; + + if (!hostnqn || !subnqn || !data || !data_len) + return ERR_PTR(-EINVAL); + + identity = kasprintf(GFP_KERNEL, "NVMe1G%02d %s %s %s", + hmac_id, hostnqn, subnqn, digest); + if (!identity) + return ERR_PTR(-ENOMEM); + + if (!keyring) + keyring = nvme_keyring; + keyring_id = key_serial(keyring); + pr_debug("keyring %x refresh tls psk '%s'\n", + keyring_id, identity); + keyref = key_create_or_update(make_key_ref(keyring, true), + "psk", identity, data, data_len, + keyperm, KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(keyref)) { + pr_debug("refresh tls psk '%s' failed, error %ld\n", + identity, PTR_ERR(keyref)); + kfree(identity); + return ERR_PTR(-ENOKEY); + } + kfree(identity); + /* + * Set the default timeout to 1 hour + * as suggested in TP8018. + */ + key = key_ref_to_ptr(keyref); + key_set_timeout(key, 3600); + return key; +} +EXPORT_SYMBOL_GPL(nvme_tls_psk_refresh); + /* * NVMe PSK priority list * diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 841238f38fddab..b50972257e49d4 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 7c51c2a8c109a9..fa59a7996efa24 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h index 19d2b256180fd7..ab8971afa9734a 100644 --- a/include/linux/nvme-keyring.h +++ b/include/linux/nvme-keyring.h @@ -6,15 +6,25 @@ #ifndef _NVME_KEYRING_H #define _NVME_KEYRING_H +#include + #if IS_ENABLED(CONFIG_NVME_KEYRING) +struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, const char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest); key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn); key_serial_t nvme_keyring_id(void); struct key *nvme_tls_key_lookup(key_serial_t key_id); #else - +static inline struct key *nvme_tls_psk_refresh(struct key *keyring, + const char *hostnqn, char *subnqn, u8 hmac_id, + u8 *data, size_t data_len, const char *digest) +{ + return ERR_PTR(-ENOTSUPP); +} static inline key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn) { From 307a347fcf0dbf5ff8ad4c7640197e91bd0cb2fe Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:14 +0100 Subject: [PATCH 17/30] nvme-tcp: request secure channel concatenation Add a fabrics option 'concat' to request secure channel concatenation as specified the NVME Base Specification v2.1, section 8.3.4.3: Secure Channel Concatenation. When secure channel concatenation is enabled a 'generated PSK' is inserted into the keyring such that it's available after reset. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/Kconfig | 2 +- drivers/nvme/host/auth.c | 115 +++++++++++++++++++++++++++++++++++- drivers/nvme/host/fabrics.c | 34 ++++++++++- drivers/nvme/host/fabrics.h | 3 + drivers/nvme/host/nvme.h | 2 + drivers/nvme/host/sysfs.c | 4 +- drivers/nvme/host/tcp.c | 53 +++++++++++++++-- include/linux/nvme.h | 7 +++ 8 files changed, 205 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 486afe59818454..10e453b2436e80 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -109,7 +109,7 @@ config NVME_HOST_AUTH bool "NVMe over Fabrics In-Band Authentication in host side" depends on NVME_CORE select NVME_AUTH - select NVME_KEYRING if NVME_TCP_TLS + select NVME_KEYRING help This provides support for NVMe over Fabrics In-Band Authentication in host side. diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 5ea0e21709da37..6115fef74c1e93 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -12,6 +12,7 @@ #include "nvme.h" #include "fabrics.h" #include +#include #define CHAP_BUF_SIZE 4096 static struct kmem_cache *nvme_chap_buf_cache; @@ -131,7 +132,13 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, data->auth_type = NVME_AUTH_COMMON_MESSAGES; data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; data->t_id = cpu_to_le16(chap->transaction); - data->sc_c = 0; /* No secure channel concatenation */ + if (ctrl->opts->concat && chap->qid == 0) { + if (ctrl->opts->tls_key) + data->sc_c = NVME_AUTH_SECP_REPLACETLSPSK; + else + data->sc_c = NVME_AUTH_SECP_NEWTLSPSK; + } else + data->sc_c = NVME_AUTH_SECP_NOSC; data->napd = 1; data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID; data->auth_protocol[0].dhchap.halen = 3; @@ -311,8 +318,9 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, data->hl = chap->hash_len; data->dhvlen = cpu_to_le16(chap->host_key_len); memcpy(data->rval, chap->response, chap->hash_len); - if (ctrl->ctrl_key) { + if (ctrl->ctrl_key) chap->bi_directional = true; + if (ctrl->ctrl_key || ctrl->opts->concat) { get_random_bytes(chap->c2, chap->hash_len); data->cvalid = 1; memcpy(data->rval + chap->hash_len, chap->c2, @@ -322,7 +330,10 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, } else { memset(chap->c2, 0, chap->hash_len); } - chap->s2 = nvme_auth_get_seqnum(); + if (ctrl->opts->concat) + chap->s2 = 0; + else + chap->s2 = nvme_auth_get_seqnum(); data->seqnum = cpu_to_le32(chap->s2); if (chap->host_key_len) { dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n", @@ -677,6 +688,92 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) crypto_free_kpp(chap->dh_tfm); } +void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) +{ + dev_dbg(ctrl->device, "Wipe generated TLS PSK %08x\n", + key_serial(ctrl->opts->tls_key)); + key_revoke(ctrl->opts->tls_key); + key_put(ctrl->opts->tls_key); + ctrl->opts->tls_key = NULL; +} +EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key); + +static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + u8 *psk, *digest, *tls_psk; + struct key *tls_key; + size_t psk_len; + int ret = 0; + + if (!chap->sess_key) { + dev_warn(ctrl->device, + "%s: qid %d no session key negotiated\n", + __func__, chap->qid); + return -ENOKEY; + } + + if (chap->qid) { + dev_warn(ctrl->device, + "qid %d: secure concatenation not supported on I/O queues\n", + chap->qid); + return -EINVAL; + } + ret = nvme_auth_generate_psk(chap->hash_id, chap->sess_key, + chap->sess_key_len, + chap->c1, chap->c2, + chap->hash_len, &psk, &psk_len); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to generate PSK, error %d\n", + __func__, chap->qid, ret); + return ret; + } + dev_dbg(ctrl->device, + "%s: generated psk %*ph\n", __func__, (int)psk_len, psk); + + ret = nvme_auth_generate_digest(chap->hash_id, psk, psk_len, + ctrl->opts->subsysnqn, + ctrl->opts->host->nqn, &digest); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to generate digest, error %d\n", + __func__, chap->qid, ret); + goto out_free_psk; + }; + dev_dbg(ctrl->device, "%s: generated digest %s\n", + __func__, digest); + ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len, + digest, &tls_psk); + if (ret) { + dev_warn(ctrl->device, + "%s: qid %d failed to derive TLS psk, error %d\n", + __func__, chap->qid, ret); + goto out_free_digest; + }; + + tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring, + ctrl->opts->host->nqn, + ctrl->opts->subsysnqn, chap->hash_id, + tls_psk, psk_len, digest); + if (IS_ERR(tls_key)) { + ret = PTR_ERR(tls_key); + dev_warn(ctrl->device, + "%s: qid %d failed to insert generated key, error %d\n", + __func__, chap->qid, ret); + tls_key = NULL; + } + kfree_sensitive(tls_psk); + if (ctrl->opts->tls_key) + nvme_auth_revoke_tls_key(ctrl); + ctrl->opts->tls_key = tls_key; +out_free_digest: + kfree_sensitive(digest); +out_free_psk: + kfree_sensitive(psk); + return ret; +} + static void nvme_queue_auth_work(struct work_struct *work) { struct nvme_dhchap_queue_context *chap = @@ -833,6 +930,13 @@ static void nvme_queue_auth_work(struct work_struct *work) } if (!ret) { chap->error = 0; + if (ctrl->opts->concat && + (ret = nvme_auth_secure_concat(ctrl, chap))) { + dev_warn(ctrl->device, + "%s: qid %d failed to enable secure concatenation\n", + __func__, chap->qid); + chap->error = ret; + } return; } @@ -912,6 +1016,11 @@ static void nvme_ctrl_auth_work(struct work_struct *work) "qid 0: authentication failed\n"); return; } + /* + * Only run authentication on the admin queue for secure concatenation. + */ + if (ctrl->opts->concat) + return; for (q = 1; q < ctrl->queue_count; q++) { ret = nvme_auth_negotiate(ctrl, q); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 432efcbf9e2f5f..93e9041b9657eb 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -472,8 +472,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) result = le32_to_cpu(res.u32); ctrl->cntlid = result & 0xFFFF; if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) { - /* Secure concatenation is not implemented */ - if (result & NVME_CONNECT_AUTHREQ_ASCR) { + /* Check for secure concatenation */ + if ((result & NVME_CONNECT_AUTHREQ_ASCR) && + !ctrl->opts->concat) { dev_warn(ctrl->device, "qid 0: secure concatenation is not supported\n"); ret = -EOPNOTSUPP; @@ -550,7 +551,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) /* Secure concatenation is not implemented */ if (result & NVME_CONNECT_AUTHREQ_ASCR) { dev_warn(ctrl->device, - "qid 0: secure concatenation is not supported\n"); + "qid %d: secure concatenation is not supported\n", qid); ret = -EOPNOTSUPP; goto out_free_data; } @@ -706,6 +707,7 @@ static const match_table_t opt_tokens = { #endif #ifdef CONFIG_NVME_TCP_TLS { NVMF_OPT_TLS, "tls" }, + { NVMF_OPT_CONCAT, "concat" }, #endif { NVMF_OPT_ERR, NULL } }; @@ -735,6 +737,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->tls = false; opts->tls_key = NULL; opts->keyring = NULL; + opts->concat = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -1053,6 +1056,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->tls = true; break; + case NVMF_OPT_CONCAT: + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) { + pr_err("TLS is not supported\n"); + ret = -EINVAL; + goto out; + } + opts->concat = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -1079,6 +1090,23 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", opts->fast_io_fail_tmo, ctrl_loss_tmo); } + if (opts->concat) { + if (opts->tls) { + pr_err("Secure concatenation over TLS is not supported\n"); + ret = -EINVAL; + goto out; + } + if (opts->tls_key) { + pr_err("Cannot specify a TLS key for secure concatenation\n"); + ret = -EINVAL; + goto out; + } + if (!opts->dhchap_secret) { + pr_err("Need to enable DH-CHAP for secure concatenation\n"); + ret = -EINVAL; + goto out; + } + } opts->host = nvmf_host_add(hostnqn, &hostid); if (IS_ERR(opts->host)) { diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 21d75dc4a3a091..9cf5b020adba27 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -66,6 +66,7 @@ enum { NVMF_OPT_TLS = 1 << 25, NVMF_OPT_KEYRING = 1 << 26, NVMF_OPT_TLS_KEY = 1 << 27, + NVMF_OPT_CONCAT = 1 << 28, }; /** @@ -101,6 +102,7 @@ enum { * @keyring: Keyring to use for key lookups * @tls_key: TLS key for encrypted connections (TCP) * @tls: Start TLS encrypted connections (TCP) + * @concat: Enabled Secure channel concatenation (TCP) * @disable_sqflow: disable controller sq flow control * @hdr_digest: generate/verify header digest (TCP) * @data_digest: generate/verify data digest (TCP) @@ -130,6 +132,7 @@ struct nvmf_ctrl_options { struct key *keyring; struct key *tls_key; bool tls; + bool concat; bool disable_sqflow; bool hdr_digest; bool data_digest; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7be92d07430e95..daa4c91e4848ce 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1147,6 +1147,7 @@ void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); void nvme_auth_free(struct nvme_ctrl *ctrl); +void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl); #else static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { @@ -1169,6 +1170,7 @@ static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) return -EPROTONOSUPPORT; } static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {}; +static inline void nvme_auth_revoke_tls_key(struct nvme_ctrl *ctrl) {}; #endif u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 3a41b9ab0f13c4..52683943be152f 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -780,10 +780,10 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_tls_key.attr && - !ctrl->opts->tls) + !ctrl->opts->tls && !ctrl->opts->concat) return 0; if (a == &dev_attr_tls_configured_key.attr && - !ctrl->opts->tls_key) + (!ctrl->opts->tls_key || ctrl->opts->concat)) return 0; if (a == &dev_attr_tls_keyring.attr && !ctrl->opts->keyring) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b50972257e49d4..196d3631885399 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -235,7 +235,7 @@ static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl) if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) return 0; - return ctrl->opts->tls; + return ctrl->opts->tls || ctrl->opts->concat; } static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) @@ -1987,7 +1987,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) if (nvme_tcp_tls_configured(ctrl)) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); - else { + else if (ctrl->opts->tls) { pskid = nvme_tls_psk_default(ctrl->opts->keyring, ctrl->opts->host->nqn, ctrl->opts->subsysnqn); @@ -2017,9 +2017,25 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; - if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) { - dev_err(ctrl->device, "no PSK negotiated\n"); - return -ENOKEY; + if (nvme_tcp_tls_configured(ctrl)) { + if (ctrl->opts->concat) { + /* + * The generated PSK is stored in the + * fabric options + */ + if (!ctrl->opts->tls_key) { + dev_err(ctrl->device, "no PSK generated\n"); + return -ENOKEY; + } + if (ctrl->tls_pskid && + ctrl->tls_pskid != key_serial(ctrl->opts->tls_key)) { + dev_err(ctrl->device, "Stale PSK id %08x\n", ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } + } else if (!ctrl->tls_pskid) { + dev_err(ctrl->device, "no PSK negotiated\n"); + return -ENOKEY; + } } for (i = 1; i < ctrl->queue_count; i++) { @@ -2237,6 +2253,27 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl, } } +/* + * The TLS key is set by secure concatenation after negotiation has been + * completed on the admin queue. We need to revoke the key when: + * - concatenation is enabled (otherwise it's a static key set by the user) + * and + * - the generated key is present in ctrl->tls_key (otherwise there's nothing + * to revoke) + * and + * - a valid PSK key ID has been set in ctrl->tls_pskid (otherwise TLS + * negotiation has not run). + * + * We cannot always revoke the key as nvme_tcp_alloc_admin_queue() is called + * twice during secure concatenation, once on a 'normal' connection to run the + * DH-HMAC-CHAP negotiation (which generates the key, so it _must not_ be set), + * and once after the negotiation (which uses the key, so it _must_ be set). + */ +static bool nvme_tcp_key_revoke_needed(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->concat && ctrl->opts->tls_key && ctrl->tls_pskid; +} + static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) { struct nvmf_ctrl_options *opts = ctrl->opts; @@ -2342,6 +2379,8 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); @@ -2382,6 +2421,8 @@ static void nvme_reset_ctrl_work(struct work_struct *work) container_of(work, struct nvme_ctrl, reset_work); int ret; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_ctrl(ctrl); nvme_tcp_teardown_ctrl(ctrl, false); @@ -2877,7 +2918,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = { NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS | - NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY, + NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT, .create_ctrl = nvme_tcp_create_ctrl, }; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fe3b60818fdcfb..bfb5688363b0dc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1746,6 +1746,13 @@ enum { NVME_AUTH_DHGROUP_INVALID = 0xff, }; +enum { + NVME_AUTH_SECP_NOSC = 0x00, + NVME_AUTH_SECP_SC = 0x01, + NVME_AUTH_SECP_NEWTLSPSK = 0x02, + NVME_AUTH_SECP_REPLACETLSPSK = 0x03, +}; + union nvmf_auth_protocol { struct nvmf_auth_dhchap_protocol_descriptor dhchap; }; From cb5d15d30fb3ee9eaec3e018a349302829bc7c5e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:15 +0100 Subject: [PATCH 18/30] nvme-fabrics: reset admin connection for secure concatenation When secure concatenation is requested the connection needs to be reset to enable TLS encryption on the new cnnection. That implies that the original connection used for the DH-CHAP negotiation really shouldn't be used, and we should reset as soon as the DH-CHAP negotiation has succeeded on the admin queue. Based on an idea from Sagi. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 196d3631885399..feb2d7e17c4a14 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2283,6 +2283,16 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (ret) return ret; + if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) { + /* See comments for nvme_tcp_key_revoke_needed() */ + dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n"); + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, false); + ret = nvme_tcp_configure_admin_queue(ctrl, false); + if (ret) + return ret; + } + if (ctrl->icdoff) { ret = -EOPNOTSUPP; dev_err(ctrl->device, "icdoff is not supported!\n"); From a6bd7c4a91facafa24c6b304c5aa870c3808cf9f Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:16 +0100 Subject: [PATCH 19/30] nvmet: Add 'sq' argument to alloc_ctrl_args For secure concatenation the result of the TLS handshake will be stored in the 'sq' struct, so add it to the alloc_ctrl_args struct. Cc: Damien Le Moal Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Damien Le Moal Signed-off-by: Keith Busch --- drivers/nvme/target/auth.c | 2 +- drivers/nvme/target/core.c | 2 +- drivers/nvme/target/fabrics-cmd-auth.c | 2 +- drivers/nvme/target/fabrics-cmd.c | 11 +++++++---- drivers/nvme/target/nvmet.h | 6 ++++-- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index b47d675232d2d8..d0392ccda2a1d8 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -139,7 +139,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id) return ret; } -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl) +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { int ret = 0; struct nvmet_host_link *p; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index cdc4a09a6e8a48..112df8970954b5 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1649,7 +1649,7 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) if (args->hostid) uuid_copy(&ctrl->hostid, args->hostid); - dhchap_status = nvmet_setup_auth(ctrl); + dhchap_status = nvmet_setup_auth(ctrl, args->sq); if (dhchap_status) { pr_err("Failed to setup authentication, dhchap status %u\n", dhchap_status); diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 2022757f08dc76..43b684af816cbe 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -246,7 +246,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req) pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__, ctrl->cntlid, req->sq->qid); if (!req->sq->qid) { - dhchap_status = nvmet_setup_auth(ctrl); + dhchap_status = nvmet_setup_auth(ctrl, req->sq); if (dhchap_status) { pr_err("ctrl %d qid 0 failed to setup re-authentication\n", ctrl->cntlid); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index eb406c90c16793..5e8a3e1ca79cb3 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -234,10 +234,12 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) return ret; } -static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl) +static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { + bool needs_auth = nvmet_has_auth(ctrl); + return (u32)ctrl->cntlid | - (nvmet_has_auth(ctrl) ? NVME_CONNECT_AUTHREQ_ATR : 0); + (needs_auth ? NVME_CONNECT_AUTHREQ_ATR : 0); } static void nvmet_execute_admin_connect(struct nvmet_req *req) @@ -247,6 +249,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) struct nvmet_ctrl *ctrl = NULL; struct nvmet_alloc_ctrl_args args = { .port = req->port, + .sq = req->sq, .ops = req->ops, .p2p_client = req->p2p_client, .kato = le32_to_cpu(c->kato), @@ -299,7 +302,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) goto out; } - args.result = cpu_to_le32(nvmet_connect_result(ctrl)); + args.result = cpu_to_le32(nvmet_connect_result(ctrl, req->sq)); out: kfree(d); complete: @@ -357,7 +360,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) goto out_ctrl_put; pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); - req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl)); + req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl, req->sq)); out: kfree(d); complete: diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 4be8d22d2d8d41..e0cfdae8af6de0 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -583,6 +583,7 @@ void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); struct nvmet_alloc_ctrl_args { struct nvmet_port *port; + struct nvmet_sq *sq; char *subsysnqn; char *hostnqn; uuid_t *hostid; @@ -860,7 +861,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req); int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, bool set_ctrl); int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl); +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq); void nvmet_auth_sq_init(struct nvmet_sq *sq); void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); void nvmet_auth_sq_free(struct nvmet_sq *sq); @@ -879,7 +880,8 @@ int nvmet_auth_ctrl_exponential(struct nvmet_req *req, int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, u8 *buf, int buf_size); #else -static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl) +static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, + struct nvmet_sq *sq) { return 0; } From ed304be99023a8b4626c3c7340d26f5d2ca5c333 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:17 +0100 Subject: [PATCH 20/30] nvmet-tcp: support secure channel concatenation Evaluate the SC_C flag during DH-CHAP-HMAC negotiation to check if secure concatenation as specified in the NVMe Base Specification v2.1, section 8.3.4.3: "Secure Channel Concatenationand" is requested. If requested the generated PSK is inserted into the keyring once negotiation has finished allowing for an encrypted connection once the admin queue is restarted. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/auth.c | 70 ++++++++++++++++++++++++++ drivers/nvme/target/core.c | 5 +- drivers/nvme/target/fabrics-cmd-auth.c | 58 +++++++++++++++++++-- drivers/nvme/target/fabrics-cmd.c | 16 +++++- drivers/nvme/target/nvmet.h | 32 ++++++++++-- drivers/nvme/target/tcp.c | 31 +++++++++++- 6 files changed, 199 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index d0392ccda2a1d8..0b0645ac5df478 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "nvmet.h" @@ -165,6 +166,11 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) goto out_unlock; } + if (nvmet_queue_tls_keyid(sq)) { + pr_debug("host %s tls enabled\n", ctrl->hostnqn); + goto out_unlock; + } + ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id); if (ret < 0) { pr_warn("Failed to setup DH group"); @@ -233,6 +239,9 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) void nvmet_auth_sq_free(struct nvmet_sq *sq) { cancel_delayed_work(&sq->auth_expired_work); +#ifdef CONFIG_NVME_TARGET_TCP_TLS + sq->tls_key = 0; +#endif kfree(sq->dhchap_c1); sq->dhchap_c1 = NULL; kfree(sq->dhchap_c2); @@ -261,6 +270,12 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) nvme_auth_free_key(ctrl->ctrl_key); ctrl->ctrl_key = NULL; } +#ifdef CONFIG_NVME_TARGET_TCP_TLS + if (ctrl->tls_key) { + key_put(ctrl->tls_key); + ctrl->tls_key = NULL; + } +#endif } bool nvmet_check_auth_status(struct nvmet_req *req) @@ -542,3 +557,58 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, return ret; } + +void nvmet_auth_insert_psk(struct nvmet_sq *sq) +{ + int hash_len = nvme_auth_hmac_hash_len(sq->ctrl->shash_id); + u8 *psk, *digest, *tls_psk; + size_t psk_len; + int ret; +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key = NULL; +#endif + + ret = nvme_auth_generate_psk(sq->ctrl->shash_id, + sq->dhchap_skey, + sq->dhchap_skey_len, + sq->dhchap_c1, sq->dhchap_c2, + hash_len, &psk, &psk_len); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed to generate PSK, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + return; + } + ret = nvme_auth_generate_digest(sq->ctrl->shash_id, psk, psk_len, + sq->ctrl->subsysnqn, + sq->ctrl->hostnqn, &digest); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed to generate digest, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + goto out_free_psk; + } + ret = nvme_auth_derive_tls_psk(sq->ctrl->shash_id, psk, psk_len, + digest, &tls_psk); + if (ret) { + pr_warn("%s: ctrl %d qid %d failed to derive TLS PSK, error %d\n", + __func__, sq->ctrl->cntlid, sq->qid, ret); + goto out_free_digest; + } +#ifdef CONFIG_NVME_TARGET_TCP_TLS + tls_key = nvme_tls_psk_refresh(NULL, sq->ctrl->hostnqn, sq->ctrl->subsysnqn, + sq->ctrl->shash_id, tls_psk, psk_len, digest); + if (IS_ERR(tls_key)) { + pr_warn("%s: ctrl %d qid %d failed to refresh key, error %ld\n", + __func__, sq->ctrl->cntlid, sq->qid, PTR_ERR(tls_key)); + tls_key = NULL; + kfree_sensitive(tls_psk); + } + if (sq->ctrl->tls_key) + key_put(sq->ctrl->tls_key); + sq->ctrl->tls_key = tls_key; +#endif + +out_free_digest: + kfree_sensitive(digest); +out_free_psk: + kfree_sensitive(psk); +} diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 112df8970954b5..a058f473652cd0 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1664,11 +1664,12 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) args->status = NVME_SC_SUCCESS; - pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s.\n", + pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s%s.\n", nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, ctrl->pi_support ? " T10-PI is enabled" : "", - nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : ""); + nvmet_has_auth(ctrl, args->sq) ? " with DH-HMAC-CHAP" : "", + nvmet_queue_tls_keyid(args->sq) ? ", TLS" : ""); return ctrl; diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 43b684af816cbe..bf01ec414c55f7 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -43,8 +43,26 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) data->auth_protocol[0].dhchap.halen, data->auth_protocol[0].dhchap.dhlen); req->sq->dhchap_tid = le16_to_cpu(data->t_id); - if (data->sc_c) - return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + if (data->sc_c != NVME_AUTH_SECP_NOSC) { + if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + /* Secure concatenation can only be enabled on the admin queue */ + if (req->sq->qid) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + switch (data->sc_c) { + case NVME_AUTH_SECP_NEWTLSPSK: + if (nvmet_queue_tls_keyid(req->sq)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + break; + case NVME_AUTH_SECP_REPLACETLSPSK: + if (!nvmet_queue_tls_keyid(req->sq)) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + break; + default: + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + } + ctrl->concat = true; + } if (data->napd != 1) return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; @@ -103,6 +121,12 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) nvme_auth_dhgroup_name(fallback_dhgid)); ctrl->dh_gid = fallback_dhgid; } + if (ctrl->dh_gid == NVME_AUTH_DHGROUP_NULL && ctrl->concat) { + pr_debug("%s: ctrl %d qid %d: NULL DH group invalid " + "for secure channel concatenation\n", __func__, + ctrl->cntlid, req->sq->qid); + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + } pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n", __func__, ctrl->cntlid, req->sq->qid, nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid); @@ -148,12 +172,22 @@ static u8 nvmet_auth_reply(struct nvmet_req *req, void *d) if (memcmp(data->rval, response, data->hl)) { pr_info("ctrl %d qid %d host response mismatch\n", ctrl->cntlid, req->sq->qid); + pr_debug("ctrl %d qid %d rval %*ph\n", + ctrl->cntlid, req->sq->qid, data->hl, data->rval); + pr_debug("ctrl %d qid %d response %*ph\n", + ctrl->cntlid, req->sq->qid, data->hl, response); kfree(response); return NVME_AUTH_DHCHAP_FAILURE_FAILED; } kfree(response); pr_debug("%s: ctrl %d qid %d host authenticated\n", __func__, ctrl->cntlid, req->sq->qid); + if (!data->cvalid && ctrl->concat) { + pr_debug("%s: ctrl %d qid %d invalid challenge\n", + __func__, ctrl->cntlid, req->sq->qid); + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + } + req->sq->dhchap_s2 = le32_to_cpu(data->seqnum); if (data->cvalid) { req->sq->dhchap_c2 = kmemdup(data->rval + data->hl, data->hl, GFP_KERNEL); @@ -163,11 +197,23 @@ static u8 nvmet_auth_reply(struct nvmet_req *req, void *d) pr_debug("%s: ctrl %d qid %d challenge %*ph\n", __func__, ctrl->cntlid, req->sq->qid, data->hl, req->sq->dhchap_c2); - } else { + } + /* + * NVMe Base Spec 2.2 section 8.3.4.5.4: DH-HMAC-CHAP_Reply message + * Sequence Number (SEQNUM): [ .. ] + * The value 0h is used to indicate that bidirectional authentication + * is not performed, but a challenge value C2 is carried in order to + * generate a pre-shared key (PSK) for subsequent establishment of a + * secure channel. + */ + if (req->sq->dhchap_s2 == 0) { + if (ctrl->concat) + nvmet_auth_insert_psk(req->sq); req->sq->authenticated = true; + kfree(req->sq->dhchap_c2); req->sq->dhchap_c2 = NULL; - } - req->sq->dhchap_s2 = le32_to_cpu(data->seqnum); + } else if (!data->cvalid) + req->sq->authenticated = true; return 0; } @@ -303,6 +349,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req) } goto done_kfree; case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2: + if (ctrl->concat) + nvmet_auth_insert_psk(req->sq); req->sq->authenticated = true; pr_debug("%s: ctrl %d qid %d ctrl authenticated\n", __func__, ctrl->cntlid, req->sq->qid); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 5e8a3e1ca79cb3..f012bdf898502e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -236,8 +236,22 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { - bool needs_auth = nvmet_has_auth(ctrl); + bool needs_auth = nvmet_has_auth(ctrl, sq); + key_serial_t keyid = nvmet_queue_tls_keyid(sq); + /* Do not authenticate I/O queues for secure concatenation */ + if (ctrl->concat && sq->qid) + needs_auth = false; + + if (keyid) + pr_debug("%s: ctrl %d qid %d should %sauthenticate, tls psk %08x\n", + __func__, ctrl->cntlid, sq->qid, + needs_auth ? "" : "not ", keyid); + else + pr_debug("%s: ctrl %d qid %d should %sauthenticate%s\n", + __func__, ctrl->cntlid, sq->qid, + needs_auth ? "" : "not ", + ctrl->concat ? ", secure concatenation" : ""); return (u32)ctrl->cntlid | (needs_auth ? NVME_CONNECT_AUTHREQ_ATR : 0); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e0cfdae8af6de0..9f6110ac7c4abe 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -164,6 +164,9 @@ struct nvmet_sq { u32 dhchap_s2; u8 *dhchap_skey; int dhchap_skey_len; +#endif +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key; #endif struct completion free_done; struct completion confirm_done; @@ -289,6 +292,7 @@ struct nvmet_ctrl { u64 err_counter; struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; bool pi_support; + bool concat; #ifdef CONFIG_NVME_TARGET_AUTH struct nvme_dhchap_key *host_key; struct nvme_dhchap_key *ctrl_key; @@ -297,6 +301,9 @@ struct nvmet_ctrl { u8 dh_gid; u8 *dh_key; size_t dh_keysize; +#endif +#ifdef CONFIG_NVME_TARGET_TCP_TLS + struct key *tls_key; #endif struct nvmet_pr_log_mgr pr_log_mgr; }; @@ -853,6 +860,22 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio) bio_put(bio); } +#ifdef CONFIG_NVME_TARGET_TCP_TLS +static inline key_serial_t nvmet_queue_tls_keyid(struct nvmet_sq *sq) +{ + return sq->tls_key ? key_serial(sq->tls_key) : 0; +} +static inline void nvmet_sq_put_tls_key(struct nvmet_sq *sq) +{ + if (sq->tls_key) { + key_put(sq->tls_key); + sq->tls_key = NULL; + } +} +#else +static inline key_serial_t nvmet_queue_tls_keyid(struct nvmet_sq *sq) { return 0; } +static inline void nvmet_sq_put_tls_key(struct nvmet_sq *sq) {} +#endif #ifdef CONFIG_NVME_TARGET_AUTH u32 nvmet_auth_send_data_len(struct nvmet_req *req); void nvmet_execute_auth_send(struct nvmet_req *req); @@ -871,14 +894,15 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, unsigned int hash_len); int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, unsigned int hash_len); -static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) +static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) { - return ctrl->host_key != NULL; + return ctrl->host_key != NULL && !nvmet_queue_tls_keyid(sq); } int nvmet_auth_ctrl_exponential(struct nvmet_req *req, u8 *buf, int buf_size); int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, u8 *buf, int buf_size); +void nvmet_auth_insert_psk(struct nvmet_sq *sq); #else static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) @@ -894,11 +918,13 @@ static inline bool nvmet_check_auth_status(struct nvmet_req *req) { return true; } -static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) +static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, + struct nvmet_sq *sq) { return false; } static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; } +static inline void nvmet_auth_insert_psk(struct nvmet_sq *sq) {}; #endif int nvmet_pr_init_ns(struct nvmet_ns *ns); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index fa59a7996efa24..9cf97433b0b6d5 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1072,10 +1072,11 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, &queue->nvme_sq, &nvmet_tcp_ops))) { - pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", + pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n", req->cmd, req->cmd->common.command_id, req->cmd->common.opcode, - le32_to_cpu(req->cmd->common.dptr.sgl.length)); + le32_to_cpu(req->cmd->common.dptr.sgl.length), + le16_to_cpu(req->cqe->status)); nvmet_tcp_handle_req_failure(queue, queue->cmd, req); return 0; @@ -1601,6 +1602,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) /* stop accepting incoming data */ queue->rcv_state = NVMET_TCP_RECV_ERR; + nvmet_sq_put_tls_key(&queue->nvme_sq); nvmet_tcp_uninit_data_in_cmds(queue); nvmet_sq_destroy(&queue->nvme_sq); cancel_work_sync(&queue->io_work); @@ -1786,6 +1788,27 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) return 0; } +static int nvmet_tcp_tls_key_lookup(struct nvmet_tcp_queue *queue, + key_serial_t peerid) +{ + struct key *tls_key = nvme_tls_key_lookup(peerid); + int status = 0; + + if (IS_ERR(tls_key)) { + pr_warn("%s: queue %d failed to lookup key %x\n", + __func__, queue->idx, peerid); + spin_lock_bh(&queue->state_lock); + queue->state = NVMET_TCP_Q_FAILED; + spin_unlock_bh(&queue->state_lock); + status = PTR_ERR(tls_key); + } else { + pr_debug("%s: queue %d using TLS PSK %x\n", + __func__, queue->idx, peerid); + queue->nvme_sq.tls_key = tls_key; + } + return status; +} + static void nvmet_tcp_tls_handshake_done(void *data, int status, key_serial_t peerid) { @@ -1806,6 +1829,10 @@ static void nvmet_tcp_tls_handshake_done(void *data, int status, spin_unlock_bh(&queue->state_lock); cancel_delayed_work_sync(&queue->tls_handshake_tmo_work); + + if (!status) + status = nvmet_tcp_tls_key_lookup(queue, peerid); + if (status) nvmet_tcp_schedule_release_queue(queue); else From c3603ab4bb2f6e043444d651bc1e96870f8be460 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Feb 2025 13:38:18 +0100 Subject: [PATCH 21/30] nvmet: add tls_concat and tls_key debugfs entries Add debugfs entries to display the 'concat' and 'tls_key' controller attributes. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/debugfs.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/nvme/target/debugfs.c b/drivers/nvme/target/debugfs.c index 220c7391fc19ad..e4300eb95101a7 100644 --- a/drivers/nvme/target/debugfs.c +++ b/drivers/nvme/target/debugfs.c @@ -132,6 +132,27 @@ static int nvmet_ctrl_host_traddr_show(struct seq_file *m, void *p) } NVMET_DEBUGFS_ATTR(nvmet_ctrl_host_traddr); +#ifdef CONFIG_NVME_TARGET_TCP_TLS +static int nvmet_ctrl_tls_key_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + key_serial_t keyid = nvmet_queue_tls_keyid(ctrl->sqs[0]); + + seq_printf(m, "%08x\n", keyid); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_tls_key); + +static int nvmet_ctrl_tls_concat_show(struct seq_file *m, void *p) +{ + struct nvmet_ctrl *ctrl = m->private; + + seq_printf(m, "%d\n", ctrl->concat); + return 0; +} +NVMET_DEBUGFS_ATTR(nvmet_ctrl_tls_concat); +#endif + int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl) { char name[32]; @@ -157,6 +178,12 @@ int nvmet_debugfs_ctrl_setup(struct nvmet_ctrl *ctrl) &nvmet_ctrl_state_fops); debugfs_create_file("host_traddr", S_IRUSR, ctrl->debugfs_dir, ctrl, &nvmet_ctrl_host_traddr_fops); +#ifdef CONFIG_NVME_TARGET_TCP_TLS + debugfs_create_file("tls_concat", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_tls_concat_fops); + debugfs_create_file("tls_key", S_IRUSR, ctrl->debugfs_dir, ctrl, + &nvmet_ctrl_tls_key_fops); +#endif return 0; } From f02c16b8c93398dff6da001a2d7f793ad421bfe5 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 12 Jan 2025 18:11:44 +0530 Subject: [PATCH 22/30] nvme-multipath: Add visibility for round-robin io-policy This patch helps add nvme native multipath visibility for round-robin io-policy. It creates a "multipath" sysfs directory under head gendisk device node directory and then from "multipath" directory it adds a link to each namespace path device the head node refers. For instance, if we have a shared namespace accessible from two different controllers/paths then we create a soft link to each path device from head disk node as shown below: $ ls -l /sys/block/nvme1n1/multipath/ nvme1c1n1 -> ../../../../../pci052e:78/052e:78:00.0/nvme/nvme1/nvme1c1n1 nvme1c3n1 -> ../../../../../pci058e:78/058e:78:00.0/nvme/nvme3/nvme1c3n1 In the above example, nvme1n1 is head gendisk node created for a shared namespace and the namespace is accessible from nvme1c1n1 and nvme1c3n1 paths. For round-robin I/O policy, we could easily infer from the above output that I/O workload targeted to nvme1n1 would toggle across paths nvme1c1n1 and nvme1c3n1. Reviewed-by: Hannes Reinecke Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 ++ drivers/nvme/host/multipath.c | 99 +++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 18 +++++-- drivers/nvme/host/sysfs.c | 14 +++++ 4 files changed, 130 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 818d4e49aab51c..870314c521077c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4020,6 +4020,9 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (!nvme_ns_head_multipath(ns->head)) nvme_cdev_del(&ns->cdev, &ns->cdev_device); + + nvme_mpath_remove_sysfs_link(ns); + del_gendisk(ns->disk); mutex_lock(&ns->ctrl->namespaces_lock); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 2a763556508304..7c89c8da46d6ac 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -686,6 +686,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) kblockd_schedule_work(&head->partition_scan_work); } + nvme_mpath_add_sysfs_link(ns->head); + mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { int node, srcu_idx; @@ -768,6 +770,25 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, if (nvme_state_is_live(ns->ana_state) && nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE) nvme_mpath_set_live(ns); + else { + /* + * Add sysfs link from multipath head gendisk node to path + * device gendisk node. + * If path's ana state is live (i.e. state is either optimized + * or non-optimized) while we alloc the ns then sysfs link would + * be created from nvme_mpath_set_live(). In that case we would + * not fallthrough this code path. However for the path's ana + * state other than live, we call nvme_mpath_set_live() only + * after ana state transitioned to the live state. But we still + * want to create the sysfs link from head node to a path device + * irrespctive of the path's ana state. + * If we reach through here then it means that path's ana state + * is not live but still create the sysfs link to this path from + * head node if head node of the path has already come alive. + */ + if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags)) + nvme_mpath_add_sysfs_link(ns->head); + } } static int nvme_update_ana_state(struct nvme_ctrl *ctrl, @@ -967,6 +988,84 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, return -ENXIO; /* just break out of the loop */ } +void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head) +{ + struct device *target; + int rc, srcu_idx; + struct nvme_ns *ns; + struct kobject *kobj; + + /* + * Ensure head disk node is already added otherwise we may get invalid + * kobj for head disk node + */ + if (!test_bit(GD_ADDED, &head->disk->state)) + return; + + kobj = &disk_to_dev(head->disk)->kobj; + + /* + * loop through each ns chained through the head->list and create the + * sysfs link from head node to the ns path node + */ + srcu_idx = srcu_read_lock(&head->srcu); + + list_for_each_entry_rcu(ns, &head->list, siblings) { + /* + * Avoid creating link if it already exists for the given path. + * When path ana state transitions from optimized to non- + * optimized or vice-versa, the nvme_mpath_set_live() is + * invoked which in truns call this function. Now if the sysfs + * link already exists for the given path and we attempt to re- + * create the link then sysfs code would warn about it loudly. + * So we evaluate NVME_NS_SYSFS_ATTR_LINK flag here to ensure + * that we're not creating duplicate link. + * The test_and_set_bit() is used because it is protecting + * against multiple nvme paths being simultaneously added. + */ + if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags)) + continue; + + /* + * Ensure that ns path disk node is already added otherwise we + * may get invalid kobj name for target + */ + if (!test_bit(GD_ADDED, &ns->disk->state)) + continue; + + target = disk_to_dev(ns->disk); + /* + * Create sysfs link from head gendisk kobject @kobj to the + * ns path gendisk kobject @target->kobj. + */ + rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name, + &target->kobj, dev_name(target)); + if (unlikely(rc)) { + dev_err(disk_to_dev(ns->head->disk), + "failed to create link to %s\n", + dev_name(target)); + clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags); + } + } + + srcu_read_unlock(&head->srcu, srcu_idx); +} + +void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns) +{ + struct device *target; + struct kobject *kobj; + + if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags)) + return; + + target = disk_to_dev(ns->disk); + kobj = &disk_to_dev(ns->head->disk)->kobj; + sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name, + dev_name(target)); + clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags); +} + void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) { if (nvme_ctrl_use_ana(ns->ctrl)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index daa4c91e4848ce..1eeb782947db99 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -534,10 +534,11 @@ struct nvme_ns { struct nvme_ns_head *head; unsigned long flags; -#define NVME_NS_REMOVING 0 -#define NVME_NS_ANA_PENDING 2 -#define NVME_NS_FORCE_RO 3 -#define NVME_NS_READY 4 +#define NVME_NS_REMOVING 0 +#define NVME_NS_ANA_PENDING 2 +#define NVME_NS_FORCE_RO 3 +#define NVME_NS_READY 4 +#define NVME_NS_SYSFS_ATTR_LINK 5 struct cdev cdev; struct device cdev_device; @@ -933,6 +934,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_attr_groups[]; +extern const struct attribute_group nvme_ns_mpath_attr_group; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; extern const struct attribute_group nvme_dev_attrs_group; @@ -955,6 +957,8 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); +void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns); +void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns); void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid); void nvme_mpath_remove_disk(struct nvme_ns_head *head); int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); @@ -1009,6 +1013,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } +static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns) +{ +} +static inline void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns) +{ +} static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) { return false; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 52683943be152f..477c123a4b86a8 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -299,8 +299,22 @@ static const struct attribute_group nvme_ns_attr_group = { .is_visible = nvme_ns_attrs_are_visible, }; +#ifdef CONFIG_NVME_MULTIPATH +static struct attribute *nvme_ns_mpath_attrs[] = { + NULL, +}; + +const struct attribute_group nvme_ns_mpath_attr_group = { + .name = "multipath", + .attrs = nvme_ns_mpath_attrs, +}; +#endif + const struct attribute_group *nvme_ns_attr_groups[] = { &nvme_ns_attr_group, +#ifdef CONFIG_NVME_MULTIPATH + &nvme_ns_mpath_attr_group, +#endif NULL, }; From baab30aa385180b593bfc3e74f91aefcbbc88606 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 12 Jan 2025 18:11:45 +0530 Subject: [PATCH 23/30] nvme-multipath: Add visibility for numa io-policy This patch helps add nvme native multipath visibility for numa io-policy. It adds a new attribute file named "numa_nodes" under namespace gendisk device path node which prints the list of numa nodes preferred by the given namespace path. The numa nodes value is comma delimited list of nodes or A-B range of nodes. For instance, if we have a shared namespace accessible from two different controllers/paths then accessing head node of the shared namespace would show the following output: $ ls -l /sys/block/nvme1n1/multipath/ nvme1c1n1 -> ../../../../../pci052e:78/052e:78:00.0/nvme/nvme1/nvme1c1n1 nvme1c3n1 -> ../../../../../pci058e:78/058e:78:00.0/nvme/nvme3/nvme1c3n1 In the above example, nvme1n1 is head gendisk node created for a shared namespace and this namespace is accessible from nvme1c1n1 and nvme1c3n1 paths. For numa io-policy we can then refer the "numa_nodes" attribute file created under each namespace path: $ cat /sys/block/nvme1n1/multipath/nvme1c1n1/numa_nodes 0-1 $ cat /sys/block/nvme1n1/multipath/nvme1c3n1/numa_nodes 2-3 >From the above output, we infer that I/O workload targeted at nvme1n1 and running on numa nodes 0 and 1 would prefer using path nvme1c1n1. Similarly, I/O workload running on numa nodes 2 and 3 would prefer using path nvme1c3n1. Reading "numa_nodes" file when configured io-policy is anything but numa would show no output. Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 27 +++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/sysfs.c | 5 +++++ 3 files changed, 33 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 7c89c8da46d6ac..b47e01942ca4a6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -976,6 +976,33 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, } DEVICE_ATTR_RO(ana_state); +static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + int node, srcu_idx; + nodemask_t numa_nodes; + struct nvme_ns *current_ns; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_head *head = ns->head; + + if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA) + return 0; + + nodes_clear(numa_nodes); + + srcu_idx = srcu_read_lock(&head->srcu); + for_each_node(node) { + current_ns = srcu_dereference(head->current_path[node], + &head->srcu); + if (ns == current_ns) + node_set(node, numa_nodes); + } + srcu_read_unlock(&head->srcu, srcu_idx); + + return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes)); +} +DEVICE_ATTR_RO(numa_nodes); + static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1eeb782947db99..ff4bd0c2e401e7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -984,6 +984,7 @@ static inline void nvme_trace_bio_complete(struct request *req) extern bool multipath; extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; +extern struct device_attribute dev_attr_numa_nodes; extern struct device_attribute subsys_attr_iopolicy; static inline bool nvme_disk_is_ns_head(struct gendisk *disk) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 477c123a4b86a8..96eaecda0ab915 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -258,6 +258,7 @@ static struct attribute *nvme_ns_attrs[] = { #ifdef CONFIG_NVME_MULTIPATH &dev_attr_ana_grpid.attr, &dev_attr_ana_state.attr, + &dev_attr_numa_nodes.attr, #endif &dev_attr_io_passthru_err_log_enabled.attr, NULL, @@ -290,6 +291,10 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; } + if (a == &dev_attr_numa_nodes.attr) { + if (nvme_disk_is_ns_head(dev_to_disk(dev))) + return 0; + } #endif return a->mode; } From 4621b83f8eacad875ba5a234f628c2c2dc4f6822 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 12 Jan 2025 18:11:46 +0530 Subject: [PATCH 24/30] nvme-multipath: Add visibility for queue-depth io-policy This patch helps add nvme native multipath visibility for queue-depth io-policy. It adds a new attribute file named "queue_depth" under namespace device path node which would print the number of active/ in-flight I/O requests currently queued for the given path. For instance, if we have a shared namespace accessible from two different controllers/paths then accessing head block node of the shared namespace would show the following output: $ ls -l /sys/block/nvme1n1/multipath/ nvme1c1n1 -> ../../../../../pci052e:78/052e:78:00.0/nvme/nvme1/nvme1c1n1 nvme1c3n1 -> ../../../../../pci058e:78/058e:78:00.0/nvme/nvme3/nvme1c3n1 In the above example, nvme1n1 is head gendisk node created for a shared namespace and the namespace is accessible from nvme1c1n1 and nvme1c3n1 paths. For queue-depth io-policy we can then refer the "queue_depth" attribute file created under each namespace path: $ cat /sys/block/nvme1n1/multipath/nvme1c1n1/queue_depth 518 $cat /sys/block/nvme1n1/multipath/nvme1c3n1/queue_depth 504 >From the above output, we can infer that I/O workload targeted at nvme1n1 uses two paths nvme1c1n1 and nvme1c3n1 and the current queue depth of each path is 518 and 504 respectively. Reading "queue_depth" file when configured io-policy is anything but queue-depth would show no output. Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 12 ++++++++++++ drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/sysfs.c | 3 ++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index b47e01942ca4a6..6b12ca80aa2730 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -976,6 +976,18 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, } DEVICE_ATTR_RO(ana_state); +static ssize_t queue_depth_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD) + return 0; + + return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active)); +} +DEVICE_ATTR_RO(queue_depth); + static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ff4bd0c2e401e7..51e07864212710 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -984,6 +984,7 @@ static inline void nvme_trace_bio_complete(struct request *req) extern bool multipath; extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; +extern struct device_attribute dev_attr_queue_depth; extern struct device_attribute dev_attr_numa_nodes; extern struct device_attribute subsys_attr_iopolicy; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 96eaecda0ab915..6d31226f7a4f84 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -258,6 +258,7 @@ static struct attribute *nvme_ns_attrs[] = { #ifdef CONFIG_NVME_MULTIPATH &dev_attr_ana_grpid.attr, &dev_attr_ana_state.attr, + &dev_attr_queue_depth.attr, &dev_attr_numa_nodes.attr, #endif &dev_attr_io_passthru_err_log_enabled.attr, @@ -291,7 +292,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; } - if (a == &dev_attr_numa_nodes.attr) { + if (a == &dev_attr_queue_depth.attr || a == &dev_attr_numa_nodes.attr) { if (nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; } From 13f8f3946085f178a2b2218fe700bbcd8320e08c Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Thu, 13 Feb 2025 22:16:22 +0000 Subject: [PATCH 25/30] nvme-fc: Utilise min3() to simplify queue count calculation Refactor nvme_fc_create_io_queues() and nvme_fc_recreate_io_queues() to use the min3() macro to find the minimum between 3 values instead of multiple min()'s. This shortens the code and makes it easier to read. Signed-off-by: Qasim Ijaz Reviewed-by: James Smart Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4f1866fbd5b8b..c0b7134db0e6c7 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2907,7 +2907,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) unsigned int nr_io_queues; int ret; - nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + nr_io_queues = min3(opts->nr_io_queues, num_online_cpus(), ctrl->lport->ops->max_hw_queues); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { @@ -2961,7 +2961,7 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) unsigned int nr_io_queues; int ret; - nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + nr_io_queues = min3(opts->nr_io_queues, num_online_cpus(), ctrl->lport->ops->max_hw_queues); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { From ac2928203ec4c048e716fa2d0e1fdc07297a03f9 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 6 Mar 2025 10:53:31 +0200 Subject: [PATCH 26/30] nvme-pci: remove stale comment The ns variable has been removed in commit 62451a2b2e7e ("nvme: separate command prep and issue"). Drop reference to ns in comment. Fixes: 62451a2b2e7e ("nvme: separate command prep and issue") Signed-off-by: Baruch Siach Reviewed-by: Anuj Gupta Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9197a5b173fdff..1fd803e05af251 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -953,9 +953,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) return ret; } -/* - * NOTE: ns is NULL when called on the admin queue. - */ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { From 29471439b23222e9b2bfc4e4ff5c818cedeed3b8 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 6 Feb 2025 14:42:44 +0100 Subject: [PATCH 27/30] nvmet: add command quiesce time TP4129 introduces Command Quiesce Time (CQT) for coordinating the shutdown sequence when for example KATO expires. Add support to nvmet but only report CQT is available but the controller doesn't need any additional time when shutting down. In this case the spec says nvmet should report a value of 1. Signed-off-by: Daniel Wagner --- drivers/nvme/target/admin-cmd.c | 6 ++++++ drivers/nvme/target/nvmet.h | 1 + include/linux/nvme.h | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index acc138bbf8f23d..1700276caf8d42 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -733,6 +733,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* We support keep-alive timeout in granularity of seconds */ id->kas = cpu_to_le16(NVMET_KAS); + /* + * Command Quiesce Time in milliseconds. If the controller is not + * need any quiencse time, the controller should set it to 1. + */ + id->cqt = cpu_to_le16(NVMET_CQT); + id->sqes = (0x6 << 4) | 0x6; id->cqes = (0x4 << 4) | 0x4; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 9f6110ac7c4abe..c9955c39621d8d 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -679,6 +679,7 @@ bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid); #define NVMET_KAS 10 #define NVMET_DISC_KATO_MS 120000 +#define NVMET_CQT 1 int __init nvmet_init_configfs(void); void __exit nvmet_exit_configfs(void); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bfb5688363b0dc..a91ac2b651d956 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -335,7 +335,9 @@ struct nvme_id_ctrl { __u8 anacap; __le32 anagrpmax; __le32 nanagrpid; - __u8 rsvd352[160]; + __u8 rsvd352[34]; + __u16 cqt; + __u8 rsvd388[124]; __u8 sqes; __u8 cqes; __le16 maxcmd; From 700ebbb9354d30a290fc03b9792469ac2a6575c4 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 6 Feb 2025 14:42:45 +0100 Subject: [PATCH 28/30] nvme: store cqt value into nvme ctrl object Signed-off-by: Daniel Wagner --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 870314c521077c..c31fc2763d74ba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3386,6 +3386,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->kas = le16_to_cpu(id->kas); ctrl->max_namespaces = le32_to_cpu(id->mnan); ctrl->ctratt = le32_to_cpu(id->ctratt); + ctrl->cqt = le16_to_cpu(id->cqt); ctrl->cntrltype = id->cntrltype; ctrl->dctype = id->dctype; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 51e07864212710..4c122e72db82fa 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -344,6 +344,7 @@ struct nvme_ctrl { u32 oaes; u32 aen_result; u32 ctratt; + u16 cqt; unsigned int shutdown_timeout; unsigned int kato; bool subsystem; From b6e2e25bddefc3fea2c44fc42630fcee910a1244 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 6 Feb 2025 14:42:46 +0100 Subject: [PATCH 29/30] nvme-fabrics: add recovery_delay option Rebased onto nvme-6.15 Signed-off-by: Daniel Wagner --- drivers/nvme/host/fabrics.c | 13 +++++++++++++ drivers/nvme/host/fabrics.h | 3 +++ drivers/nvme/host/sysfs.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 93e9041b9657eb..e93a60b2ee42af 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -709,6 +709,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_TLS, "tls" }, { NVMF_OPT_CONCAT, "concat" }, #endif + { NVMF_OPT_RECOVERY_DELAY, "recovery_delay=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -1064,6 +1065,18 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->concat = true; break; + case NVMF_OPT_RECOVERY_DELAY: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token <= 0) { + pr_err("Invalid recovery_delay %d\n", token); + ret = -EINVAL; + goto out; + } + opts->recovery_delay = token; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 9cf5b020adba27..3b44aa44408363 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -67,6 +67,7 @@ enum { NVMF_OPT_KEYRING = 1 << 26, NVMF_OPT_TLS_KEY = 1 << 27, NVMF_OPT_CONCAT = 1 << 28, + NVMF_OPT_RECOVERY_DELAY = 1 << 29, }; /** @@ -93,6 +94,7 @@ enum { * @queue_size: Number of IO queue elements. * @nr_io_queues: Number of controller IO queues that will be established. * @reconnect_delay: Time between two consecutive reconnect attempts. + * @recovery_delay: Time before error recovery starts after error detection. * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. * @kato: Keep-alive timeout. * @host: Virtual NVMe host, contains the NQN and Host ID. @@ -123,6 +125,7 @@ struct nvmf_ctrl_options { size_t queue_size; unsigned int nr_io_queues; unsigned int reconnect_delay; + unsigned int recovery_delay; bool discovery_nqn; bool duplicate_connect; unsigned int kato; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 6d31226f7a4f84..a8f38ada8339f8 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -684,6 +684,35 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR, nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store); #endif +static ssize_t recovery_delay_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (opts->recovery_delay == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", opts->recovery_delay); +} + +static ssize_t recovery_delay_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + unsigned int recovery_delay; + int err; + + err = kstrtou32(buf, 10, &recovery_delay); + if (err) + return err; + + opts->recovery_delay = recovery_delay; + return count; +} +static DEVICE_ATTR(recovery_delay, S_IRUGO | S_IWUSR, + recovery_delay_show, recovery_delay_store); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -712,6 +741,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_dhchap_ctrl_secret.attr, #endif &dev_attr_adm_passthru_err_log_enabled.attr, + &dev_attr_recovery_delay.attr, NULL }; @@ -741,6 +771,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts) return 0; #endif + if (a == &dev_attr_recovery_delay.attr && !ctrl->opts) + return 0; return a->mode; } From f791e1a1af77ff1fd9c451d3788d540669fe3349 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 6 Feb 2025 14:42:47 +0100 Subject: [PATCH 30/30] nvme: delay failover by command quiesce timeout The TP4129 mendates that the failover should be delayed by CQT. Instead modifying the transports to handle this failover (a very invasive change) introdcue a new requeue_list on controller level. When nvme_decide_disposition returns FAILOVER do not immediatly requeue it on the namespace level instead queue it on the ctrl's request_list and moved later to the namespace's requeue_list. The tricky part is on error recovery and reset so the transports cleanup function do not block when waiting for all inflight requests to complete. Thus we have to mark the request as completed before failing over. Note, when a request is retried it is also first marked as completed. Signed-off-by: Daniel Wagner --- drivers/nvme/host/core.c | 19 +++++++++++++ drivers/nvme/host/fc.c | 4 +++ drivers/nvme/host/multipath.c | 52 +++++++++++++++++++++++++++++++++-- drivers/nvme/host/nvme.h | 15 ++++++++++ drivers/nvme/host/rdma.c | 2 ++ drivers/nvme/host/tcp.c | 1 + 6 files changed, 90 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c31fc2763d74ba..8f8e159d3cb00a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) flush_work(&ctrl->reset_work); nvme_stop_ctrl(ctrl); + nvme_flush_failover(ctrl); nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); nvme_uninit_ctrl(ctrl); @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); } +void nvme_schedule_failover(struct nvme_ctrl *ctrl) +{ + unsigned long delay; + + if (ctrl->cqt) + delay = msecs_to_jiffies(ctrl->cqt); + else + delay = ctrl->kato * HZ; + + queue_delayed_work(nvme_wq, &ctrl->failover_work, delay); +} +EXPORT_SYMBOL_GPL(nvme_schedule_failover); + static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { @@ -1336,6 +1350,8 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, dev_err(ctrl->device, "failed nvme_keep_alive_end_io error=%d\n", status); + + nvme_schedule_failover(ctrl); return RQ_END_IO_NONE; } @@ -4725,6 +4741,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set); void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { + nvme_schedule_failover(ctrl); nvme_mpath_stop(ctrl); nvme_auth_stop(ctrl); nvme_stop_failfast_work(ctrl); @@ -4851,6 +4868,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work); + INIT_DELAYED_WORK(&ctrl->failover_work, nvme_failover_work); + INIT_LIST_HEAD(&ctrl->failover_list); memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; ctrl->ka_last_check_time = jiffies; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c0b7134db0e6c7..725ff35d52a688 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2544,6 +2544,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) { enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); + nvme_schedule_failover(&ctrl->ctrl); + /* * if an error (io timeout, etc) while (re)connecting, the remote * port requested terminating of the association (disconnect_ls) @@ -3371,6 +3373,8 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); + nvme_schedule_failover(&ctrl->ctrl); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) dev_err(ctrl->ctrl.device, "NVME-FC{%d}: error_recovery: Couldn't change state " diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 6b12ca80aa2730..df6b01364e2e39 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK; unsigned long flags; struct bio *bio; + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); nvme_mpath_clear_current_path(ns); @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req) blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - nvme_req(req)->status = 0; - nvme_end_req(req); - kblockd_schedule_work(&ns->head->requeue_work); + spin_lock_irqsave(&ctrl->lock, flags); + list_add_tail(&req->queuelist, &ctrl->failover_list); + spin_unlock_irqrestore(&ctrl->lock, flags); + + if (state == NVME_CTRL_DELETING) { + /* + * request which fail over in the DELETING state were + * canceled and blk_mq_tagset_wait_completed_request will + * block until they have been proceed. Though + * nvme_failover_work is already stopped. Thus schedule + * a failover; it's still necessary to delay these commands + * by CQT. + */ + nvme_schedule_failover(ctrl); + } +} + +void nvme_flush_failover(struct nvme_ctrl *ctrl) +{ + LIST_HEAD(failover_list); + struct request *rq; + bool kick = false; + + spin_lock_irq(&ctrl->lock); + list_splice_init(&ctrl->failover_list, &failover_list); + spin_unlock_irq(&ctrl->lock); + + while (!list_empty(&failover_list)) { + rq = list_entry(failover_list.next, + struct request, queuelist); + list_del_init(&rq->queuelist); + + nvme_req(rq)->status = 0; + nvme_end_req(rq); + kick = true; + } + + if (kick) + nvme_kick_requeue_lists(ctrl); +} + +void nvme_failover_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, failover_work); + + nvme_flush_failover(ctrl); } void nvme_mpath_start_request(struct request *rq) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4c122e72db82fa..95de807159a546 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -411,6 +411,9 @@ struct nvme_ctrl { enum nvme_ctrl_type cntrltype; enum nvme_dctype dctype; + + struct delayed_work failover_work; + struct list_head failover_list; }; static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl) @@ -956,6 +959,9 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); void nvme_failover_req(struct request *req); +void nvme_failover_work(struct work_struct *work); +void nvme_schedule_failover(struct nvme_ctrl *ctrl); +void nvme_flush_failover(struct nvme_ctrl *ctrl); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns); @@ -1002,6 +1008,15 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) static inline void nvme_failover_req(struct request *req) { } +static inline void nvme_failover_work(struct work_struct *work) +{ +} +static inline void nvme_schedule_failover(struct nvme_ctrl *ctrl) +{ +} +static inline void nvme_flush_failover(struct nvme_ctrl *ctrl) +{ +} static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 86a2891d9bcc7a..9bee376f881b4c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1127,6 +1127,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); + nvme_schedule_failover(&ctrl->ctrl); nvme_rdma_teardown_io_queues(ctrl, false); nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); @@ -2153,6 +2154,7 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { + nvme_schedule_failover(&ctrl->ctrl); nvme_rdma_teardown_io_queues(ctrl, shutdown); nvme_quiesce_admin_queue(&ctrl->ctrl); nvme_disable_ctrl(&ctrl->ctrl, shutdown); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index feb2d7e17c4a14..fa0ea7149d3ec2 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2393,6 +2393,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) nvme_auth_revoke_tls_key(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); + nvme_schedule_failover(ctrl); nvme_tcp_teardown_io_queues(ctrl, false); /* unquiesce to fail fast pending requests */ nvme_unquiesce_io_queues(ctrl);