From 5719e2823565a304a2afb752d4792c622a693e22 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 15 Jan 2025 21:26:03 +0100 Subject: [PATCH 01/20] io_uring/rsrc: Simplify buffer cloning by locking both rings The locking in the buffer cloning code is somewhat complex because it goes back and forth between locking the source ring and the destination ring. Make it easier to reason about by locking both rings at the same time. To avoid ABBA deadlocks, lock the rings in ascending kernel address order, just like in lock_two_nondirectories(). Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20250115-uring-clone-refactor-v2-1-7289ba50776d@google.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 73 +++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index e32ac5853391..a1c7c8db5545 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -921,6 +921,16 @@ int io_import_fixed(int ddir, struct iov_iter *iter, return 0; } +/* Lock two rings at once. The rings must be different! */ +static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) +{ + if (ctx1 > ctx2) + swap(ctx1, ctx2); + mutex_lock(&ctx1->uring_lock); + mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); +} + +/* Both rings are locked by the caller. */ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, struct io_uring_clone_buffers *arg) { @@ -928,6 +938,9 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx int i, ret, off, nr; unsigned int nbufs; + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert_held(&src_ctx->uring_lock); + /* * Accounting state is shared between the two rings; that only works if * both rings are accounted towards the same counters. @@ -942,7 +955,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) return -EBUSY; - nbufs = READ_ONCE(src_ctx->buf_table.nr); + nbufs = src_ctx->buf_table.nr; if (!arg->nr) arg->nr = nbufs; else if (arg->nr > nbufs) @@ -966,27 +979,20 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx } } - /* - * Drop our own lock here. We'll setup the data we need and reference - * the source buffers, then re-grab, check, and assign at the end. - */ - mutex_unlock(&ctx->uring_lock); - - mutex_lock(&src_ctx->uring_lock); ret = -ENXIO; nbufs = src_ctx->buf_table.nr; if (!nbufs) - goto out_unlock; + goto out_free; ret = -EINVAL; if (!arg->nr) arg->nr = nbufs; else if (arg->nr > nbufs) - goto out_unlock; + goto out_free; ret = -EOVERFLOW; if (check_add_overflow(arg->nr, arg->src_off, &off)) - goto out_unlock; + goto out_free; if (off > nbufs) - goto out_unlock; + goto out_free; off = arg->dst_off; i = arg->src_off; @@ -1001,7 +1007,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!dst_node) { ret = -ENOMEM; - goto out_unlock; + goto out_free; } refcount_inc(&src_node->buf->refs); @@ -1011,10 +1017,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx i++; } - /* Have a ref on the bufs now, drop src lock and re-grab our own lock */ - mutex_unlock(&src_ctx->uring_lock); - mutex_lock(&ctx->uring_lock); - /* * If asked for replace, put the old table. data->nodes[] holds both * old and new nodes at this point. @@ -1023,24 +1025,17 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx io_rsrc_data_free(ctx, &ctx->buf_table); /* - * ctx->buf_table should be empty now - either the contents are being - * replaced and we just freed the table, or someone raced setting up - * a buffer table while the clone was happening. If not empty, fall - * through to failure handling. + * ctx->buf_table must be empty now - either the contents are being + * replaced and we just freed the table, or the contents are being + * copied to a ring that does not have buffers yet (checked at function + * entry). */ - if (!ctx->buf_table.nr) { - ctx->buf_table = data; - return 0; - } + WARN_ON_ONCE(ctx->buf_table.nr); + ctx->buf_table = data; + return 0; - mutex_unlock(&ctx->uring_lock); - mutex_lock(&src_ctx->uring_lock); - /* someone raced setting up buffers, dump ours */ - ret = -EBUSY; -out_unlock: +out_free: io_rsrc_data_free(ctx, &data); - mutex_unlock(&src_ctx->uring_lock); - mutex_lock(&ctx->uring_lock); return ret; } @@ -1054,6 +1049,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_clone_buffers buf; + struct io_ring_ctx *src_ctx; bool registered_src; struct file *file; int ret; @@ -1071,7 +1067,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) file = io_uring_register_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); - ret = io_clone_buffers(ctx, file->private_data, &buf); + + src_ctx = file->private_data; + if (src_ctx != ctx) { + mutex_unlock(&ctx->uring_lock); + lock_two_rings(ctx, src_ctx); + } + + ret = io_clone_buffers(ctx, src_ctx, &buf); + + if (src_ctx != ctx) + mutex_unlock(&src_ctx->uring_lock); + if (!registered_src) fput(file); return ret; From bb2d76344bc80933a462aa84581d1258e0dc758b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jan 2025 02:53:26 +0000 Subject: [PATCH 02/20] io_uring: clean up io_uring_register_get_file() Make it always reference the returned file. It's safer, especially with unregistrations happening under it. And it makes the api cleaner with no conditional clean ups by the caller. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0d0b13a63e8edd6b5d360fc821dcdb035cb6b7e0.1736995897.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/register.c | 6 ++++-- io_uring/rsrc.c | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 05025047d1da..0db181437ae3 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -853,6 +853,8 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered) return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; + if (file) + get_file(file); } else { file = fget(fd); } @@ -919,7 +921,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr, ctx->buf_table.nr, ret); mutex_unlock(&ctx->uring_lock); - if (!use_registered_ring) - fput(file); + + fput(file); return ret; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a1c7c8db5545..b5e47030764e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1079,7 +1079,6 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) if (src_ctx != ctx) mutex_unlock(&src_ctx->uring_lock); - if (!registered_src) - fput(file); + fput(file); return ret; } From b73de0da50129d790975bb8a9893b421cc38bc24 Mon Sep 17 00:00:00 2001 From: Sidong Yang Date: Wed, 15 Jan 2025 14:20:31 +0000 Subject: [PATCH 03/20] io_uring/rsrc: remove unused parameter ctx for io_rsrc_node_alloc() io_uring_ctx parameter for io_rsrc_node_alloc() is unused for now. This patch removes the parameter and fixes the callers accordingly. Signed-off-by: Sidong Yang Link: https://lore.kernel.org/r/20250115142033.658599-1-sidong.yang@furiosa.ai Signed-off-by: Jens Axboe --- io_uring/filetable.c | 2 +- io_uring/rsrc.c | 10 +++++----- io_uring/rsrc.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index a21660e3145a..dd8eeec97acf 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, if (slot_index >= ctx->file_table.data.nr) return -EINVAL; - node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + node = io_rsrc_node_alloc(IORING_RSRC_FILE); if (!node) return -ENOMEM; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index b5e47030764e..a5fc035af8ff 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -118,7 +118,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) } } -struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) +struct io_rsrc_node *io_rsrc_node_alloc(int type) { struct io_rsrc_node *node; @@ -203,7 +203,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + node = io_rsrc_node_alloc(IORING_RSRC_FILE); if (!node) { err = -ENOMEM; fput(file); @@ -525,7 +525,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto fail; } ret = -ENOMEM; - node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + node = io_rsrc_node_alloc(IORING_RSRC_FILE); if (!node) { fput(file); goto fail; @@ -730,7 +730,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (!iov->iov_base) return NULL; - node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); if (!node) return ERR_PTR(-ENOMEM); node->buf = NULL; @@ -1004,7 +1004,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx if (!src_node) { dst_node = NULL; } else { - dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); if (!dst_node) { ret = -ENOMEM; goto out_free; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index c8b093584461..5cd00b7baef8 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -43,7 +43,7 @@ struct io_imu_folio_data { unsigned int nr_folios; }; -struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); +struct io_rsrc_node *io_rsrc_node_alloc(int type); void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); From 2839ab71ac9009884fe41a7422a167a64716c0a7 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 20 Jan 2025 17:21:57 +0100 Subject: [PATCH 04/20] io_uring/rsrc: Move lockdep assert from io_free_rsrc_node() to caller Checking for lockdep_assert_held(&ctx->uring_lock) in io_free_rsrc_node() means that the assertion is only checked when the resource drops to zero references. Move the lockdep assertion up into the caller io_put_rsrc_node() so that it instead happens on every reference count decrement. Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20250120-uring-lockdep-assert-earlier-v1-1-68d8e071a4bb@google.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 -- io_uring/rsrc.h | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a5fc035af8ff..af39b69eb4fd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -444,8 +444,6 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - lockdep_assert_held(&ctx->uring_lock); - if (node->tag) io_post_aux_cqe(ctx, node->tag, 0, 0); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 5cd00b7baef8..190f7ee45de9 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,6 +2,8 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H +#include + #define IO_NODE_ALLOC_CACHE_MAX 32 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) @@ -80,6 +82,7 @@ static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { + lockdep_assert_held(&ctx->uring_lock); if (node && !--node->refs) io_free_rsrc_node(ctx, node); } From 69a62e03f896a7382671877b6ad6aab87c53e9c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Jan 2025 17:03:28 -0700 Subject: [PATCH 05/20] io_uring/msg_ring: don't leave potentially dangling ->tctx pointer For remote posting of messages, req->tctx is assigned even though it is never used. Rather than leave a dangling pointer, just clear it to NULL and use the previous check for a valid submitter_task to gate on whether or not the request should be terminated. Reported-by: Jann Horn Fixes: b6f58a3f4aa8 ("io_uring: move struct io_kiocb from task_struct to io_uring_task") Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index bd3cd78d2dba..7e6f68e911f1 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -89,8 +89,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { - req->tctx = READ_ONCE(ctx->submitter_task->io_uring); - if (!req->tctx) { + if (!READ_ONCE(ctx->submitter_task)) { kmem_cache_free(req_cachep, req); return -EOWNERDEAD; } @@ -98,6 +97,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, io_req_set_res(req, res, cflags); percpu_ref_get(&ctx->refs); req->ctx = ctx; + req->tctx = NULL; req->io_task_work.func = io_msg_tw_complete; io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); return 0; From d58d82bd0efd6c8edd452fc2f6c6dd052ec57cb2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Jan 2025 17:29:31 -0700 Subject: [PATCH 06/20] io_uring/uring_cmd: use cached cmd_op in io_uring_cmd_sock() io_uring_cmd_sock() does a normal read of cmd->sqe->cmd_op, where it really should be using a READ_ONCE() as ->sqe may still be pointing to the original SQE. Since the prep side already does this READ_ONCE() and stores it locally, use that value rather than re-read it. Fixes: 8e9fad0e70b7b ("io_uring: Add io_uring command support for sockets") Link: https://lore.kernel.org/r/20250121-uring-sockcmd-fix-v1-1-add742802a29@google.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index fc94c465a985..3993c9339ac7 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -350,7 +350,7 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) if (!prot || !prot->ioctl) return -EOPNOTSUPP; - switch (cmd->sqe->cmd_op) { + switch (cmd->cmd_op) { case SOCKET_URING_OP_SIOCINQ: ret = prot->ioctl(sk, SIOCINQ, &arg); if (ret) From eaf72f7b414f5944585e7dee9c915c7f8f7f6344 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Jan 2025 19:50:31 -0700 Subject: [PATCH 07/20] io_uring/uring_cmd: cleanup struct io_uring_cmd_data layout A few spots in uring_cmd assume that the SQEs copied are always at the start of the structure, and hence mix req->async_data and the struct itself. Clean that up and use the proper indices. Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 3993c9339ac7..6a63ec4b5445 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -192,8 +192,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, return 0; } - memcpy(req->async_data, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = req->async_data; + memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = cache->sqes; return 0; } @@ -260,7 +260,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) struct io_uring_cmd_data *cache = req->async_data; if (ioucmd->sqe != (void *) cache) - memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); + memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); return -EAGAIN; } else if (ret == -EIOCBQUEUED) { return -EIOCBQUEUED; From fa3595523d72d13508befd28cf2ca642cafc69f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Jan 2025 20:00:57 -0700 Subject: [PATCH 08/20] io_uring: get rid of alloc cache init_once handling init_once is called when an object doesn't come from the cache, and hence needs initial clearing of certain members. While the whole struct could get cleared by memset() in that case, a few of the cache members are large enough that this may cause unnecessary overhead if the caches used aren't large enough to satisfy the workload. For those cases, some churn of kmalloc+kfree is to be expected. Ensure that the 3 users that need clearing put the members they need cleared at the start of the struct, and wrap the rest of the struct in a struct group so the offset is known. While at it, improve the interaction with KASAN such that when/if KASAN writes to members inside the struct that should be retained over caching, it won't trip over itself. For rw and net, the retaining of the iovec over caching is disabled if KASAN is enabled. A helper will free and clear those members in that case. Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 2 +- include/linux/io_uring_types.h | 3 ++- io_uring/alloc_cache.h | 43 +++++++++++++++++++++++++++------- io_uring/futex.c | 4 ++-- io_uring/io_uring.c | 12 ++++++---- io_uring/io_uring.h | 5 ++-- io_uring/net.c | 28 +++++----------------- io_uring/net.h | 20 +++++++++------- io_uring/poll.c | 2 +- io_uring/rw.c | 27 +++++---------------- io_uring/rw.h | 27 ++++++++++++--------- io_uring/uring_cmd.c | 11 ++------- 12 files changed, 91 insertions(+), 93 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index a3ce553413de..abd0c8bd950b 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -19,8 +19,8 @@ struct io_uring_cmd { }; struct io_uring_cmd_data { - struct io_uring_sqe sqes[2]; void *op_data; + struct io_uring_sqe sqes[2]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 623d8e798a11..3def525a1da3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -222,7 +222,8 @@ struct io_alloc_cache { void **entries; unsigned int nr_cached; unsigned int max_cached; - size_t elem_size; + unsigned int elem_size; + unsigned int init_clear; }; struct io_ring_ctx { diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index a3a8cfec32ce..cca96aff3277 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -6,6 +6,19 @@ */ #define IO_ALLOC_CACHE_MAX 128 +#if defined(CONFIG_KASAN) +static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) +{ + kfree(*iov); + *iov = NULL; + *nr = 0; +} +#else +static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) +{ +} +#endif + static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, void *entry) { @@ -23,35 +36,47 @@ static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) if (cache->nr_cached) { void *entry = cache->entries[--cache->nr_cached]; + /* + * If KASAN is enabled, always clear the initial bytes that + * must be zeroed post alloc, in case any of them overlap + * with KASAN storage. + */ +#if defined(CONFIG_KASAN) kasan_mempool_unpoison_object(entry, cache->elem_size); + if (cache->init_clear) + memset(entry, 0, cache->init_clear); +#endif return entry; } return NULL; } -static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp, - void (*init_once)(void *obj)) +static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) { - if (unlikely(!cache->nr_cached)) { - void *obj = kmalloc(cache->elem_size, gfp); + void *obj; - if (obj && init_once) - init_once(obj); + obj = io_alloc_cache_get(cache); + if (obj) return obj; - } - return io_alloc_cache_get(cache); + + obj = kmalloc(cache->elem_size, gfp); + if (obj && cache->init_clear) + memset(obj, 0, cache->init_clear); + return obj; } /* returns false if the cache was initialized properly */ static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, - unsigned max_nr, size_t size) + unsigned max_nr, unsigned int size, + unsigned int init_bytes) { cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); if (cache->entries) { cache->nr_cached = 0; cache->max_cached = max_nr; cache->elem_size = size; + cache->init_clear = init_bytes; return false; } return true; diff --git a/io_uring/futex.c b/io_uring/futex.c index 30139cc150f2..3159a2b7eeca 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -36,7 +36,7 @@ struct io_futex_data { bool io_futex_cache_init(struct io_ring_ctx *ctx) { return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX, - sizeof(struct io_futex_data)); + sizeof(struct io_futex_data), 0); } void io_futex_cache_free(struct io_ring_ctx *ctx) @@ -320,7 +320,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) } io_ring_submit_lock(ctx, issue_flags); - ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL); + ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT); if (!ifd) { ret = -ENOMEM; goto done_unlock; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7bfbc7c22367..263e504be4a8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -315,16 +315,18 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, - sizeof(struct async_poll)); + sizeof(struct async_poll), 0); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_msghdr)); + sizeof(struct io_async_msghdr), + offsetof(struct io_async_msghdr, clear)); ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_rw)); + sizeof(struct io_async_rw), + offsetof(struct io_async_rw, clear)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_uring_cmd_data)); + sizeof(struct io_uring_cmd_data), 0); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb)); + sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); if (ret) goto free_ref; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f65e3f3ede51..67adbb3c1bf5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -226,10 +226,9 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) } static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, - struct io_kiocb *req, - void (*init_once)(void *obj)) + struct io_kiocb *req) { - req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once); + req->async_data = io_cache_alloc(cache, GFP_KERNEL); if (req->async_data) req->flags |= REQ_F_ASYNC_DATA; return req->async_data; diff --git a/io_uring/net.c b/io_uring/net.c index 85f55fbc25c9..41eef286f8b9 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -137,7 +137,6 @@ static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr *hdr = req->async_data; - struct iovec *iov; /* can't recycle, ensure we free the iovec if we have one */ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { @@ -146,39 +145,25 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) } /* Let normal cleanup path reap it if we fail adding to the cache */ - iov = hdr->free_iov; + io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr); if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { - if (iov) - kasan_mempool_poison_object(iov); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } } -static void io_msg_async_data_init(void *obj) -{ - struct io_async_msghdr *hdr = (struct io_async_msghdr *)obj; - - hdr->free_iov = NULL; - hdr->free_iov_nr = 0; -} - static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_async_msghdr *hdr; - hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req, - io_msg_async_data_init); + hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req); if (!hdr) return NULL; /* If the async data was cached, we might have an iov cached inside. */ - if (hdr->free_iov) { - kasan_mempool_unpoison_object(hdr->free_iov, - hdr->free_iov_nr * sizeof(struct iovec)); + if (hdr->free_iov) req->flags |= REQ_F_NEED_CLEANUP; - } return hdr; } @@ -1813,11 +1798,10 @@ void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; - if (kmsg->free_iov) { - kasan_mempool_unpoison_object(kmsg->free_iov, - kmsg->free_iov_nr * sizeof(struct iovec)); +#if !defined(CONFIG_KASAN) + if (kmsg->free_iov) io_netmsg_iovec_free(kmsg); - } +#endif kfree(kmsg); } #endif diff --git a/io_uring/net.h b/io_uring/net.h index 52bfee05f06a..b804c2b36e60 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -5,16 +5,20 @@ struct io_async_msghdr { #if defined(CONFIG_NET) - struct iovec fast_iov; - /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; + /* points to an allocated iov, if NULL we use fast_iov instead */ int free_iov_nr; - int namelen; - __kernel_size_t controllen; - __kernel_size_t payloadlen; - struct sockaddr __user *uaddr; - struct msghdr msg; - struct sockaddr_storage addr; + struct_group(clear, + int namelen; + struct iovec fast_iov; + __kernel_size_t controllen; + __kernel_size_t payloadlen; + struct sockaddr __user *uaddr; + struct msghdr msg; + struct sockaddr_storage addr; + ); +#else + struct_group(clear); #endif }; diff --git a/io_uring/poll.c b/io_uring/poll.c index cc01c40b43d3..356474c66f32 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -650,7 +650,7 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, kfree(apoll->double_poll); } else { if (!(issue_flags & IO_URING_F_UNLOCKED)) - apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC, NULL); + apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC); else apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (!apoll) diff --git a/io_uring/rw.c b/io_uring/rw.c index a9a2733be842..991ecfbea88e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -158,16 +158,13 @@ static void io_rw_iovec_free(struct io_async_rw *rw) static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_rw *rw = req->async_data; - struct iovec *iov; if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { io_rw_iovec_free(rw); return; } - iov = rw->free_iovec; + io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr); if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { - if (iov) - kasan_mempool_poison_object(iov); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } @@ -208,27 +205,16 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) } } -static void io_rw_async_data_init(void *obj) -{ - struct io_async_rw *rw = (struct io_async_rw *)obj; - - rw->free_iovec = NULL; - rw->bytes_done = 0; -} - static int io_rw_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_async_rw *rw; - rw = io_uring_alloc_async_data(&ctx->rw_cache, req, io_rw_async_data_init); + rw = io_uring_alloc_async_data(&ctx->rw_cache, req); if (!rw) return -ENOMEM; - if (rw->free_iovec) { - kasan_mempool_unpoison_object(rw->free_iovec, - rw->free_iov_nr * sizeof(struct iovec)); + if (rw->free_iovec) req->flags |= REQ_F_NEED_CLEANUP; - } rw->bytes_done = 0; return 0; } @@ -1323,10 +1309,9 @@ void io_rw_cache_free(const void *entry) { struct io_async_rw *rw = (struct io_async_rw *) entry; - if (rw->free_iovec) { - kasan_mempool_unpoison_object(rw->free_iovec, - rw->free_iov_nr * sizeof(struct iovec)); +#if !defined(CONFIG_KASAN) + if (rw->free_iovec) io_rw_iovec_free(rw); - } +#endif kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index 2d7656bd268d..eaa59bd64870 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -9,19 +9,24 @@ struct io_meta_state { struct io_async_rw { size_t bytes_done; - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov; struct iovec *free_iovec; - int free_iov_nr; - /* wpq is for buffered io, while meta fields are used with direct io */ - union { - struct wait_page_queue wpq; - struct { - struct uio_meta meta; - struct io_meta_state meta_state; + struct_group(clear, + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov; + int free_iov_nr; + /* + * wpq is for buffered io, while meta fields are used with + * direct io + */ + union { + struct wait_page_queue wpq; + struct { + struct uio_meta meta; + struct io_meta_state meta_state; + }; }; - }; + ); }; int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 6a63ec4b5445..1f6a82128b47 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -168,23 +168,16 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -static void io_uring_cmd_init_once(void *obj) -{ - struct io_uring_cmd_data *data = obj; - - data->op_data = NULL; -} - static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd_data *cache; - cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, - io_uring_cmd_init_once); + cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req); if (!cache) return -ENOMEM; + cache->op_data = NULL; if (!(req->flags & REQ_F_FORCE_ASYNC)) { /* defer memcpy until we need it */ From ff74954e4e9374f24b95dd46ef0bb1b5fa0a46f2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 23 Jan 2025 07:34:36 -0700 Subject: [PATCH 09/20] io_uring/alloc_cache: get rid of _nocache() helper Just allow passing in NULL for the cache, if the type in question doesn't have a cache associated with it. Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 18 +++++++----------- io_uring/timeout.c | 2 +- io_uring/waitid.c | 2 +- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 67adbb3c1bf5..ab619e63ef39 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -228,18 +228,14 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, struct io_kiocb *req) { - req->async_data = io_cache_alloc(cache, GFP_KERNEL); - if (req->async_data) - req->flags |= REQ_F_ASYNC_DATA; - return req->async_data; -} + if (cache) { + req->async_data = io_cache_alloc(cache, GFP_KERNEL); + } else { + const struct io_issue_def *def = &io_issue_defs[req->opcode]; -static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req) -{ - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - WARN_ON_ONCE(!def->async_size); - req->async_data = kmalloc(def->async_size, GFP_KERNEL); + WARN_ON_ONCE(!def->async_size); + req->async_data = kmalloc(def->async_size, GFP_KERNEL); + } if (req->async_data) req->flags |= REQ_F_ASYNC_DATA; return req->async_data; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 2bd7e0a317bb..48fc8cf70784 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -544,7 +544,7 @@ static int __io_timeout_prep(struct io_kiocb *req, if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - data = io_uring_alloc_async_data_nocache(req); + data = io_uring_alloc_async_data(NULL, req); if (!data) return -ENOMEM; data->req = req; diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 6778c0ee76c4..853e97a7b0ec 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -303,7 +303,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) struct io_waitid_async *iwa; int ret; - iwa = io_uring_alloc_async_data_nocache(req); + iwa = io_uring_alloc_async_data(NULL, req); if (!iwa) return -ENOMEM; From a23ad06bfee5e51cd9e51aebf11401e7b4b5d00a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Jan 2025 14:32:25 -0700 Subject: [PATCH 10/20] io_uring/register: use atomic_read/write for sq_flags migration A previous commit changed all of the migration from the old to the new ring for resizing to use READ/WRITE_ONCE. However, ->sq_flags is an atomic_t, and while most archs won't complain on this, some will indeed flag this: io_uring/register.c:554:9: sparse: sparse: cast to non-scalar io_uring/register.c:554:9: sparse: sparse: cast from non-scalar Just use atomic_set/atomic_read for handling this case. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501242000.A2sKqaCL-lkp@intel.com/ Fixes: 2c5aae129f42 ("io_uring/register: document io_register_resize_rings() shared mem usage") Signed-off-by: Jens Axboe --- io_uring/register.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/register.c b/io_uring/register.c index 0db181437ae3..9a4d2fbce4ae 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -552,7 +552,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) ctx->cqe_cached = ctx->cqe_sentinel = NULL; WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped)); - WRITE_ONCE(n.rings->sq_flags, READ_ONCE(o.rings->sq_flags)); + atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags)); WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags)); WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow)); From d63b0e8a628e62ca85a0f7915230186bb92f8bb4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 00:55:24 +0000 Subject: [PATCH 11/20] io_uring: fix multishots with selected buffers We do io_kbuf_recycle() when arming a poll but every iteration of a multishot can grab more buffers, which is why we need to flush the kbuf ring state before continuing with waiting. Cc: stable@vger.kernel.org Fixes: b3fdea6ecb55c ("io_uring: multishot recv") Reported-by: Muhammad Ramdhan Reported-by: Bing-Jhong Billy Jheng Reported-by: Jacob Soo Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1bfc9990fe435f1fc6152ca9efeba5eb3e68339c.1738025570.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/poll.c b/io_uring/poll.c index 356474c66f32..31b118133bb0 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -315,8 +315,10 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) ret = io_poll_check_events(req, ts); if (ret == IOU_POLL_NO_ACTION) { + io_kbuf_recycle(req, 0); return; } else if (ret == IOU_POLL_REQUEUE) { + io_kbuf_recycle(req, 0); __io_poll_execute(req, 0); return; } From 299276502d41cd86376f47b7e087d017eaa0f914 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 20:56:09 +0000 Subject: [PATCH 12/20] io_uring: include all deps for alloc_cache.h alloc_cache.h uses types it doesn't declare and thus depends on the order in which it's included. Make it self contained and pull all needed definitions. Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/39569f3d5b250b4fe78bb609d57f67d3736ebcc4.1738087204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index cca96aff3277..28436f413bd2 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -1,6 +1,8 @@ #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H +#include + /* * Don't allow the cache to grow beyond this size. */ From 16ac51a0a7aa051fd3b82fa077597488b5572d41 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 20:56:10 +0000 Subject: [PATCH 13/20] io_uring: dont ifdef io_alloc_cache_kasan() Use IS_ENABLED in io_alloc_cache_kasan() so at least it gets compile tested without KASAN. Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/35e53e83f6e16478dca0028a64a6cc905dc764d3.1738087204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 28436f413bd2..9eb374ad7490 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -8,18 +8,14 @@ */ #define IO_ALLOC_CACHE_MAX 128 -#if defined(CONFIG_KASAN) -static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) -{ - kfree(*iov); - *iov = NULL; - *nr = 0; -} -#else static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) { + if (IS_ENABLED(CONFIG_KASAN)) { + kfree(*iov); + *iov = NULL; + *nr = 0; + } } -#endif static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, void *entry) From d19af0e9366298aa60afc0fb51ffcbd6205edcee Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 20:56:11 +0000 Subject: [PATCH 14/20] io_uring: add alloc_cache.c Avoid inlining all and everything from alloc_cache.h and move cold bits into a new file. Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/06984c6cd58e703f7cfae5ab3067912f9f635a06.1738087204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/alloc_cache.c | 44 ++++++++++++++++++++++++++++++++++++++++++ io_uring/alloc_cache.h | 44 +++++++++--------------------------------- 3 files changed, 54 insertions(+), 36 deletions(-) create mode 100644 io_uring/alloc_cache.c diff --git a/io_uring/Makefile b/io_uring/Makefile index 53167bef37d7..d695b60dba4f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ sync.o msg_ring.o advise.o openclose.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ - truncate.o memmap.o + truncate.o memmap.o alloc_cache.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/alloc_cache.c b/io_uring/alloc_cache.c new file mode 100644 index 000000000000..58423888b736 --- /dev/null +++ b/io_uring/alloc_cache.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "alloc_cache.h" + +void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(const void *)) +{ + void *entry; + + if (!cache->entries) + return; + + while ((entry = io_alloc_cache_get(cache)) != NULL) + free(entry); + + kvfree(cache->entries); + cache->entries = NULL; +} + +/* returns false if the cache was initialized properly */ +bool io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, unsigned int size, + unsigned int init_bytes) +{ + cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); + if (!cache->entries) + return true; + + cache->nr_cached = 0; + cache->max_cached = max_nr; + cache->elem_size = size; + cache->init_clear = init_bytes; + return false; +} + +void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp) +{ + void *obj; + + obj = kmalloc(cache->elem_size, gfp); + if (obj && cache->init_clear) + memset(obj, 0, cache->init_clear); + return obj; +} diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 9eb374ad7490..0dd17d8ba93a 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -8,6 +8,14 @@ */ #define IO_ALLOC_CACHE_MAX 128 +void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(const void *)); +bool io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, unsigned int size, + unsigned int init_bytes); + +void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp); + static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) { if (IS_ENABLED(CONFIG_KASAN)) { @@ -57,41 +65,7 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) obj = io_alloc_cache_get(cache); if (obj) return obj; - - obj = kmalloc(cache->elem_size, gfp); - if (obj && cache->init_clear) - memset(obj, 0, cache->init_clear); - return obj; -} - -/* returns false if the cache was initialized properly */ -static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, - unsigned max_nr, unsigned int size, - unsigned int init_bytes) -{ - cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); - if (cache->entries) { - cache->nr_cached = 0; - cache->max_cached = max_nr; - cache->elem_size = size; - cache->init_clear = init_bytes; - return false; - } - return true; + return io_cache_alloc_new(cache, gfp); } -static inline void io_alloc_cache_free(struct io_alloc_cache *cache, - void (*free)(const void *)) -{ - void *entry; - - if (!cache->entries) - return; - - while ((entry = io_alloc_cache_get(cache)) != NULL) - free(entry); - - kvfree(cache->entries); - cache->entries = NULL; -} #endif From fefcb0dcd02fd34f808e91b13ce25f9847e52eb9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 20:56:12 +0000 Subject: [PATCH 15/20] io_uring/net: make io_net_vec_assign() return void io_net_vec_assign() can only return 0 and it doesn't make sense for it to fail, so make it return void. Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/7c1a2390c99e17d3ae4e8562063e572d3cdeb164.1738087204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 41eef286f8b9..e72205802055 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -168,7 +168,7 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) } /* assign new iovec to kmsg, if we need to */ -static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, +static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, struct iovec *iov) { if (iov) { @@ -178,7 +178,6 @@ static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, kfree(kmsg->free_iov); kmsg->free_iov = iov; } - return 0; } static inline void io_mshot_prep_retry(struct io_kiocb *req, @@ -240,7 +239,8 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, if (unlikely(ret < 0)) return ret; - return io_net_vec_assign(req, iomsg, iov); + io_net_vec_assign(req, iomsg, iov); + return 0; } #endif @@ -299,7 +299,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, if (unlikely(ret < 0)) return ret; - return io_net_vec_assign(req, iomsg, iov); + io_net_vec_assign(req, iomsg, iov); + return 0; } static int io_sendmsg_copy_hdr(struct io_kiocb *req, From 2b350f756b7acf84afab31d65ce6e3d496213ae5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 20:56:13 +0000 Subject: [PATCH 16/20] io_uring/net: clean io_msg_copy_hdr() Put msg->msg_iov into a local variable in io_msg_copy_hdr(), it reads better and clearly shows the used types. Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/6a5d4f7a96b10e571d6128be010166b3aaf7afd5.1738087204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index e72205802055..dedf274fc049 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -280,11 +280,12 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, ret = -EINVAL; goto ua_end; } else { + struct iovec __user *uiov = msg->msg_iov; + /* we only need the length for provided buffers */ - if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) + if (!access_ok(&uiov->iov_len, sizeof(uiov->iov_len))) goto ua_end; - unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len, - ua_end); + unsafe_get_user(iov->iov_len, &uiov->iov_len, ua_end); sr->len = iov->iov_len; } ret = 0; From 86e62354eef16993834be5bd218d38ec96c47f16 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 20:56:14 +0000 Subject: [PATCH 17/20] io_uring/net: extract io_send_select_buffer() Extract a helper out of io_send() for provided buffer selection to improve readability as it has grown to take too many lines. Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/26a769cdabd61af7f40c5d88a22469c5ad071796.1738087204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 87 +++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index dedf274fc049..4d21f7bd2149 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -566,6 +566,54 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, + struct io_async_msghdr *kmsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + + int ret; + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .max_len = min_not_zero(sr->len, INT_MAX), + .nr_iovs = 1, + }; + + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode = KBUF_MODE_FREE; + } + + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) + arg.nr_iovs = 1; + else + arg.mode |= KBUF_MODE_EXPAND; + + ret = io_buffers_select(req, &arg, issue_flags); + if (unlikely(ret < 0)) + return ret; + + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + sr->len = arg.out_len; + + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } else { + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, + arg.iovs, ret, arg.out_len); + } + + return 0; +} + int io_send(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -589,44 +637,9 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) retry_bundle: if (io_do_buffer_select(req)) { - struct buf_sel_arg arg = { - .iovs = &kmsg->fast_iov, - .max_len = min_not_zero(sr->len, INT_MAX), - .nr_iovs = 1, - }; - - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; - arg.mode = KBUF_MODE_FREE; - } - - if (!(sr->flags & IORING_RECVSEND_BUNDLE)) - arg.nr_iovs = 1; - else - arg.mode |= KBUF_MODE_EXPAND; - - ret = io_buffers_select(req, &arg, issue_flags); - if (unlikely(ret < 0)) + ret = io_send_select_buffer(req, issue_flags, kmsg); + if (ret) return ret; - - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; - req->flags |= REQ_F_NEED_CLEANUP; - } - sr->len = arg.out_len; - - if (ret == 1) { - sr->buf = arg.iovs[0].iov_base; - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } else { - iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, - arg.iovs, ret, arg.out_len); - } } /* From 0d124578fed92cadeaca47d734da782beacdc1a7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 20:56:15 +0000 Subject: [PATCH 18/20] io_uring: remove !KASAN guards from cache free Test setups (with KASAN) will avoid !KASAN sections, and so it's not testing paths that would be exercised otherwise. That's bad as to be sure that your code works you now have to specifically test both KASAN and !KASAN configs. Remove !CONFIG_KASAN guards from io_netmsg_cache_free() and io_rw_cache_free(). The free functions should always be getting valid entries, and even though for KASAN iovecs should already be cleared, that's better than skipping the chunks completely. Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/d6078a51c7137a243f9d00849bc3daa660873209.1738087204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 -- io_uring/rw.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 4d21f7bd2149..d89c39f853e3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1813,10 +1813,8 @@ void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; -#if !defined(CONFIG_KASAN) if (kmsg->free_iov) io_netmsg_iovec_free(kmsg); -#endif kfree(kmsg); } #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index 991ecfbea88e..c496f195aae2 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1309,9 +1309,7 @@ void io_rw_cache_free(const void *entry) { struct io_async_rw *rw = (struct io_async_rw *) entry; -#if !defined(CONFIG_KASAN) if (rw->free_iovec) io_rw_iovec_free(rw); -#endif kfree(rw); } From d1fdab8c06791945d9454fb430951533eba9e175 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 28 Jan 2025 20:56:16 +0000 Subject: [PATCH 19/20] io_uring/rw: simplify io_rw_recycle() Instead of freeing iovecs in case of IO_URING_F_UNLOCKED in io_rw_recycle(), leave it be and rely on the core io_uring code to call io_readv_writev_cleanup() later. This way the iovec will get recycled and we can clean up io_rw_recycle() and kill io_rw_iovec_free(). Signed-off-by: Pavel Begunkov Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/14f83b112eb40078bea18e15d77a4f99fc981a44.1738087204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index c496f195aae2..7aa1e4c9f64a 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -146,23 +146,13 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req, return 0; } -static void io_rw_iovec_free(struct io_async_rw *rw) -{ - if (rw->free_iovec) { - kfree(rw->free_iovec); - rw->free_iov_nr = 0; - rw->free_iovec = NULL; - } -} - static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_rw *rw = req->async_data; - if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { - io_rw_iovec_free(rw); + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) return; - } + io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr); if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { req->async_data = NULL; @@ -1310,6 +1300,6 @@ void io_rw_cache_free(const void *entry) struct io_async_rw *rw = (struct io_async_rw *) entry; if (rw->free_iovec) - io_rw_iovec_free(rw); + kfree(rw->free_iovec); kfree(rw); } From 8c8492ca64e79c6e0f433e8c9d2bcbd039ef83d0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Jan 2025 08:40:29 -0700 Subject: [PATCH 20/20] io_uring/net: don't retry connect operation on EPOLLERR If a socket is shutdown before the connection completes, POLLERR is set in the poll mask. However, connect ignores this as it doesn't know, and attempts the connection again. This may lead to a bogus -ETIMEDOUT result, where it should have noticed the POLLERR and just returned -ECONNRESET instead. Have the poll logic check for whether or not POLLERR is set in the mask, and if so, mark the request as failed. Then connect can appropriately fail the request rather than retry it. Reported-by: Sergey Galas Cc: stable@vger.kernel.org Link: https://github.com/axboe/liburing/discussions/1335 Fixes: 3fb1bd688172 ("io_uring/net: handle -EINPROGRESS correct for IORING_OP_CONNECT") Signed-off-by: Jens Axboe --- io_uring/net.c | 5 +++++ io_uring/poll.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index d89c39f853e3..17852a6616ff 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1710,6 +1710,11 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (unlikely(req->flags & REQ_F_FAIL)) { + ret = -ECONNRESET; + goto out; + } + file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, diff --git a/io_uring/poll.c b/io_uring/poll.c index 31b118133bb0..bb1c0cd4f809 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -273,6 +273,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_REISSUE; } } + if (unlikely(req->cqe.res & EPOLLERR)) + req_set_fail(req); if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE;