diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 5602241ac9737f..7c79e25b45f210 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -95,4 +95,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
 
 	  If unsure, say "N".
 
+config VHOST_BLK
+	tristate "Host kernel accelerator for virtio-blk"
+	depends on BLOCK && EVENTFD
+	select VHOST
+	default n
+	help
+	  This kernel module can be loaded in the host kernel to accelerate
+	  guest VMs using the virtio-blk driver.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called vhost_blk.
+
 endif
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 52c1a8e37f19bf..d08c8dde89aa82 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -18,5 +18,8 @@ obj-$(CONFIG_VHOST)	+= vhost.o
 obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
 vhost_iotlb-y := iotlb.o
 
+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
+
 obj-$(CONFIG_VHOST_XEN) += vhost_xen.o
 vhost_xen-y := xen.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 00000000000000..dfb6579a441229
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,819 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2011 Taobao, Inc.
+ * Author: Liu Yuan
+ *
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Author: Asias He
+ *
+ * Copyright (c) 2022 Virtuozzo International GmbH.
+ * Author: Andrey Zhadchenko
+ *
+ * virtio-blk host kernel accelerator.
+ */
+
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/uio.h>
+#include <linux/llist.h>
+#include <linux/virtio_blk.h>
+#include <linux/vhost.h>
+
+#include "vhost.h"
+
+enum {
+	VHOST_BLK_FEATURES = VHOST_FEATURES |
+			     (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+			     (1ULL << VIRTIO_RING_F_EVENT_IDX) |
+			     (1ULL << VIRTIO_BLK_F_MQ) |
+			     (1ULL << VIRTIO_BLK_F_FLUSH),
+};
+
+/*
+ * Max number of bytes transferred before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others.
+ */
+#define VHOST_DEV_WEIGHT 0x80000
+
+/*
+ * Max number of requests transferred before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others with
+ * small requests.
+ */
+#define VHOST_DEV_PKT_WEIGHT 256
+
+#define VHOST_BLK_VQ_MAX 16
+
+#define VHOST_MAX_METADATA_IOV 1
+
+#define VHOST_BLK_SECTOR_BITS 9
+#define VHOST_BLK_SECTOR_SIZE (1 << VHOST_BLK_SECTOR_BITS)
+#define VHOST_BLK_SECTOR_MASK (VHOST_BLK_SECTOR_SIZE - 1)
+
+struct req_page_list {
+	struct page **pages;
+	int pages_nr;
+};
+
+#define NR_INLINE 16
+
+struct vhost_blk_req {
+	struct req_page_list inline_pl[NR_INLINE];
+	struct page *inline_page[NR_INLINE];
+	struct bio *inline_bio[NR_INLINE];
+	struct req_page_list *pl;
+	int during_flush;
+	bool use_inline;
+
+	struct llist_node llnode;
+
+	struct vhost_blk *blk;
+
+	struct iovec *iov;
+	int iov_nr;
+
+	struct bio **bio;
+	atomic_t bio_nr;
+
+	struct iovec status[VHOST_MAX_METADATA_IOV];
+
+	sector_t sector;
+	int bi_opf;
+	u16 head;
+	long len;
+	int bio_err;
+
+	struct vhost_blk_vq *blk_vq;
+};
+
+struct vhost_blk_vq {
+	struct vhost_virtqueue vq;
+	struct vhost_blk_req *req;
+	struct iovec iov[UIO_MAXIOV];
+	struct llist_head llhead;
+	struct vhost_work work;
+};
+
+struct vhost_blk {
+	wait_queue_head_t flush_wait;
+	struct vhost_blk_vq vqs[VHOST_BLK_VQ_MAX];
+	atomic_t req_inflight[2];
+	spinlock_t flush_lock;
+	struct vhost_dev dev;
+	int during_flush;
+	struct file *backend;
+	int index;
+};
+
+static int gen;
+
+/*
+ * Copy up to @len bytes worth of segments from @from into @to, consuming
+ * the source iovecs as it goes.  Returns the number of segments used in
+ * @to, or -1 if @len bytes did not fit.
+ */
+static int move_iovec(struct iovec *from, struct iovec *to,
+		      size_t len, int iov_count_from, int iov_count_to)
+{
+	int moved_seg = 0, spent_seg = 0;
+	size_t size;
+
+	while (len && spent_seg < iov_count_from && moved_seg < iov_count_to) {
+		if (from->iov_len == 0) {
+			++from;
+			++spent_seg;
+			continue;
+		}
+		size = min(from->iov_len, len);
+		to->iov_base = from->iov_base;
+		to->iov_len = size;
+		from->iov_len -= size;
+		from->iov_base += size;
+		len -= size;
+		++from;
+		++to;
+		++moved_seg;
+		++spent_seg;
+	}
+
+	return len ? -1 : moved_seg;
+}
+
+static inline int iov_num_pages(struct iovec *iov)
+{
+	return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
+		((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
+}
+
+static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status)
+{
+	struct iov_iter iter;
+	int ret;
+
+	/* The status iovec is the destination here, hence READ. */
+	iov_iter_init(&iter, READ, req->status, ARRAY_SIZE(req->status), sizeof(status));
+	ret = copy_to_iter(&status, sizeof(status), &iter);
+	if (ret != sizeof(status)) {
+		vq_err(&req->blk_vq->vq, "Failed to write status\n");
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static void vhost_blk_req_done(struct bio *bio)
+{
+	struct vhost_blk_req *req = bio->bi_private;
+
+	req->bio_err = blk_status_to_errno(bio->bi_status);
+
+	if (atomic_dec_and_test(&req->bio_nr)) {
+		llist_add(&req->llnode, &req->blk_vq->llhead);
+		vhost_work_vqueue(&req->blk_vq->vq, &req->blk_vq->work);
+	}
+
+	bio_put(bio);
+}
+
+static void vhost_blk_req_umap(struct vhost_blk_req *req)
+{
+	struct req_page_list *pl;
+	int i, j;
+
+	if (req->pl) {
+		for (i = 0; i < req->iov_nr; i++) {
+			pl = &req->pl[i];
+
+			for (j = 0; j < pl->pages_nr; j++) {
+				if (req->bi_opf == REQ_OP_READ)
+					set_page_dirty_lock(pl->pages[j]);
+				put_page(pl->pages[j]);
+			}
+		}
+	}
+
+	if (!req->use_inline)
+		kfree(req->pl);
+}
+
+static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
+				     struct block_device *bdev)
+{
+	struct bio *bio;
+
+	req->use_inline = true;
+	req->pl = NULL;
+	req->bio = req->inline_bio;
+
+	bio = bio_alloc(bdev, 0, req->bi_opf, GFP_KERNEL);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_iter.bi_sector = req->sector;
+	bio->bi_private = req;
+	bio->bi_end_io = vhost_blk_req_done;
+	req->bio[0] = bio;
+
+	atomic_set(&req->bio_nr, 1);
+
+	return 0;
+}
+
+static struct page **vhost_blk_prepare_req(struct vhost_blk_req *req,
+					   int total_pages, int iov_nr)
+{
+	int pl_len, page_len, bio_len;
+	void *buf;
+
+	req->use_inline = false;
+	pl_len = iov_nr * sizeof(req->pl[0]);
+	page_len = total_pages * sizeof(struct page *);
+	bio_len = total_pages * sizeof(struct bio *);
+
+	buf = kmalloc(pl_len + page_len + bio_len, GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	req->pl = buf;
+	req->bio = buf + pl_len + page_len;
+
+	return buf + pl_len;
+}
+
+static int vhost_blk_bio_make(struct vhost_blk_req *req,
+			      struct block_device *bdev)
+{
+	int pages_nr_total, i, j, ret;
+	struct iovec *iov = req->iov;
+	int iov_nr = req->iov_nr;
+	struct page **pages, *page;
+	struct bio *bio = NULL;
+	int bio_nr = 0;
+
+	if (unlikely(req->bi_opf == REQ_OP_FLUSH))
+		return vhost_blk_bio_make_simple(req, bdev);
+
+	pages_nr_total = 0;
+	for (i = 0; i < iov_nr; i++)
+		pages_nr_total += iov_num_pages(&iov[i]);
+
+	if (pages_nr_total > NR_INLINE) {
+		pages = vhost_blk_prepare_req(req, pages_nr_total, iov_nr);
+		if (!pages)
+			return -ENOMEM;
+	} else {
+		req->use_inline = true;
+		req->pl = req->inline_pl;
+		pages = req->inline_page;
+		req->bio = req->inline_bio;
+	}
+
+	req->iov_nr = 0;
+	for (i = 0; i < iov_nr; i++) {
+		int pages_nr = iov_num_pages(&iov[i]);
+		unsigned long iov_base, iov_len;
+		struct req_page_list *pl;
+
+		iov_base = (unsigned long)iov[i].iov_base;
+		iov_len = (unsigned long)iov[i].iov_len;
+
+		/* Reads from the device land in these pages: pin them writable. */
+		ret = get_user_pages_fast(iov_base, pages_nr,
+					  req->bi_opf == REQ_OP_READ ? FOLL_WRITE : 0,
+					  pages);
+		if (ret != pages_nr) {
+			/* Drop whatever a short pin did take before bailing out. */
+			while (ret > 0)
+				put_page(pages[--ret]);
+			goto fail;
+		}
+
+		req->iov_nr++;
+		pl = &req->pl[i];
+		pl->pages_nr = pages_nr;
+		pl->pages = pages;
+
+		for (j = 0; j < pages_nr; j++) {
+			unsigned int off, len, pos;
+
+			page = pages[j];
+			off = iov_base & ~PAGE_MASK;
+			len = PAGE_SIZE - off;
+			if (len > iov_len)
+				len = iov_len;
+
+			while (!bio || !bio_add_page(bio, page, len, off)) {
+				bio = bio_alloc(bdev, bio_max_segs(pages_nr_total),
+						req->bi_opf, GFP_KERNEL);
+				if (!bio)
+					goto fail;
+				bio->bi_iter.bi_sector = req->sector;
+				bio->bi_private = req;
+				bio->bi_end_io = vhost_blk_req_done;
+				req->bio[bio_nr++] = bio;
+			}
+
+			iov_base += len;
+			iov_len -= len;
+
+			pos = (iov_base & VHOST_BLK_SECTOR_MASK) + iov_len;
+			req->sector += pos >> VHOST_BLK_SECTOR_BITS;
+		}
+
+		pages += pages_nr;
+	}
+	atomic_set(&req->bio_nr, bio_nr);
+	return 0;
+
+fail:
+	for (i = 0; i < bio_nr; i++)
+		bio_put(req->bio[i]);
+	vhost_blk_req_umap(req);
+	return -ENOMEM;
+}
+
+static inline void vhost_blk_bio_send(struct vhost_blk_req *req)
+{
+	struct blk_plug plug;
+	int i, bio_nr;
+
+	bio_nr = atomic_read(&req->bio_nr);
+	blk_start_plug(&plug);
+	for (i = 0; i < bio_nr; i++)
+		submit_bio(req->bio[i]);
+	blk_finish_plug(&plug);
+}
+
+static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct block_device *bdev = I_BDEV(inode);
+	int ret;
+
+	ret = vhost_blk_bio_make(req, bdev);
+	if (ret < 0)
+		return ret;
+
+	vhost_blk_bio_send(req);
+
+	spin_lock(&req->blk->flush_lock);
+	req->during_flush = req->blk->during_flush;
+	atomic_inc(&req->blk->req_inflight[req->during_flush]);
+	spin_unlock(&req->blk->flush_lock);
+
+	return ret;
+}
+
+static int vhost_blk_req_handle(struct vhost_virtqueue *vq,
+				struct virtio_blk_outhdr *hdr,
+				u16 head, u16 total_iov_nr,
+				struct file *file)
+{
+	struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev);
+	struct vhost_blk_vq *blk_vq = container_of(vq, struct vhost_blk_vq, vq);
+	char id[VIRTIO_BLK_ID_BYTES];
+	struct vhost_blk_req *req;
+	struct iov_iter iter;
+	int ret, len;
+	u8 status;
+
+	req = &blk_vq->req[head];
+	req->blk_vq = blk_vq;
+	req->head = head;
+	req->blk = blk;
+	req->sector = hdr->sector;
+	req->iov = blk_vq->iov;
+
+	req->len = iov_length(vq->iov, total_iov_nr) - sizeof(status);
+	req->iov_nr = move_iovec(vq->iov, req->iov, req->len, total_iov_nr,
+				 ARRAY_SIZE(blk_vq->iov));
+
+	ret = move_iovec(vq->iov, req->status, sizeof(status), total_iov_nr,
+			 ARRAY_SIZE(req->status));
+	if (ret < 0 || req->iov_nr < 0)
+		return -EINVAL;
+
+	switch (hdr->type) {
+	case VIRTIO_BLK_T_OUT:
+		req->bi_opf = REQ_OP_WRITE;
+		ret = vhost_blk_req_submit(req, file);
+		break;
+	case VIRTIO_BLK_T_IN:
+		req->bi_opf = REQ_OP_READ;
+		ret = vhost_blk_req_submit(req, file);
+		break;
+	case VIRTIO_BLK_T_FLUSH:
+		req->bi_opf = REQ_OP_FLUSH;
+		ret = vhost_blk_req_submit(req, file);
+		break;
+	case VIRTIO_BLK_T_GET_ID:
+		len = snprintf(id, VIRTIO_BLK_ID_BYTES, "vhost-blk%d", blk->index);
+		iov_iter_init(&iter, READ, req->iov, req->iov_nr, req->len);
+		ret = copy_to_iter(id, len, &iter);
+		status = ret != len ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+		ret = vhost_blk_set_status(req, status);
+		if (ret)
+			break;
+		vhost_add_used_and_signal(&blk->dev, vq, head, len);
+		break;
+	default:
+		vq_err(vq, "Unsupported request type %d\n", hdr->type);
+		status = VIRTIO_BLK_S_UNSUPP;
+		ret = vhost_blk_set_status(req, status);
+		if (ret)
+			break;
+		vhost_add_used_and_signal(&blk->dev, vq, head, 0);
+	}
+
+	return ret;
+}
+
+static void vhost_blk_handle_guest_kick(struct vhost_work *work)
+{
+	struct virtio_blk_outhdr hdr;
+	struct vhost_blk_vq *blk_vq;
+	struct vhost_virtqueue *vq;
+	struct iovec hdr_iovec[VHOST_MAX_METADATA_IOV];
+	struct vhost_blk *blk;
+	struct iov_iter iter;
+	int in, out, ret;
+	struct file *f;
+	int head;
+
+	vq = container_of(work, struct vhost_virtqueue, poll.work);
+	blk = container_of(vq->dev, struct vhost_blk, dev);
+	blk_vq = container_of(vq, struct vhost_blk_vq, vq);
+
+	f = vhost_vq_get_backend(vq);
+	if (!f)
+		return;
+
+	vhost_disable_notify(&blk->dev, vq);
+	for (;;) {
+		head = vhost_get_vq_desc(vq, vq->iov,
+					 ARRAY_SIZE(vq->iov),
+					 &out, &in, NULL, NULL);
+		if (unlikely(head < 0))
+			break;
+
+		if (unlikely(head == vq->num)) {
+			if (unlikely(vhost_enable_notify(&blk->dev, vq))) {
+				vhost_disable_notify(&blk->dev, vq);
+				continue;
+			}
+			break;
+		}
+
+		ret = move_iovec(vq->iov, hdr_iovec, sizeof(hdr), in + out, ARRAY_SIZE(hdr_iovec));
+		if (ret < 0) {
+			vq_err(vq, "virtio_blk_outhdr is too fragmented!");
+			vhost_discard_vq_desc(vq, 1);
+			break;
+		}
+
+		/* The header iovec is the source here, hence WRITE. */
+		iov_iter_init(&iter, WRITE, hdr_iovec, ARRAY_SIZE(hdr_iovec), sizeof(hdr));
+		ret = copy_from_iter(&hdr, sizeof(hdr), &iter);
+		if (ret != sizeof(hdr)) {
+			vq_err(vq, "Failed to get block header: read %d bytes instead of %zu!\n",
+			       ret, sizeof(hdr));
+			vhost_discard_vq_desc(vq, 1);
+			break;
+		}
+
+		if (vhost_blk_req_handle(vq, &hdr, head, out + in, f) < 0) {
+			vhost_discard_vq_desc(vq, 1);
+			break;
+		}
+
+		if (!llist_empty(&blk_vq->llhead)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+}
+
+static void vhost_blk_handle_host_kick(struct vhost_work *work)
+{
+	struct vhost_blk_vq *blk_vq;
+	struct vhost_virtqueue *vq;
+	struct vhost_blk_req *req;
+	struct llist_node *llnode;
+	struct vhost_blk *blk = NULL;
+	bool added, zero;
+	u8 status;
+	int ret;
+
+	blk_vq = container_of(work, struct vhost_blk_vq, work);
+	vq = &blk_vq->vq;
+	llnode = llist_del_all(&blk_vq->llhead);
+	added = false;
+	while (llnode) {
+		req = llist_entry(llnode, struct vhost_blk_req, llnode);
+		llnode = llist_next(llnode);
+
+		if (!blk)
+			blk = req->blk;
+
+		vhost_blk_req_umap(req);
+
+		status = req->bio_err == 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
+		ret = vhost_blk_set_status(req, status);
+		if (unlikely(ret))
+			continue;
+
+		vhost_add_used(vq, req->head, req->len);
+		added = true;
+
+		spin_lock(&req->blk->flush_lock);
+		zero = atomic_dec_and_test(
+				&req->blk->req_inflight[req->during_flush]);
+		if (zero && !req->during_flush)
+			wake_up(&blk->flush_wait);
+		spin_unlock(&req->blk->flush_lock);
+	}
+
+	if (likely(added))
+		vhost_signal(&blk->dev, vq);
+}
+
+static void vhost_blk_flush(struct vhost_blk *blk)
+{
+	spin_lock(&blk->flush_lock);
+	blk->during_flush = 1;
+	spin_unlock(&blk->flush_lock);
+
+	vhost_dev_flush(&blk->dev);
+	/*
+	 * Wait until the requests fired before the flush are finished.
+	 * req_inflight[0] tracks the requests fired before the flush,
+	 * req_inflight[1] tracks the requests fired during the flush.
+	 */
+	wait_event(blk->flush_wait, !atomic_read(&blk->req_inflight[0]));
+
+	spin_lock(&blk->flush_lock);
+	blk->during_flush = 0;
+	spin_unlock(&blk->flush_lock);
+}
+
+static inline void vhost_blk_drop_backends(struct vhost_blk *blk)
+{
+	struct vhost_virtqueue *vq;
+	int i;
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
+		vq = &blk->vqs[i].vq;
+
+		mutex_lock(&vq->mutex);
+		vhost_vq_set_backend(vq, NULL);
+		mutex_unlock(&vq->mutex);
+	}
+}
+
+static int vhost_blk_open(struct inode *inode, struct file *file)
+{
+	struct vhost_blk *blk;
+	struct vhost_virtqueue **vqs;
+	int ret = 0, i = 0;
+
+	blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
+	if (!blk) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	vqs = kcalloc(VHOST_BLK_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
+	if (!vqs) {
+		ret = -ENOMEM;
+		goto out_blk;
+	}
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
+		blk->vqs[i].vq.handle_kick = vhost_blk_handle_guest_kick;
+		vqs[i] = &blk->vqs[i].vq;
+	}
+
+	blk->index = gen++;
+
+	atomic_set(&blk->req_inflight[0], 0);
+	atomic_set(&blk->req_inflight[1], 0);
+	blk->during_flush = 0;
+	spin_lock_init(&blk->flush_lock);
+	init_waitqueue_head(&blk->flush_wait);
+
+	vhost_dev_init(&blk->dev, vqs, VHOST_BLK_VQ_MAX, UIO_MAXIOV,
+		       VHOST_DEV_PKT_WEIGHT, VHOST_DEV_WEIGHT, true, NULL);
+	file->private_data = blk;
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++)
+		vhost_work_init(&blk->vqs[i].work, vhost_blk_handle_host_kick);
+
+	return ret;
+out_blk:
+	kvfree(blk);
+out:
+	return ret;
+}
+
+static int vhost_blk_release(struct inode *inode, struct file *f)
+{
+	struct vhost_blk *blk = f->private_data;
+	int i;
+
+	vhost_blk_drop_backends(blk);
+	vhost_blk_flush(blk);
+	vhost_dev_stop(&blk->dev);
+	if (blk->backend)
+		fput(blk->backend);
+	vhost_dev_cleanup(&blk->dev);
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++)
+		kvfree(blk->vqs[i].req);
+	kfree(blk->dev.vqs);
+	kvfree(blk);
+
+	return 0;
+}
+
+static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
+{
+	struct vhost_virtqueue *vq;
+	int i;
+
+	mutex_lock(&blk->dev.mutex);
+	if ((features & (1ULL << VHOST_F_LOG_ALL)) &&
+	    !vhost_log_access_ok(&blk->dev)) {
+		mutex_unlock(&blk->dev.mutex);
+		return -EFAULT;
+	}
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
+		vq = &blk->vqs[i].vq;
+		mutex_lock(&vq->mutex);
+		vq->acked_features = features & VHOST_BLK_FEATURES;
+		mutex_unlock(&vq->mutex);
+	}
+
+	vhost_blk_flush(blk);
+	mutex_unlock(&blk->dev.mutex);
+
+	return 0;
+}
+
+static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
+{
+	struct vhost_virtqueue *vq;
+	struct file *file;
+	struct inode *inode;
+	int ret, i;
+
+	mutex_lock(&blk->dev.mutex);
+	ret = vhost_dev_check_owner(&blk->dev);
+	if (ret)
+		goto out_dev;
+
+	if (blk->backend) {
+		ret = -EBUSY;
+		goto out_dev;
+	}
+
+	file = fget(fd);
+	if (!file) {
+		ret = -EBADF;
+		goto out_dev;
+	}
+
+	inode = file->f_mapping->host;
+	if (!S_ISBLK(inode->i_mode)) {
+		ret = -EINVAL;
+		goto out_file;
+	}
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
+		vq = &blk->vqs[i].vq;
+		if (!vhost_vq_access_ok(vq)) {
+			ret = -EFAULT;
+			goto out_drop;
+		}
+
+		mutex_lock(&vq->mutex);
+		vhost_vq_set_backend(vq, file);
+		ret = vhost_vq_init_access(vq);
+		mutex_unlock(&vq->mutex);
+		if (ret)
+			goto out_drop;
+	}
+
+	blk->backend = file;
+
+	mutex_unlock(&blk->dev.mutex);
+	return 0;
+
+out_drop:
+	vhost_blk_drop_backends(blk);
+out_file:
+	fput(file);
+out_dev:
+	mutex_unlock(&blk->dev.mutex);
+	return ret;
+}
+
+static long vhost_blk_reset_owner(struct vhost_blk *blk)
+{
+	struct vhost_iotlb *umem;
+	int err, i;
+
+	mutex_lock(&blk->dev.mutex);
+	err = vhost_dev_check_owner(&blk->dev);
+	if (err)
+		goto done;
+	umem = vhost_dev_reset_owner_prepare();
+	if (!umem) {
+		err = -ENOMEM;
+		goto done;
+	}
+	vhost_blk_drop_backends(blk);
+	if (blk->backend) {
+		fput(blk->backend);
+		blk->backend = NULL;
+	}
+	vhost_blk_flush(blk);
+	vhost_dev_stop(&blk->dev);
+	vhost_dev_reset_owner(&blk->dev, umem);
+
+	for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
+		kvfree(blk->vqs[i].req);
+		blk->vqs[i].req = NULL;
+	}
+
+done:
+	mutex_unlock(&blk->dev.mutex);
+	return err;
+}
+
+static int vhost_blk_setup(struct vhost_blk *blk, void __user *argp)
+{
+	struct vhost_vring_state s;
+
+	if (copy_from_user(&s, argp, sizeof(s)))
+		return -EFAULT;
+
+	if (s.index >= VHOST_BLK_VQ_MAX)
+		return -EINVAL;
+
+	if (blk->vqs[s.index].req)
+		return 0;
+
+	blk->vqs[s.index].req = kvmalloc_array(s.num, sizeof(struct vhost_blk_req),
+					       GFP_KERNEL);
+	if (!blk->vqs[s.index].req)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
+			    unsigned long arg)
+{
+	struct vhost_blk *blk = f->private_data;
+	void __user *argp = (void __user *)arg;
+	struct vhost_vring_file backend;
+	u64 __user *featurep = argp;
+	u64 features;
+	int ret;
+
+	switch (ioctl) {
+	case VHOST_BLK_SET_BACKEND:
+		if (copy_from_user(&backend, argp, sizeof(backend)))
+			return -EFAULT;
+		return vhost_blk_set_backend(blk, backend.fd);
+	case VHOST_GET_FEATURES:
+		features = VHOST_BLK_FEATURES;
+		if (copy_to_user(featurep, &features, sizeof(features)))
+			return -EFAULT;
+		return 0;
+	case VHOST_SET_FEATURES:
+		if (copy_from_user(&features, featurep, sizeof(features)))
+			return -EFAULT;
+		if (features & ~VHOST_BLK_FEATURES)
+			return -EOPNOTSUPP;
+		return vhost_blk_set_features(blk, features);
+	case VHOST_RESET_OWNER:
+		return vhost_blk_reset_owner(blk);
+	default:
+		mutex_lock(&blk->dev.mutex);
+		ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
+		if (ret == -ENOIOCTLCMD)
+			ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
+		if (!ret && ioctl == VHOST_SET_VRING_NUM)
+			ret = vhost_blk_setup(blk, argp);
+		vhost_blk_flush(blk);
+		mutex_unlock(&blk->dev.mutex);
+		return ret;
+	}
+}
+
+static const struct file_operations vhost_blk_fops = {
+	.owner          = THIS_MODULE,
+	.open           = vhost_blk_open,
+	.release        = vhost_blk_release,
+	.llseek         = noop_llseek,
+	.unlocked_ioctl = vhost_blk_ioctl,
+};
+
+static struct miscdevice vhost_blk_misc = {
+	MISC_DYNAMIC_MINOR,
+	"vhost-blk",
+	&vhost_blk_fops,
+};
+module_misc_device(vhost_blk_misc);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Andrey Zhadchenko");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio_blk");
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6f8542535afeea..923d27d4c6942b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1406,16 +1406,9 @@ static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
 	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
 }
 
-static void vhost_net_flush_vq(struct vhost_net *n, int index)
-{
-	vhost_poll_flush(n->poll + index);
-	vhost_poll_flush(&n->vqs[index].vq.poll);
-}
-
 static void vhost_net_flush(struct vhost_net *n)
 {
-	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
-	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
+	vhost_dev_flush(&n->dev);
 	if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
 		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
 		n->tx_flush = true;
@@ -1605,7 +1598,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	}
 
 	if (oldsock) {
-		vhost_net_flush_vq(n, index);
+		vhost_dev_flush(&n->dev);
 		sockfd_put(oldsock);
 	}
 
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 4ce9f00ae10e84..fe7570c1b6e9b2 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1470,8 +1470,7 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
 	/* Flush both the vhost poll and vhost work */
 	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
 		vhost_scsi_flush_vq(vs, i);
-	vhost_work_flush(&vs->dev, &vs->vs_completion_work);
-	vhost_work_flush(&vs->dev, &vs->vs_event_work);
+	vhost_dev_flush(&vs->dev);
 
 	/* Wait for all reqs issued before the flush to be finished */
 	for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index a09dedc79f6820..39e71f431d8810 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -146,7 +146,7 @@ static void vhost_test_stop(struct vhost_test *n, void **privatep)
 
 static void vhost_test_flush_vq(struct vhost_test *n, int index)
 {
-	vhost_poll_flush(&n->vqs[index].poll);
+	vhost_dev_flush(&n->dev);
 }
 
 static void vhost_test_flush(struct vhost_test *n)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 601e8b35ea218a..f3941ee74affbb 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -170,7 +170,7 @@ static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
 	if (!(key_to_poll(key) & poll->mask))
 		return 0;
 
-	if (!poll->dev->use_worker)
+	if (!poll->vq->dev->use_worker)
 		work->fn(work);
 	else
 		vhost_poll_queue(poll);
@@ -185,19 +185,26 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
 }
 EXPORT_SYMBOL_GPL(vhost_work_init);
 
-/* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 		     __poll_t mask, struct vhost_dev *dev)
+{
+	vhost_poll_init_vq(poll, fn, mask, dev->vqs[0]);
+}
+EXPORT_SYMBOL_GPL(vhost_poll_init);
+
+/* Init poll structure */
+void vhost_poll_init_vq(struct vhost_poll *poll, vhost_work_fn_t fn,
+			__poll_t mask, struct vhost_virtqueue *vq)
 {
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
-	poll->dev = dev;
+	poll->vq = vq;
 	poll->wqh = NULL;
 
 	vhost_work_init(&poll->work, fn);
 }
-EXPORT_SYMBOL_GPL(vhost_poll_init);
+EXPORT_SYMBOL_GPL(vhost_poll_init_vq);
 
 /* Start polling a file. We add ourselves to file's wait queue. The caller must
  * keep a reference to a file until after vhost_poll_stop is called.
 */
@@ -231,44 +239,80 @@ void vhost_poll_stop(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
-void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+static void vhost_work_queue_at_worker(struct vhost_worker *w,
+				       struct vhost_work *work)
 {
-	struct vhost_flush_struct flush;
+	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
+		/* We can only add the work to the list after we're
+		 * sure it was not in the list.
+		 * test_and_set_bit() implies a memory barrier.
+		 */
+		llist_add(&work->node, &w->work_list);
+		wake_up_process(w->worker);
+	}
+}
+
+void vhost_dev_flush(struct vhost_dev *dev)
+{
+	struct vhost_flush_struct flush[VHOST_MAX_WORKERS];
+	int i, nworkers;
 
-	if (dev->worker) {
-		init_completion(&flush.wait_event);
-		vhost_work_init(&flush.work, vhost_flush_work);
+	nworkers = READ_ONCE(dev->nworkers);
 
-		vhost_work_queue(dev, &flush.work);
-		wait_for_completion(&flush.wait_event);
+	for (i = 0; i < nworkers; i++) {
+		init_completion(&flush[i].wait_event);
+		vhost_work_init(&flush[i].work, vhost_flush_work);
+		vhost_work_queue_at_worker(&dev->workers[i], &flush[i].work);
 	}
+
+	for (i = 0; i < nworkers; i++)
+		wait_for_completion(&flush[i].wait_event);
+}
+EXPORT_SYMBOL_GPL(vhost_dev_flush);
+
+static void vhost_worker_flush(struct vhost_worker *w)
+{
+	struct vhost_flush_struct flush;
+
+	init_completion(&flush.wait_event);
+	vhost_work_init(&flush.work, vhost_flush_work);
+	vhost_work_queue_at_worker(w, &flush.work);
+	wait_for_completion(&flush.wait_event);
 }
-EXPORT_SYMBOL_GPL(vhost_work_flush);
 
-/* Flush any work that has been scheduled. When calling this, don't hold any
- * locks that are also used by the callback. */
-void vhost_poll_flush(struct vhost_poll *poll)
+void vhost_work_flush_vq(struct vhost_virtqueue *vq)
 {
-	vhost_work_flush(poll->dev, &poll->work);
+	struct vhost_worker *w = READ_ONCE(vq->worker);
+
+	if (!w)
+		return;
+
+	vhost_worker_flush(w);
 }
-EXPORT_SYMBOL_GPL(vhost_poll_flush);
+EXPORT_SYMBOL_GPL(vhost_work_flush_vq);
 
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 {
-	if (!dev->worker)
+	struct vhost_worker *w = &dev->workers[0];
+
+	if (!w->worker)
 		return;
 
-	if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
-		/* We can only add the work to the list after we're
-		 * sure it was not in the list.
-		 * test_and_set_bit() implies a memory barrier.
-		 */
-		llist_add(&work->node, &dev->work_list);
-		wake_up_process(dev->worker);
-	}
+	vhost_work_queue_at_worker(w, work);
 }
 EXPORT_SYMBOL_GPL(vhost_work_queue);
 
+void vhost_work_vqueue(struct vhost_virtqueue *vq, struct vhost_work *work)
+{
+	struct vhost_worker *w = READ_ONCE(vq->worker);
+
+	if (!w)
+		return;
+
+	vhost_work_queue_at_worker(w, work);
+}
+EXPORT_SYMBOL_GPL(vhost_work_vqueue);
+
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_has_work(struct vhost_dev *dev)
 {
@@ -278,7 +322,7 @@ EXPORT_SYMBOL_GPL(vhost_has_work);
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	vhost_work_queue(poll->dev, &poll->work);
+	vhost_work_vqueue(poll->vq, &poll->work);
 }
 EXPORT_SYMBOL_GPL(vhost_poll_queue);
 
@@ -339,11 +383,32 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->iotlb = NULL;
 	vhost_vring_call_reset(&vq->call_ctx);
 	__vhost_vq_meta_reset(vq);
+	vq->worker = NULL;
+}
+
+static void vhost_worker_reset(struct vhost_worker *w)
+{
+	init_llist_head(&w->work_list);
+	w->worker = NULL;
+}
+
+static void vhost_cleanup_workers(struct vhost_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < dev->nworkers; ++i) {
+		WARN_ON(!llist_empty(&dev->workers[i].work_list));
+		kthread_stop(dev->workers[i].worker);
+		vhost_worker_reset(&dev->workers[i]);
+	}
+
+	dev->nworkers = 0;
 }
 
 static int vhost_worker(void *data)
 {
-	struct vhost_dev *dev = data;
+	struct vhost_worker *w = data;
+	struct vhost_dev *dev = w->dev;
 	struct vhost_work *work, *work_next;
 	struct llist_node *node;
 
@@ -358,7 +423,7 @@ static int vhost_worker(void *data)
 			break;
 		}
 
-		node = llist_del_all(&dev->work_list);
+		node = llist_del_all(&w->work_list);
 		if (!node)
 			schedule();
 
@@ -481,7 +546,6 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->umem = NULL;
 	dev->iotlb = NULL;
 	dev->mm = NULL;
-	dev->worker = NULL;
 	dev->iov_limit = iov_limit;
 	dev->weight = weight;
 	dev->byte_weight = byte_weight;
@@ -493,6 +557,11 @@ void vhost_dev_init(struct vhost_dev *dev,
 	INIT_LIST_HEAD(&dev->pending_list);
 	spin_lock_init(&dev->iotlb_lock);
+	dev->nworkers = 0;
+	for (i = 0; i < VHOST_MAX_WORKERS; ++i) {
+		dev->workers[i].dev = dev;
+		vhost_worker_reset(&dev->workers[i]);
+	}
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
@@ -506,8 +575,8 @@ void vhost_dev_init(struct vhost_dev *dev,
 #endif
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
-			vhost_poll_init(&vq->poll, vq->handle_kick,
-					EPOLLIN, dev);
+			vhost_poll_init_vq(&vq->poll, vq->handle_kick,
+					   EPOLLIN, vq);
 	}
 }
 EXPORT_SYMBOL_GPL(vhost_dev_init);
@@ -534,14 +603,14 @@ static void vhost_attach_cgroups_work(struct vhost_work *work)
 	s->ret = cgroup_attach_task_all(s->owner, current);
 }
 
-static int vhost_attach_cgroups(struct vhost_dev *dev)
+static int vhost_worker_attach_cgroups(struct vhost_worker *w)
 {
 	struct vhost_attach_cgroups_struct attach;
 
 	attach.owner = current;
 
 	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-	vhost_work_queue(dev, &attach.work);
-	vhost_work_flush(dev, &attach.work);
+	vhost_work_queue_at_worker(w, &attach.work);
+	vhost_worker_flush(w);
 	return attach.ret;
 }
 
@@ -582,51 +651,96 @@ static void vhost_detach_mm(struct vhost_dev *dev)
 	dev->mm = NULL;
 }
 
+static int vhost_add_worker(struct vhost_dev *dev)
+{
+	struct vhost_worker *w = &dev->workers[dev->nworkers];
+	struct task_struct *worker;
+	int err;
+
+	if (dev->nworkers == VHOST_MAX_WORKERS)
+		return -E2BIG;
+
+	worker = kthread_create(vhost_worker, w,
+				"vhost-%d-%d", current->pid, dev->nworkers);
+	if (IS_ERR(worker))
+		return PTR_ERR(worker);
+
+	w->worker = worker;
+	wake_up_process(worker); /* avoid contributing to loadavg */
+
+	err = vhost_worker_attach_cgroups(w);
+	if (err)
+		goto cleanup;
+
+	dev->nworkers++;
+	return 0;
+
+cleanup:
+	kthread_stop(worker);
+	w->worker = NULL;
+
+	return err;
+}
+
+static int vhost_set_workers(struct vhost_dev *dev, int n)
+{
+	int i, ret = 0;
+
+	if (n > dev->nvqs)
+		n = dev->nvqs;
+
+	if (n > VHOST_MAX_WORKERS)
+		n = VHOST_MAX_WORKERS;
+
+	for (i = 0; i < n - dev->nworkers; i++) {
+		ret = vhost_add_worker(dev);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static void vhost_assign_workers(struct vhost_dev *dev)
+{
+	int i, j = 0;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		dev->vqs[i]->worker = &dev->workers[j];
+		if (++j == dev->nworkers)
+			j = 0;
+	}
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-	struct task_struct *worker;
 	int err;
 
 	/* Is there an owner already? */
-	if (vhost_dev_has_owner(dev)) {
-		err = -EBUSY;
-		goto err_mm;
-	}
+	if (vhost_dev_has_owner(dev))
+		return -EBUSY;
 
 	vhost_attach_mm(dev);
 
 	dev->kcov_handle = kcov_common_handle();
 	if (dev->use_worker) {
-		worker = kthread_create(vhost_worker, dev,
-					"vhost-%d", current->pid);
-		if (IS_ERR(worker)) {
-			err = PTR_ERR(worker);
-			goto err_worker;
-		}
-
-		dev->worker = worker;
-		wake_up_process(worker); /* avoid contributing to loadavg */
-
-		err = vhost_attach_cgroups(dev);
+		err = vhost_add_worker(dev);
 		if (err)
-			goto err_cgroup;
+			goto err_mm;
 	}
 
 	err = vhost_dev_alloc_iovecs(dev);
 	if (err)
-		goto err_cgroup;
+		goto err_worker;
 
+	vhost_assign_workers(dev);
 	return 0;
-err_cgroup:
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
-	}
 err_worker:
+	vhost_cleanup_workers(dev);
+err_mm:
 	vhost_detach_mm(dev);
 	dev->kcov_handle = 0;
-err_mm:
 	return err;
 }
 EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
@@ -664,11 +778,11 @@ void vhost_dev_stop(struct vhost_dev *dev)
 	int i;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
+		if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick)
 			vhost_poll_stop(&dev->vqs[i]->poll);
-			vhost_poll_flush(&dev->vqs[i]->poll);
-		}
 	}
+
+	vhost_dev_flush(dev);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_stop);
 
@@ -707,6 +821,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 		vhost_xen_unmap_desc_all(dev->vqs[i]);
 #endif
 	}
+
 	vhost_dev_free_iovecs(dev);
 	if (dev->log_ctx)
 		eventfd_ctx_put(dev->log_ctx);
@@ -718,10 +833,8 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	dev->iotlb = NULL;
 	vhost_clear_msg(dev);
 	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
-	WARN_ON(!llist_empty(&dev->work_list));
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
+	if (dev->use_worker) {
+		vhost_cleanup_workers(dev);
 		dev->kcov_handle = 0;
 	}
 	vhost_detach_mm(dev);
@@ -1720,7 +1833,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
 	mutex_unlock(&vq->mutex);
 
 	if (pollstop && vq->handle_kick)
-		vhost_poll_flush(&vq->poll);
+		vhost_dev_flush(d);
 	return r;
 }
 EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
@@ -1758,7 +1871,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 	struct eventfd_ctx *ctx;
 	u64 p;
 	long r;
-	int i, fd;
+	int i, fd, n;
 
 	/* If you are not the owner, you can become one */
 	if (ioctl == VHOST_SET_OWNER) {
@@ -1815,6 +1928,18 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 		if (ctx)
 			eventfd_ctx_put(ctx);
 		break;
+	case VHOST_SET_NWORKERS:
+		r = get_user(n, (int __user *)argp);
+		if (r < 0)
+			break;
+		if (n < d->nworkers) {
+			r = -EINVAL;
+			break;
+		}
+
+		r = vhost_set_workers(d, n);
+		vhost_assign_workers(d);
+		break;
 	default:
 		r = -ENOIOCTLCMD;
 		break;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8396e54ce1ce5e..4ee85a9ce4732c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -25,6 +25,13 @@ struct vhost_work {
 	unsigned long		  flags;
 };
 
+#define VHOST_MAX_WORKERS 4
+struct vhost_worker {
+	struct task_struct	*worker;
+	struct llist_head	 work_list;
+	struct vhost_dev	*dev;
+};
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
@@ -33,7 +40,7 @@ struct vhost_poll {
 	wait_queue_entry_t	  wait;
 	struct vhost_work	  work;
 	__poll_t		  mask;
-	struct vhost_dev	 *dev;
+	struct vhost_virtqueue	 *vq;
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
@@ -42,11 +49,12 @@ bool vhost_has_work(struct vhost_dev *dev);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 		     __poll_t mask, struct vhost_dev *dev);
+void vhost_poll_init_vq(struct vhost_poll *poll, vhost_work_fn_t fn,
+			__poll_t mask, struct vhost_virtqueue *vq);
 int vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
-void vhost_poll_flush(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
-void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work);
+void vhost_dev_flush(struct vhost_dev *dev);
 long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
 
 struct vhost_log {
@@ -132,6 +140,7 @@ struct vhost_virtqueue {
 	bool user_be;
 #endif
 	u32 busyloop_timeout;
+	struct vhost_worker *worker;
 
 #ifdef CONFIG_VHOST_XEN
 	/*
@@ -143,6 +152,11 @@ struct vhost_virtqueue {
 #endif
 };
 
+/* Queue work on the worker assigned to the virtqueue */
+void vhost_work_vqueue(struct vhost_virtqueue *vq, struct vhost_work *work);
+/* Flush the worker assigned to the virtqueue */
+void vhost_work_flush_vq(struct vhost_virtqueue *vq);
+
 struct vhost_msg_node {
 	union {
 		struct vhost_msg msg;
@@ -159,7 +173,8 @@ struct vhost_dev {
 	int nvqs;
 	struct eventfd_ctx *log_ctx;
 	struct llist_head work_list;
-	struct task_struct *worker;
+	struct vhost_worker workers[VHOST_MAX_WORKERS];
+	int nworkers;
 	struct vhost_iotlb *umem;
 	struct vhost_iotlb *iotlb;
 	spinlock_t iotlb_lock;
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 9885cab70ea59b..8c2569743f6c9a 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -684,12 +684,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
 
 static void vhost_vsock_flush(struct vhost_vsock *vsock)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++)
-		if (vsock->vqs[i].handle_kick)
-			vhost_poll_flush(&vsock->vqs[i].poll);
-	vhost_work_flush(&vsock->dev, &vsock->send_pkt_work);
+	vhost_dev_flush(&vsock->dev);
 }
 
 static void vhost_vsock_reset_orphans(struct sock *sk)
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index c998860d7bbc43..d6d87f6315f60a 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -71,6 +71,15 @@
 #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
 #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
 
+/* Set the number of vhost workers.
+ * Currently the number of vhost workers can only be increased.
+ * All workers are freed upon reset.
+ * If the value is too big, it is silently truncated to the maximum number of
+ * supported vhost workers.
+ * Even if an error is returned, some workers may already have been created.
+ */
+#define VHOST_SET_NWORKERS _IOW(VHOST_VIRTIO, 0x1F, int)
+
 /* The following ioctls use eventfd file descriptors to signal and poll
  * for events. */
@@ -150,4 +159,9 @@
 /* Get the valid iova range */
 #define VHOST_VDPA_GET_IOVA_RANGE	_IOR(VHOST_VIRTIO, 0x78, \
 					     struct vhost_vdpa_iova_range)
+
+/* VHOST_BLK specific defines */
+#define VHOST_BLK_SET_BACKEND	_IOW(VHOST_VIRTIO, 0xFF, \
+				     struct vhost_vring_file)
+
 #endif
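
For reviewers, a minimal userspace sketch of the intended call sequence (not part
of the patch): the "/dev/vhost-blk" node comes from the miscdevice registration
above, VHOST_SET_NWORKERS and VHOST_BLK_SET_BACKEND are the ioctls added by this
series, and everything else is the stock vhost uapi. The backing device path,
queue size, worker count, and error handling are placeholders, and the
memory-table/vring plumbing is elided:

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	/* Hypothetical helper; assumes headers with the uapi additions above. */
	int vhost_blk_start(const char *bdev_path)
	{
		int hfd = open("/dev/vhost-blk", O_RDWR);
		int bfd = open(bdev_path, O_RDWR);
		struct vhost_vring_state num = { .index = 0, .num = 256 };
		struct vhost_vring_file backend = { .index = 0, .fd = bfd };
		uint64_t features;
		int nworkers = 2;

		ioctl(hfd, VHOST_SET_OWNER, NULL);

		/* Accept whatever the device offers, minus dirty logging. */
		ioctl(hfd, VHOST_GET_FEATURES, &features);
		features &= ~(1ULL << VHOST_F_LOG_ALL);
		ioctl(hfd, VHOST_SET_FEATURES, &features);

		/* Spread the virtqueues over two worker threads. */
		ioctl(hfd, VHOST_SET_NWORKERS, &nworkers);

		/* VHOST_SET_VRING_NUM also triggers vhost_blk_setup(), which
		 * allocates the per-virtqueue request array. */
		ioctl(hfd, VHOST_SET_VRING_NUM, &num);

		/* ... VHOST_SET_MEM_TABLE and the usual
		 * VHOST_SET_VRING_ADDR/BASE/KICK/CALL setup go here ... */

		ioctl(hfd, VHOST_BLK_SET_BACKEND, &backend);
		return hfd;
	}

Note the ordering constraints: VHOST_SET_OWNER must come first (VHOST_SET_NWORKERS
is rejected without an owner), and VHOST_BLK_SET_BACKEND must come last, since
vhost_blk_set_backend() validates ring access and runs vhost_vq_init_access() for
every virtqueue.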