[PATCH 16/24] vhost: vhost net support
by Michael S. Tsirkin
This adds vhost net support in qemu. Will be tied to tap device and
virtio by following patches. Raw backend is currently missing, will be
worked on/submitted separately.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
Makefile.target | 1 +
hw/vhost.c | 603 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
hw/vhost.h | 44 ++++
hw/vhost_net.c | 147 ++++++++++++++
hw/vhost_net.h | 20 ++
5 files changed, 815 insertions(+), 0 deletions(-)
create mode 100644 hw/vhost.c
create mode 100644 hw/vhost.h
create mode 100644 hw/vhost_net.c
create mode 100644 hw/vhost_net.h
diff --git a/Makefile.target b/Makefile.target
index 0c844a9..2ebd30c 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -168,6 +168,7 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o
# need to fix this properly
obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o virtio-pci.o
obj-y += notifier.o
+obj-y += vhost_net.o vhost.o
obj-$(CONFIG_KVM) += kvm.o kvm-all.o
# MSI-X depends on kvm for interrupt injection,
# so moved it from Makefile.hw to Makefile.target for now
diff --git a/hw/vhost.c b/hw/vhost.c
new file mode 100644
index 0000000..e5c1ead
--- /dev/null
+++ b/hw/vhost.c
@@ -0,0 +1,603 @@
+#include "linux/vhost.h"
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include "vhost.h"
+#include "hw/hw.h"
+/* For range_get_last */
+#include "pci.h"
+
+static void vhost_dev_sync_region(struct vhost_dev *dev,
+ uint64_t mfirst, uint64_t mlast,
+ uint64_t rfirst, uint64_t rlast)
+{
+ uint64_t start = MAX(mfirst, rfirst);
+ uint64_t end = MIN(mlast, rlast);
+ vhost_log_chunk_t *from = dev->log + start / VHOST_LOG_CHUNK;
+ vhost_log_chunk_t *to = dev->log + end / VHOST_LOG_CHUNK + 1;
+ uint64_t addr = (start / VHOST_LOG_CHUNK) * VHOST_LOG_CHUNK;
+
+ assert(end / VHOST_LOG_CHUNK < dev->log_size);
+ assert(start / VHOST_LOG_CHUNK < dev->log_size);
+ if (end < start) {
+ return;
+ }
+ for (;from < to; ++from) {
+ vhost_log_chunk_t log;
+ int bit;
+ /* We first check with non-atomic: much cheaper,
+ * and we expect non-dirty to be the common case. */
+ if (!*from) {
+ continue;
+ }
+ /* Data must be read atomically. We don't really
+ * need the barrier semantics of __sync
+ * builtins, but it's easier to use them than
+ * roll our own. */
+ log = __sync_fetch_and_and(from, 0);
+ while ((bit = sizeof(log) > sizeof(int) ?
+ ffsll(log) : ffs(log))) {
+ bit -= 1;
+ cpu_physical_memory_set_dirty(addr + bit * VHOST_LOG_PAGE);
+ log &= ~(0x1ull << bit);
+ }
+ addr += VHOST_LOG_CHUNK;
+ }
+}
+
+static int vhost_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
+ target_phys_addr_t start_addr,
+ target_phys_addr_t end_addr)
+{
+ struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
+ int i;
+ if (!dev->log_enabled || !dev->started) {
+ return 0;
+ }
+ for (i = 0; i < dev->mem->nregions; ++i) {
+ struct vhost_memory_region *reg = dev->mem->regions + i;
+ vhost_dev_sync_region(dev, start_addr, end_addr,
+ reg->guest_phys_addr,
+ range_get_last(reg->guest_phys_addr,
+ reg->memory_size));
+ }
+ for (i = 0; i < dev->nvqs; ++i) {
+ struct vhost_virtqueue *vq = dev->vqs + i;
+ unsigned size = sizeof(struct vring_used_elem) * vq->num;
+ vhost_dev_sync_region(dev, start_addr, end_addr, vq->used_phys,
+ range_get_last(vq->used_phys, size));
+ }
+ return 0;
+}
+
+/* Assign/unassign. Keep an unsorted array of non-overlapping
+ * memory regions in dev->mem. */
+static void vhost_dev_unassign_memory(struct vhost_dev *dev,
+ uint64_t start_addr,
+ uint64_t size)
+{
+ int from, to, n = dev->mem->nregions;
+ /* Track overlapping/split regions for sanity checking. */
+ int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
+
+ for (from = 0, to = 0; from < n; ++from, ++to) {
+ struct vhost_memory_region *reg = dev->mem->regions + to;
+ uint64_t reglast;
+ uint64_t memlast;
+ uint64_t change;
+
+ /* clone old region */
+ if (to != from) {
+ memcpy(reg, dev->mem->regions + from, sizeof *reg);
+ }
+
+ /* No overlap is simple */
+ if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
+ start_addr, size)) {
+ continue;
+ }
+
+ /* Split only happens if supplied region
+ * is in the middle of an existing one. Thus it can not
+ * overlap with any other existing region. */
+ assert(!split);
+
+ reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
+ memlast = range_get_last(start_addr, size);
+
+ /* Remove whole region */
+ if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
+ --dev->mem->nregions;
+ --to;
+ assert(to >= 0);
+ ++overlap_middle;
+ continue;
+ }
+
+ /* Shrink region */
+ if (memlast >= reglast) {
+ reg->memory_size = start_addr - reg->guest_phys_addr;
+ assert(reg->memory_size);
+ assert(!overlap_end);
+ ++overlap_end;
+ continue;
+ }
+
+ /* Shift region */
+ if (start_addr <= reg->guest_phys_addr) {
+ change = memlast + 1 - reg->guest_phys_addr;
+ reg->memory_size -= change;
+ reg->guest_phys_addr += change;
+ reg->userspace_addr += change;
+ assert(reg->memory_size);
+ assert(!overlap_start);
+ ++overlap_start;
+ continue;
+ }
+
+ /* This only happens if supplied region
+ * is in the middle of an existing one. Thus it can not
+ * overlap with any other existing region. */
+ assert(!overlap_start);
+ assert(!overlap_end);
+ assert(!overlap_middle);
+ /* Split region: shrink first part, shift second part. */
+ memcpy(dev->mem->regions + n, reg, sizeof *reg);
+ reg->memory_size = start_addr - reg->guest_phys_addr;
+ assert(reg->memory_size);
+ change = memlast + 1 - reg->guest_phys_addr;
+ reg = dev->mem->regions + n;
+ reg->memory_size -= change;
+ assert(reg->memory_size);
+ reg->guest_phys_addr += change;
+ reg->userspace_addr += change;
+ /* Never add more than 1 region */
+ assert(dev->mem->nregions == n);
+ ++dev->mem->nregions;
+ ++split;
+ }
+}
+
+/* Called after unassign, so no regions overlap the given range. */
+static void vhost_dev_assign_memory(struct vhost_dev *dev,
+ uint64_t start_addr,
+ uint64_t size,
+ uint64_t uaddr)
+{
+ int from, to;
+ struct vhost_memory_region *merged = NULL;
+ for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
+ struct vhost_memory_region *reg = dev->mem->regions + to;
+ uint64_t prlast, urlast;
+ uint64_t pmlast, umlast;
+ uint64_t s, e, u;
+
+ /* clone old region */
+ if (to != from) {
+ memcpy(reg, dev->mem->regions + from, sizeof *reg);
+ }
+ prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
+ pmlast = range_get_last(start_addr, size);
+ urlast = range_get_last(reg->userspace_addr, reg->memory_size);
+ umlast = range_get_last(uaddr, size);
+
+ /* check for overlapping regions: should never happen. */
+ assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
+ /* Not an adjacent or overlapping region - do not merge. */
+ if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
+ (pmlast + 1 != reg->guest_phys_addr ||
+ umlast + 1 != reg->userspace_addr)) {
+ continue;
+ }
+
+ if (merged) {
+ --to;
+ assert(to >= 0);
+ } else {
+ merged = reg;
+ }
+ u = MIN(uaddr, reg->userspace_addr);
+ s = MIN(start_addr, reg->guest_phys_addr);
+ e = MAX(pmlast, prlast);
+ uaddr = merged->userspace_addr = u;
+ start_addr = merged->guest_phys_addr = s;
+ size = merged->memory_size = e - s + 1;
+ assert(merged->memory_size);
+ }
+
+ if (!merged) {
+ struct vhost_memory_region *reg = dev->mem->regions + to;
+ memset(reg, 0, sizeof *reg);
+ reg->memory_size = size;
+ assert(reg->memory_size);
+ reg->guest_phys_addr = start_addr;
+ reg->userspace_addr = uaddr;
+ ++to;
+ }
+ assert(to <= dev->mem->nregions + 1);
+ dev->mem->nregions = to;
+}
+
+static uint64_t vhost_get_log_size(struct vhost_dev *dev)
+{
+ uint64_t log_size = 0;
+ int i;
+ for (i = 0; i < dev->mem->nregions; ++i) {
+ struct vhost_memory_region *reg = dev->mem->regions + i;
+ uint64_t last = range_get_last(reg->guest_phys_addr,
+ reg->memory_size);
+ log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
+ }
+ for (i = 0; i < dev->nvqs; ++i) {
+ struct vhost_virtqueue *vq = dev->vqs + i;
+ uint64_t last = vq->used_phys +
+ sizeof(struct vring_used_elem) * vq->num - 1;
+ log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
+ }
+ return log_size;
+}
+
+static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size)
+{
+ vhost_log_chunk_t *log;
+ int r;
+ if (size) {
+ log = qemu_mallocz(size * sizeof *log);
+ } else {
+ log = NULL;
+ }
+ r = ioctl(dev->control, VHOST_SET_LOG_BASE,
+ (uint64_t)(unsigned long)log);
+ assert(r >= 0);
+ vhost_client_sync_dirty_bitmap(&dev->client, 0,
+ (target_phys_addr_t)~0x0ull);
+ if (dev->log) {
+ qemu_free(dev->log);
+ }
+ dev->log = log;
+ dev->log_size = size;
+}
+
+static void vhost_client_set_memory(CPUPhysMemoryClient *client,
+ target_phys_addr_t start_addr,
+ ram_addr_t size,
+ ram_addr_t phys_offset)
+{
+ struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
+ ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
+ int s = offsetof(struct vhost_memory, regions) +
+ (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
+ uint64_t log_size;
+ int r;
+ dev->mem = qemu_realloc(dev->mem, s);
+
+ assert(size);
+
+ vhost_dev_unassign_memory(dev, start_addr, size);
+ if (flags == IO_MEM_RAM) {
+ /* Add given mapping, merging adjacent regions if any */
+ vhost_dev_assign_memory(dev, start_addr, size,
+ (uintptr_t)qemu_get_ram_ptr(phys_offset));
+ } else {
+ /* Remove old mapping for this memory, if any. */
+ vhost_dev_unassign_memory(dev, start_addr, size);
+ }
+
+ if (!dev->started) {
+ return;
+ }
+ if (!dev->log_enabled) {
+ r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
+ assert(r >= 0);
+ return;
+ }
+ log_size = vhost_get_log_size(dev);
+ /* We allocate an extra 4K bytes to log,
+ * to reduce the * number of reallocations. */
+#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
+ /* To log more, must increase log size before table update. */
+ if (dev->log_size < log_size) {
+ vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
+ }
+ r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem);
+ assert(r >= 0);
+ /* To log less, can only decrease log size after table update. */
+ if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
+ vhost_dev_log_resize(dev, log_size);
+ }
+}
+
+static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
+{
+ uint64_t features = dev->acked_features;
+ int r;
+ if (dev->log_enabled) {
+ features |= 0x1 << VHOST_F_LOG_ALL;
+ }
+ r = ioctl(dev->control, VHOST_SET_FEATURES, &features);
+ return r < 0 ? -errno : 0;
+}
+
+static int vhost_client_migration_log(struct CPUPhysMemoryClient *client,
+ int enable)
+{
+ struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
+ int r;
+ if (!!enable == dev->log_enabled) {
+ return 0;
+ }
+ if (!dev->started) {
+ dev->log_enabled = enable;
+ return 0;
+ }
+ if (!enable) {
+ r = vhost_dev_set_log(dev, false);
+ if (r < 0) {
+ return r;
+ }
+ if (dev->log) {
+ qemu_free(dev->log);
+ }
+ dev->log = NULL;
+ dev->log_size = 0;
+ } else {
+ vhost_dev_log_resize(dev, vhost_get_log_size(dev));
+ r = vhost_dev_set_log(dev, false);
+ if (r < 0) {
+ return r;
+ }
+ }
+ dev->log_enabled = enable;
+ return 0;
+}
+
+static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq,
+ unsigned idx, bool enable_log)
+{
+ struct vhost_vring_addr addr = {
+ .index = idx,
+ .desc_user_addr = (u_int64_t)(unsigned long)vq->desc,
+ .avail_user_addr = (u_int64_t)(unsigned long)vq->avail,
+ .used_user_addr = (u_int64_t)(unsigned long)vq->used,
+ .log_guest_addr = vq->used_phys,
+ .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
+ };
+ int r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr);
+ if (r < 0) {
+ return -errno;
+ }
+ return 0;
+}
+
+static int vhost_virtqueue_init(struct vhost_dev *dev,
+ struct VirtIODevice *vdev,
+ struct vhost_virtqueue *vq,
+ unsigned idx)
+{
+ target_phys_addr_t s, l, a;
+ int r;
+ struct vhost_vring_file file = {
+ .index = idx,
+ };
+ struct vhost_vring_state state = {
+ .index = idx,
+ };
+ struct VirtQueue *q = virtio_queue(vdev, idx);
+
+ vq->num = state.num = virtio_queue_get_num(vdev, idx);
+ r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state);
+ if (r) {
+ return -errno;
+ }
+
+ state.num = virtio_queue_last_avail_idx(vdev, idx);
+ r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state);
+ if (r) {
+ return -errno;
+ }
+
+ s = l = sizeof(struct vring_desc) * vq->num;
+ a = virtio_queue_get_desc(vdev, idx);
+ vq->desc = cpu_physical_memory_map(a, &l, 0);
+ if (!vq->desc || l != s) {
+ r = -ENOMEM;
+ goto fail_alloc;
+ }
+ s = l = offsetof(struct vring_avail, ring) +
+ sizeof(u_int64_t) * vq->num;
+ a = virtio_queue_get_avail(vdev, idx);
+ vq->avail = cpu_physical_memory_map(a, &l, 0);
+ if (!vq->avail || l != s) {
+ r = -ENOMEM;
+ goto fail_alloc;
+ }
+ s = l = offsetof(struct vring_used, ring) +
+ sizeof(struct vring_used_elem) * vq->num;
+ vq->used_phys = a = virtio_queue_get_used(vdev, idx);
+ vq->used = cpu_physical_memory_map(a, &l, 1);
+ if (!vq->used || l != s) {
+ r = -ENOMEM;
+ goto fail_alloc;
+ }
+
+ r = vhost_virtqueue_set_addr(dev, vq, idx, dev->log_enabled);
+ if (r < 0) {
+ r = -errno;
+ goto fail_alloc;
+ }
+ if (!vdev->binding->guest_notifier || !vdev->binding->host_notifier) {
+ fprintf(stderr, "binding does not support irqfd/queuefd\n");
+ r = -ENOSYS;
+ goto fail_alloc;
+ }
+ r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, true);
+ if (r < 0) {
+ fprintf(stderr, "Error binding guest notifier: %d\n", -r);
+ goto fail_guest_notifier;
+ }
+
+ r = vdev->binding->host_notifier(vdev->binding_opaque, idx, true);
+ if (r < 0) {
+ fprintf(stderr, "Error binding host notifier: %d\n", -r);
+ goto fail_host_notifier;
+ }
+
+ file.fd = event_notifier_get_fd(virtio_queue_host_notifier(q));
+ r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
+ if (r) {
+ goto fail_kick;
+ }
+
+ file.fd = event_notifier_get_fd(virtio_queue_guest_notifier(q));
+ r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
+ if (r) {
+ goto fail_call;
+ }
+
+ return 0;
+
+fail_call:
+fail_kick:
+ vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
+fail_host_notifier:
+ vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
+fail_guest_notifier:
+fail_alloc:
+ return r;
+}
+
+static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
+ struct VirtIODevice *vdev,
+ struct vhost_virtqueue *vq,
+ unsigned idx)
+{
+ struct vhost_vring_state state = {
+ .index = idx,
+ };
+ int r;
+ r = vdev->binding->guest_notifier(vdev->binding_opaque, idx, false);
+ if (r < 0) {
+ fprintf(stderr, "vhost VQ %d guest cleanup failed: %d\n", idx, r);
+ fflush(stderr);
+ }
+ assert (r >= 0);
+
+ r = vdev->binding->host_notifier(vdev->binding_opaque, idx, false);
+ if (r < 0) {
+ fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r);
+ fflush(stderr);
+ }
+ assert (r >= 0);
+ r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state);
+ if (r < 0) {
+ fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
+ fflush(stderr);
+ }
+ virtio_queue_set_last_avail_idx(vdev, idx, state.num);
+ assert (r >= 0);
+}
+
+int vhost_dev_init(struct vhost_dev *hdev, int devfd)
+{
+ uint64_t features;
+ int r;
+ if (devfd >= 0) {
+ hdev->control = devfd;
+ } else {
+ hdev->control = open("/dev/vhost-net", O_RDWR);
+ if (hdev->control < 0)
+ return -errno;
+ }
+ r = ioctl(hdev->control, VHOST_SET_OWNER, NULL);
+ if (r < 0)
+ goto fail;
+
+ r = ioctl(hdev->control, VHOST_GET_FEATURES, &features);
+ if (r < 0)
+ goto fail;
+ hdev->features = features;
+
+ hdev->client.set_memory = vhost_client_set_memory;
+ hdev->client.sync_dirty_bitmap = vhost_client_sync_dirty_bitmap;
+ hdev->client.migration_log = vhost_client_migration_log;
+ hdev->mem = qemu_mallocz(offsetof(struct vhost_memory, regions));
+ hdev->log = NULL;
+ hdev->log_size = 0;
+ hdev->log_enabled = false;
+ hdev->started = false;
+ cpu_register_phys_memory_client(&hdev->client);
+ return 0;
+fail:
+ r = -errno;
+ close(hdev->control);
+ return r;
+}
+
+void vhost_dev_cleanup(struct vhost_dev *hdev)
+{
+ cpu_unregister_phys_memory_client(&hdev->client);
+ qemu_free(hdev->mem);
+ close(hdev->control);
+}
+
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+ int i, r;
+
+ r = vhost_dev_set_log(hdev, hdev->log_enabled);
+ if (r < 0)
+ goto fail;
+ r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem);
+ if (r < 0) {
+ r = -errno;
+ goto fail;
+ }
+ if (hdev->log_enabled) {
+ hdev->log_size = vhost_get_log_size(hdev);
+ hdev->log = hdev->log_size ?
+ qemu_mallocz(hdev->log_size * sizeof *hdev->log) : NULL;
+ r = ioctl(hdev->control, VHOST_SET_LOG_BASE,
+ (uint64_t)(unsigned long)hdev->log);
+ if (r < 0) {
+ r = -errno;
+ goto fail;
+ }
+ }
+
+ for (i = 0; i < hdev->nvqs; ++i) {
+ r = vhost_virtqueue_init(hdev,
+ vdev,
+ hdev->vqs + i,
+ i);
+ if (r < 0)
+ goto fail_vq;
+ }
+ hdev->started = true;
+
+ return 0;
+fail_vq:
+ while (--i >= 0) {
+ vhost_virtqueue_cleanup(hdev,
+ vdev,
+ hdev->vqs + i,
+ i);
+ }
+fail:
+ return r;
+}
+
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+ int i;
+ for (i = 0; i < hdev->nvqs; ++i) {
+ vhost_virtqueue_cleanup(hdev,
+ vdev,
+ hdev->vqs + i,
+ i);
+ }
+ vhost_client_sync_dirty_bitmap(&hdev->client, 0,
+ (target_phys_addr_t)~0x0ull);
+ hdev->started = false;
+ qemu_free(hdev->log);
+ hdev->log_size = 0;
+}
diff --git a/hw/vhost.h b/hw/vhost.h
new file mode 100644
index 0000000..2ed3933
--- /dev/null
+++ b/hw/vhost.h
@@ -0,0 +1,44 @@
+#ifndef VHOST_H
+#define VHOST_H
+
+#include "hw/hw.h"
+#include "hw/virtio.h"
+
+/* Generic structures common for any vhost based device. */
+struct vhost_virtqueue {
+ int kick;
+ int call;
+ void *desc;
+ void *avail;
+ void *used;
+ int num;
+ unsigned long long used_phys;
+};
+
+typedef unsigned long vhost_log_chunk_t;
+#define VHOST_LOG_PAGE 0x1000
+#define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t))
+#define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS)
+
+struct vhost_memory;
+struct vhost_dev {
+ CPUPhysMemoryClient client;
+ int control;
+ struct vhost_memory *mem;
+ struct vhost_virtqueue *vqs;
+ int nvqs;
+ unsigned long long features;
+ unsigned long long acked_features;
+ unsigned long long backend_features;
+ bool started;
+ bool log_enabled;
+ vhost_log_chunk_t *log;
+ unsigned long long log_size;
+};
+
+int vhost_dev_init(struct vhost_dev *hdev, int devfd);
+void vhost_dev_cleanup(struct vhost_dev *hdev);
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
+
+#endif
diff --git a/hw/vhost_net.c b/hw/vhost_net.c
new file mode 100644
index 0000000..c89ff40
--- /dev/null
+++ b/hw/vhost_net.c
@@ -0,0 +1,147 @@
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <linux/kvm.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/vhost.h>
+#include <linux/virtio_ring.h>
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+
+#include "net.h"
+#include "net/tap.h"
+
+#include "virtio-net.h"
+#include "vhost.h"
+#include "vhost_net.h"
+
+struct vhost_net {
+ struct vhost_dev dev;
+ struct vhost_virtqueue vqs[2];
+ int backend;
+ VLANClientState *vc;
+};
+
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
+{
+ /* Clear features not supported by host kernel. */
+ if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)))
+ features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+ if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC)))
+ features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
+ if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
+ features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
+ return features;
+}
+
+void vhost_net_ack_features(struct vhost_net *net, unsigned features)
+{
+ net->dev.acked_features = net->dev.backend_features;
+ if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))
+ net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+ if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC))
+ net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
+}
+
+static int vhost_net_get_fd(VLANClientState *backend)
+{
+ switch (backend->info->type) {
+ case NET_CLIENT_TYPE_TAP:
+ return tap_get_fd(backend);
+ default:
+ fprintf(stderr, "vhost-net requires tap backend\n");
+ return -EBADFD;
+ }
+}
+
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+{
+ int r;
+ struct vhost_net *net = qemu_malloc(sizeof *net);
+ if (!backend) {
+ fprintf(stderr, "vhost-net requires backend to be setup\n");
+ goto fail;
+ }
+ r = vhost_net_get_fd(backend);
+ if (r < 0)
+ goto fail;
+ net->vc = backend;
+ net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 :
+ (1 << VHOST_NET_F_VIRTIO_NET_HDR);
+ net->backend = r;
+
+ r = vhost_dev_init(&net->dev, devfd);
+ if (r < 0)
+ goto fail;
+ if (~net->dev.features & net->dev.backend_features) {
+ fprintf(stderr, "vhost lacks feature mask %llu for backend\n",
+ ~net->dev.features & net->dev.backend_features);
+ vhost_dev_cleanup(&net->dev);
+ goto fail;
+ }
+
+ /* Set sane init value. Override when guest acks. */
+ vhost_net_ack_features(net, 0);
+ return net;
+fail:
+ qemu_free(net);
+ return NULL;
+}
+
+int vhost_net_start(struct vhost_net *net,
+ VirtIODevice *dev)
+{
+ struct vhost_vring_file file = { };
+ int r;
+
+ net->dev.nvqs = 2;
+ net->dev.vqs = net->vqs;
+ r = vhost_dev_start(&net->dev, dev);
+ if (r < 0)
+ return r;
+
+ net->vc->info->poll(net->vc, false);
+ qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
+ file.fd = net->backend;
+ for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
+ r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+ if (r < 0) {
+ r = -errno;
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ file.fd = -1;
+ while (--file.index >= 0) {
+ int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+ assert(r >= 0);
+ }
+ net->vc->info->poll(net->vc, true);
+ vhost_dev_stop(&net->dev, dev);
+ return r;
+}
+
+void vhost_net_stop(struct vhost_net *net,
+ VirtIODevice *dev)
+{
+ struct vhost_vring_file file = { .fd = -1 };
+
+ for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
+ int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+ assert(r >= 0);
+ }
+ net->vc->info->poll(net->vc, true);
+ vhost_dev_stop(&net->dev, dev);
+}
+
+void vhost_net_cleanup(struct vhost_net *net)
+{
+ vhost_dev_cleanup(&net->dev);
+ qemu_free(net);
+}
+/* TODO: log */
diff --git a/hw/vhost_net.h b/hw/vhost_net.h
new file mode 100644
index 0000000..21f0277
--- /dev/null
+++ b/hw/vhost_net.h
@@ -0,0 +1,20 @@
+#ifndef VHOST_NET_H
+#define VHOST_NET_H
+
+#include "net.h"
+
+struct vhost_net;
+
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd);
+
+int vhost_net_start(struct vhost_net *net,
+ VirtIODevice *dev);
+void vhost_net_stop(struct vhost_net *net,
+ VirtIODevice *dev);
+
+void vhost_net_cleanup(struct vhost_net *net);
+
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features);
+void vhost_net_ack_features(struct vhost_net *net, unsigned features);
+
+#endif
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 15/24] tap: add interface to get device fd
by Michael S. Tsirkin
Will be used by vhost to attach/detach to backend.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
net/tap.c | 7 +++++++
net/tap.h | 2 ++
2 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/net/tap.c b/net/tap.c
index d3492de..7e9ca79 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -269,6 +269,13 @@ static void tap_poll(VLANClientState *nc, bool enable)
tap_write_poll(s, enable);
}
+int tap_get_fd(VLANClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+ assert(nc->info->type == NET_CLIENT_TYPE_TAP);
+ return s->fd;
+}
+
/* fd support */
static NetClientInfo net_tap_info = {
diff --git a/net/tap.h b/net/tap.h
index 538a562..a244b28 100644
--- a/net/tap.h
+++ b/net/tap.h
@@ -48,4 +48,6 @@ int tap_probe_vnet_hdr(int fd);
int tap_probe_has_ufo(int fd);
void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
+int tap_get_fd(VLANClientState *vc);
+
#endif /* QEMU_NET_TAP_H */
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 14/24] virtio-pci: fill in notifier support
by Michael S. Tsirkin
Support host/guest notifiers in virtio-pci.
The last one only with kvm, that's okay
because vhost relies on kvm anyway.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
hw/virtio-pci.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 62 insertions(+), 0 deletions(-)
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 05898c8..c454093 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -23,6 +23,7 @@
#include "msix.h"
#include "net.h"
#include "loader.h"
+#include "kvm.h"
/* from Linux's linux/virtio_pci.h */
@@ -394,6 +395,65 @@ static unsigned virtio_pci_get_features(void *opaque)
return proxy->host_features;
}
+static void virtio_pci_guest_notifier_read(void *opaque)
+{
+ VirtQueue *vq = opaque;
+ EventNotifier *n = virtio_queue_guest_notifier(vq);
+ if (event_notifier_test_and_clear(n)) {
+ virtio_irq(vq);
+ }
+}
+
+static int virtio_pci_guest_notifier(void *opaque, int n, bool assign)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtQueue *vq = virtio_queue(proxy->vdev, n);
+ EventNotifier *notifier = virtio_queue_guest_notifier(vq);
+
+ if (assign) {
+ int r = event_notifier_init(notifier, 0);
+ if (r < 0)
+ return r;
+ qemu_set_fd_handler(event_notifier_get_fd(notifier),
+ virtio_pci_guest_notifier_read, NULL, vq);
+ } else {
+ qemu_set_fd_handler(event_notifier_get_fd(notifier),
+ NULL, NULL, vq);
+ event_notifier_cleanup(notifier);
+ }
+
+ return 0;
+}
+
+static int virtio_pci_host_notifier(void *opaque, int n, bool assign)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtQueue *vq = virtio_queue(proxy->vdev, n);
+ EventNotifier *notifier = virtio_queue_host_notifier(vq);
+ int r;
+ if (assign) {
+ r = event_notifier_init(notifier, 1);
+ if (r < 0) {
+ return r;
+ }
+ r = kvm_set_ioeventfd(proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+ n, event_notifier_get_fd(notifier),
+ assign);
+ if (r < 0) {
+ event_notifier_cleanup(notifier);
+ }
+ } else {
+ r = kvm_set_ioeventfd(proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+ n, event_notifier_get_fd(notifier),
+ assign);
+ if (r < 0) {
+ return r;
+ }
+ event_notifier_cleanup(notifier);
+ }
+ return r;
+}
+
static const VirtIOBindings virtio_pci_bindings = {
.notify = virtio_pci_notify,
.save_config = virtio_pci_save_config,
@@ -401,6 +461,8 @@ static const VirtIOBindings virtio_pci_bindings = {
.save_queue = virtio_pci_save_queue,
.load_queue = virtio_pci_load_queue,
.get_features = virtio_pci_get_features,
+ .host_notifier = virtio_pci_host_notifier,
+ .guest_notifier = virtio_pci_guest_notifier,
};
static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 13/24] virtio: move typedef to qemu-common
by Michael S. Tsirkin
make it possible to use type without header include
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
hw/virtio.h | 1 -
qemu-common.h | 1 +
2 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/hw/virtio.h b/hw/virtio.h
index 39d0763..a5bd0ba 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -68,7 +68,6 @@ static inline target_phys_addr_t vring_align(target_phys_addr_t addr,
}
typedef struct VirtQueue VirtQueue;
-typedef struct VirtIODevice VirtIODevice;
#define VIRTQUEUE_MAX_SIZE 1024
diff --git a/qemu-common.h b/qemu-common.h
index cdead98..1a54f9e 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -218,6 +218,7 @@ typedef struct I2SCodec I2SCodec;
typedef struct DeviceState DeviceState;
typedef struct SSIBus SSIBus;
typedef struct EventNotifier EventNotifier;
+typedef struct VirtIODevice VirtIODevice;
/* CPU save/load. */
void cpu_save(QEMUFile *f, void *opaque);
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 12/24] virtio: add status change callback
by Michael S. Tsirkin
vhost net backend needs to be notified when
frontend status changes. Add a callback.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
hw/s390-virtio-bus.c | 3 +++
hw/syborg_virtio.c | 2 ++
hw/virtio-pci.c | 6 ++++++
hw/virtio.h | 1 +
4 files changed, 12 insertions(+), 0 deletions(-)
diff --git a/hw/s390-virtio-bus.c b/hw/s390-virtio-bus.c
index 980e7eb..f45b67d 100644
--- a/hw/s390-virtio-bus.c
+++ b/hw/s390-virtio-bus.c
@@ -243,6 +243,9 @@ void s390_virtio_device_update_status(VirtIOS390Device *dev)
uint32_t features;
vdev->status = ldub_phys(dev->dev_offs + VIRTIO_DEV_OFFS_STATUS);
+ if (vdev->set_status) {
+ vdev->set_status(vdev);
+ }
/* Update guest supported feature bitmap */
diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c
index 65239a0..19f6473 100644
--- a/hw/syborg_virtio.c
+++ b/hw/syborg_virtio.c
@@ -152,6 +152,8 @@ static void syborg_virtio_writel(void *opaque, target_phys_addr_t offset,
vdev->status = value & 0xFF;
if (vdev->status == 0)
virtio_reset(vdev);
+ if (vdev->set_status)
+ vdev->set_status(vdev);
break;
case SYBORG_VIRTIO_INT_ENABLE:
s->int_enable = value;
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 573c98a..05898c8 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -208,6 +208,9 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
virtio_reset(proxy->vdev);
msix_unuse_all_vectors(&proxy->pci_dev);
}
+ if (vdev->set_status) {
+ vdev->set_status(vdev);
+ }
break;
case VIRTIO_MSI_CONFIG_VECTOR:
msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
@@ -375,6 +378,9 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
if (PCI_COMMAND == address) {
if (!(val & PCI_COMMAND_MASTER)) {
proxy->vdev->status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
+ if (proxy->vdev->set_status) {
+ proxy->vdev->set_status(proxy->vdev);
+ }
}
}
diff --git a/hw/virtio.h b/hw/virtio.h
index f140ca3..39d0763 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -114,6 +114,7 @@ struct VirtIODevice
void (*get_config)(VirtIODevice *vdev, uint8_t *config);
void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
void (*reset)(VirtIODevice *vdev);
+ void (*set_status)(VirtIODevice *vdev);
VirtQueue *vq;
const VirtIOBindings *binding;
void *binding_opaque;
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 11/24] virtio: add APIs for queue fields
by Michael S. Tsirkin
vhost needs physical addresses for ring and other queue fields,
so add APIs for these.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
hw/virtio.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
hw/virtio.h | 10 +++++++++-
2 files changed, 56 insertions(+), 5 deletions(-)
diff --git a/hw/virtio.c b/hw/virtio.c
index c2b80aa..b16ee1a 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -73,6 +73,9 @@ struct VirtQueue
int inuse;
uint16_t vector;
void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
+ VirtIODevice *vdev;
+ EventNotifier guest_notifier;
+ EventNotifier host_notifier;
};
#define VIRTIO_PCI_QUEUE_MAX 16
@@ -594,10 +597,10 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
return &vdev->vq[i];
}
-void virtio_irq(VirtIODevice *vdev, VirtQueue *vq)
+void virtio_irq(VirtQueue *vq)
{
- vdev->isr |= 0x01;
- virtio_notify_vector(vdev, vq->vector);
+ vq->vdev->isr |= 0x01;
+ virtio_notify_vector(vq->vdev, vq->vector);
}
void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
@@ -608,7 +611,8 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
(vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
return;
- virtio_irq(vdev, vq);
+ vdev->isr |= 0x01;
+ virtio_notify_vector(vdev, vq->vector);
}
void virtio_notify_config(VirtIODevice *vdev)
@@ -742,3 +746,42 @@ void virtio_bind_device(VirtIODevice *vdev, const VirtIOBindings *binding,
vdev->binding = binding;
vdev->binding_opaque = opaque;
}
+
+target_phys_addr_t virtio_queue_get_desc(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.desc;
+}
+
+target_phys_addr_t virtio_queue_get_avail(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.avail;
+}
+
+target_phys_addr_t virtio_queue_get_used(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.used;
+}
+
+uint16_t virtio_queue_last_avail_idx(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].last_avail_idx;
+}
+
+void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx)
+{
+ vdev->vq[n].last_avail_idx = idx;
+}
+
+VirtQueue *virtio_queue(VirtIODevice *vdev, int n)
+{
+ return vdev->vq + n;
+}
+
+EventNotifier *virtio_queue_guest_notifier(VirtQueue *vq)
+{
+ return &vq->guest_notifier;
+}
+EventNotifier *virtio_queue_host_notifier(VirtQueue *vq)
+{
+ return &vq->host_notifier;
+}
diff --git a/hw/virtio.h b/hw/virtio.h
index 10a0959..f140ca3 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -183,5 +183,13 @@ void virtio_net_exit(VirtIODevice *vdev);
DEFINE_PROP_BIT("indirect_desc", _state, _field, \
VIRTIO_RING_F_INDIRECT_DESC, true)
-void virtio_irq(VirtIODevice *vdev, VirtQueue *vq);
+target_phys_addr_t virtio_queue_get_desc(VirtIODevice *vdev, int n);
+target_phys_addr_t virtio_queue_get_avail(VirtIODevice *vdev, int n);
+target_phys_addr_t virtio_queue_get_used(VirtIODevice *vdev, int n);
+uint16_t virtio_queue_last_avail_idx(VirtIODevice *vdev, int n);
+void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx);
+VirtQueue *virtio_queue(VirtIODevice *vdev, int n);
+EventNotifier *virtio_queue_guest_notifier(VirtQueue *vq);
+EventNotifier *virtio_queue_host_notifier(VirtQueue *vq);
+void virtio_irq(VirtQueue *vq);
#endif
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 10/24] virtio: add notifier support
by Michael S. Tsirkin
Add binding API to set host/guest notifiers.
Will be used by vhost.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
hw/virtio.c | 13 ++++++++++---
hw/virtio.h | 5 ++++-
2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/hw/virtio.c b/hw/virtio.c
index fa7184a..c2b80aa 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -594,6 +594,12 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
return &vdev->vq[i];
}
+void virtio_irq(VirtIODevice *vdev, VirtQueue *vq)
+{
+ vdev->isr |= 0x01;
+ virtio_notify_vector(vdev, vq->vector);
+}
+
void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
{
/* Always notify when queue is empty (when feature acknowledge) */
@@ -602,8 +608,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
(vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
return;
- vdev->isr |= 0x01;
- virtio_notify_vector(vdev, vq->vector);
+ virtio_irq(vdev, vq);
}
void virtio_notify_config(VirtIODevice *vdev)
@@ -716,8 +721,10 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
vdev->queue_sel = 0;
vdev->config_vector = VIRTIO_NO_VECTOR;
vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
- for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++)
+ for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
vdev->vq[i].vector = VIRTIO_NO_VECTOR;
+ vdev->vq[i].vdev = vdev;
+ }
vdev->name = name;
vdev->config_len = config_size;
diff --git a/hw/virtio.h b/hw/virtio.h
index 3994cc9..10a0959 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -18,6 +18,7 @@
#include "net.h"
#include "qdev.h"
#include "sysemu.h"
+#include "notifier.h"
/* from Linux's linux/virtio_config.h */
@@ -88,6 +89,8 @@ typedef struct {
int (*load_config)(void * opaque, QEMUFile *f);
int (*load_queue)(void * opaque, int n, QEMUFile *f);
unsigned (*get_features)(void * opaque);
+ int (*guest_notifier)(void * opaque, int n, bool assigned);
+ int (*host_notifier)(void * opaque, int n, bool assigned);
} VirtIOBindings;
#define VIRTIO_PCI_QUEUE_MAX 16
@@ -180,5 +183,5 @@ void virtio_net_exit(VirtIODevice *vdev);
DEFINE_PROP_BIT("indirect_desc", _state, _field, \
VIRTIO_RING_F_INDIRECT_DESC, true)
-
+void virtio_irq(VirtIODevice *vdev, VirtQueue *vq);
#endif
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 09/24] notifier: event notifier implementation
by Michael S. Tsirkin
event notifiers are slightly generalized eventfd descriptors. Current
implementation depends on eventfd because vhost is the only user, and
vhost depends on eventfd anyway, but a stub is provided for non-eventfd
case.
We'll be able to further generalize this when another user comes along
and we see how to best do this.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
Makefile.target | 1 +
hw/notifier.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
hw/notifier.h | 16 ++++++++++++++++
qemu-common.h | 1 +
4 files changed, 68 insertions(+), 0 deletions(-)
create mode 100644 hw/notifier.c
create mode 100644 hw/notifier.h
diff --git a/Makefile.target b/Makefile.target
index 6037fed..0c844a9 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -167,6 +167,7 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o
# virtio has to be here due to weird dependency between PCI and virtio-net.
# need to fix this properly
obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o virtio-pci.o
+obj-y += notifier.o
obj-$(CONFIG_KVM) += kvm.o kvm-all.o
# MSI-X depends on kvm for interrupt injection,
# so moved it from Makefile.hw to Makefile.target for now
diff --git a/hw/notifier.c b/hw/notifier.c
new file mode 100644
index 0000000..dff38de
--- /dev/null
+++ b/hw/notifier.c
@@ -0,0 +1,50 @@
+#include "hw.h"
+#include "notifier.h"
+#ifdef CONFIG_EVENTFD
+#include <sys/eventfd.h>
+#endif
+
+int event_notifier_init(EventNotifier *e, int active)
+{
+#ifdef CONFIG_EVENTFD
+ int fd = eventfd(!!active, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+ e->fd = fd;
+ return 0;
+#else
+ return -ENOSYS;
+#endif
+}
+
+void event_notifier_cleanup(EventNotifier *e)
+{
+ close(e->fd);
+}
+
+int event_notifier_get_fd(EventNotifier *e)
+{
+ return e->fd;
+}
+
+int event_notifier_test_and_clear(EventNotifier *e)
+{
+ uint64_t value;
+ int r = read(e->fd, &value, sizeof value);
+ return r == sizeof value;
+}
+
+int event_notifier_test(EventNotifier *e)
+{
+ uint64_t value;
+ int r = read(e->fd, &value, sizeof value);
+ if (r == sizeof value) {
+ /* restore previous value. */
+ int s = write(e->fd, &value, sizeof value);
+ /* never blocks because we use EFD_SEMAPHORE.
+ * If we didn't we'd get EAGAIN on overflow
+ * and we'd have to write code to ignore it. */
+ assert(s == sizeof value);
+ }
+ return r == sizeof value;
+}
diff --git a/hw/notifier.h b/hw/notifier.h
new file mode 100644
index 0000000..24117ea
--- /dev/null
+++ b/hw/notifier.h
@@ -0,0 +1,16 @@
+#ifndef QEMU_EVENT_NOTIFIER_H
+#define QEMU_EVENT_NOTIFIER_H
+
+#include "qemu-common.h"
+
+struct EventNotifier {
+ int fd;
+};
+
+int event_notifier_init(EventNotifier *, int active);
+void event_notifier_cleanup(EventNotifier *);
+int event_notifier_get_fd(EventNotifier *);
+int event_notifier_test_and_clear(EventNotifier *);
+int event_notifier_test(EventNotifier *);
+
+#endif
diff --git a/qemu-common.h b/qemu-common.h
index 5fbe0f9..cdead98 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -217,6 +217,7 @@ typedef struct uWireSlave uWireSlave;
typedef struct I2SCodec I2SCodec;
typedef struct DeviceState DeviceState;
typedef struct SSIBus SSIBus;
+typedef struct EventNotifier EventNotifier;
/* CPU save/load. */
void cpu_save(QEMUFile *f, void *opaque);
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 08/24] kvm: add API to set ioeventfd
by Michael S. Tsirkin
This adds API to set ioeventfd to kvm,
as well as stubs for non-eventfd case,
making it possible for users to use this API
without ifdefs.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
kvm-all.c | 20 ++++++++++++++++++++
kvm.h | 16 ++++++++++++++++
2 files changed, 36 insertions(+), 0 deletions(-)
diff --git a/kvm-all.c b/kvm-all.c
index 0423fff..efdf40c 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1102,4 +1102,24 @@ void kvm_remove_all_breakpoints(CPUState *current_env)
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */
+#ifdef KVM_IOEVENTFD
+int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned)
+{
+ struct kvm_ioeventfd kick = {
+ .datamatch = data,
+ .addr = addr,
+ .len = 2,
+ .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
+ .fd = fd,
+ };
+ int r;
+ if (!assigned)
+ kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
+ r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
+ if (r < 0)
+ return r;
+ return 0;
+}
+#endif
+
#include "qemu-kvm.c"
diff --git a/kvm.h b/kvm.h
index 9fa4e25..e98b5c8 100644
--- a/kvm.h
+++ b/kvm.h
@@ -14,6 +14,8 @@
#ifndef QEMU_KVM_H
#define QEMU_KVM_H
+#include <stdbool.h>
+#include <errno.h>
#include "config.h"
#include "qemu-queue.h"
#include "qemu-kvm.h"
@@ -21,6 +23,10 @@
#ifdef KVM_UPSTREAM
#ifdef CONFIG_KVM
+#include <linux/kvm.h>
+#endif
+
+#ifdef CONFIG_KVM
extern int kvm_allowed;
#define kvm_enabled() (kvm_allowed)
@@ -151,4 +157,14 @@ static inline void cpu_synchronize_state(CPUState *env)
#endif
+#if defined(KVM_IOEVENTFD) && defined(CONFIG_KVM)
+int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned);
+#else
+static inline
+int kvm_set_ioeventfd(uint16_t data, uint16_t addr, int fd, bool assigned)
+{
+ return -ENOSYS;
+}
+#endif
+
#endif
--
1.6.6.144.g5c3af
14 years, 2 months
[PATCH 07/24] exec: memory notifiers
by Michael S. Tsirkin
This adds notifiers for phys memory changes: a set of callbacks that
vhost can register and update kernel accordingly. Down the road, kvm
code can be switched to use these as well, instead of calling kvm code
directly from exec.c as is done now.
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
---
cpu-common.h | 19 ++++++++++
exec.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 130 insertions(+), 3 deletions(-)
diff --git a/cpu-common.h b/cpu-common.h
index 5e59564..326513d 100644
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -8,6 +8,7 @@
#endif
#include "bswap.h"
+#include "qemu-queue.h"
/* address in the RAM (different from a physical address) */
typedef unsigned long ram_addr_t;
@@ -62,6 +63,24 @@ void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque));
void cpu_unregister_map_client(void *cookie);
+struct CPUPhysMemoryClient;
+typedef struct CPUPhysMemoryClient CPUPhysMemoryClient;
+struct CPUPhysMemoryClient {
+ void (*set_memory)(struct CPUPhysMemoryClient *client,
+ target_phys_addr_t start_addr,
+ ram_addr_t size,
+ ram_addr_t phys_offset);
+ int (*sync_dirty_bitmap)(struct CPUPhysMemoryClient *client,
+ target_phys_addr_t start_addr,
+ target_phys_addr_t end_addr);
+ int (*migration_log)(struct CPUPhysMemoryClient *client,
+ int enable);
+ QLIST_ENTRY(CPUPhysMemoryClient) list;
+};
+
+void cpu_register_phys_memory_client(CPUPhysMemoryClient *);
+void cpu_unregister_phys_memory_client(CPUPhysMemoryClient *);
+
uint32_t ldub_phys(target_phys_addr_t addr);
uint32_t lduw_phys(target_phys_addr_t addr);
uint32_t ldl_phys(target_phys_addr_t addr);
diff --git a/exec.c b/exec.c
index 8f873ab..cbba15e 100644
--- a/exec.c
+++ b/exec.c
@@ -1640,6 +1640,101 @@ const CPULogItem cpu_log_items[] = {
{ 0, NULL, NULL },
};
+#ifndef CONFIG_USER_ONLY
+static QLIST_HEAD(memory_client_list, CPUPhysMemoryClient) memory_client_list
+ = QLIST_HEAD_INITIALIZER(memory_client_list);
+
+static void cpu_notify_set_memory(target_phys_addr_t start_addr,
+ ram_addr_t size,
+ ram_addr_t phys_offset)
+{
+ CPUPhysMemoryClient *client;
+ QLIST_FOREACH(client, &memory_client_list, list) {
+ client->set_memory(client, start_addr, size, phys_offset);
+ }
+}
+
+static int cpu_notify_sync_dirty_bitmap(target_phys_addr_t start,
+ target_phys_addr_t end)
+{
+ CPUPhysMemoryClient *client;
+ QLIST_FOREACH(client, &memory_client_list, list) {
+ int r = client->sync_dirty_bitmap(client, start, end);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+static int cpu_notify_migration_log(int enable)
+{
+ CPUPhysMemoryClient *client;
+ QLIST_FOREACH(client, &memory_client_list, list) {
+ int r = client->migration_log(client, enable);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map,
+ CPUPhysMemoryClient *client)
+{
+ PhysPageDesc *pd;
+ int l1, l2;
+
+ for (l1 = 0; l1 < L1_SIZE; ++l1) {
+ pd = phys_map[l1];
+ if (!pd) {
+ continue;
+ }
+ for (l2 = 0; l2 < L2_SIZE; ++l2) {
+ if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) {
+ continue;
+ }
+ client->set_memory(client, pd[l2].region_offset,
+ TARGET_PAGE_SIZE, pd[l2].phys_offset);
+ }
+ }
+}
+
+static void phys_page_for_each(CPUPhysMemoryClient *client)
+{
+#if TARGET_PHYS_ADDR_SPACE_BITS > 32
+
+#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS)
+#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
+#endif
+ void **phys_map = (void **)l1_phys_map;
+ int l1;
+ if (!l1_phys_map) {
+ return;
+ }
+ for (l1 = 0; l1 < L1_SIZE; ++l1) {
+ if (phys_map[l1]) {
+ phys_page_for_each_in_l1_map(phys_map[l1], client);
+ }
+ }
+#else
+ if (!l1_phys_map) {
+ return;
+ }
+ phys_page_for_each_in_l1_map(l1_phys_map, client);
+#endif
+}
+
+void cpu_register_phys_memory_client(CPUPhysMemoryClient *client)
+{
+ QLIST_INSERT_HEAD(&memory_client_list, client, list);
+ phys_page_for_each(client);
+}
+
+void cpu_unregister_phys_memory_client(CPUPhysMemoryClient *client)
+{
+ QLIST_REMOVE(client, list);
+}
+#endif
+
static int cmp1(const char *s1, int n, const char *s2)
{
if (strlen(s2) != n)
@@ -1899,10 +1994,16 @@ void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
int cpu_physical_memory_set_dirty_tracking(int enable)
{
+ int ret = 0;
+ in_migration = enable;
if (kvm_enabled()) {
- return kvm_set_migration_log(enable);
+ ret = kvm_set_migration_log(enable);
}
- return 0;
+ if (ret < 0) {
+ return ret;
+ }
+ ret = cpu_notify_migration_log(!!enable);
+ return ret;
}
int cpu_physical_memory_get_dirty_tracking(void)
@@ -1915,8 +2016,13 @@ int cpu_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
{
int ret = 0;
- if (kvm_enabled())
+ if (kvm_enabled()) {
ret = kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
+ }
+ if (ret < 0) {
+ return ret;
+ }
+ ret = cpu_notify_sync_dirty_bitmap(start_addr, end_addr);
return ret;
}
@@ -2331,6 +2437,8 @@ void cpu_register_physical_memory_offset(target_phys_addr_t start_addr,
if (kvm_enabled())
kvm_set_phys_mem(start_addr, size, phys_offset);
+ cpu_notify_set_memory(start_addr, size, phys_offset);
+
if (phys_offset == IO_MEM_UNASSIGNED) {
region_offset = start_addr;
}
--
1.6.6.144.g5c3af
14 years, 2 months