Understanding VHOST kernel implementation

Performance comparison

Before going into the details of the vhost-kernel implementation, let's run a VM with and without vhost network acceleration and compare the results:

QEMU command-line without virtio support (emulated e1000e NIC):

/usr/bin/qemu-system-x86_64 -machine pc-q35-4.2,accel=kvm -netdev tap,fd=33,id=hostnet0 -device e1000e,netdev=hostnet0,id=net0,mac=52:54:00:de:64:76,bus=pci.1,addr=0x0 

Network throughput with virtio disabled:

evg@evg:~$ iperf -s
------------------------------------------------------------
Server listening on TCP port 5001
TCP window size:  128 KByte (default)
------------------------------------------------------------
[  4] local 192.168.122.1 port 5001 connected with 192.168.122.192 port 59196
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-10.0 sec  3.73 GBytes  3.20 Gbits/sec

QEMU command-line with virtio support enabled (but no vhost):

/usr/bin/qemu-system-x86_64 -machine pc-q35-4.2,accel=kvm -netdev tap,fd=33,id=hostnet0 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:de:64:76,bus=pci.1,addr=0x0   

Network throughput with only virtio support enabled:

evg@evg:~$ iperf -s
------------------------------------------------------------
Server listening on TCP port 5001
TCP window size:  128 KByte (default)
------------------------------------------------------------
[  4] local 192.168.122.1 port 5001 connected with 192.168.122.192 port 37898
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-10.0 sec  27.2 GBytes  23.3 Gbits/sec

QEMU command-line with virtio and vhost network acceleration enabled:

/usr/bin/qemu-system-x86_64 -machine pc-q35-4.2,accel=kvm -netdev tap,fd=33,id=hostnet0,vhost=on,vhostfd=34 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:de:64:76,bus=pci.1,addr=0x0   

Network throughput with vhost network acceleration enabled:

evg@evg:~$ iperf -s
------------------------------------------------------------
Server listening on TCP port 5001
TCP window size:  128 KByte (default)
------------------------------------------------------------
[  4] local 192.168.122.1 port 5001 connected with 192.168.122.192 port 49158
[ ID] Interval       Transfer     Bandwidth
[  4]  0.0-10.0 sec  30.7 GBytes  26.4 Gbits/sec
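
For reference, each of the results above was taken with iperf running as a server on the host and an iperf client inside the guest; the client invocation is not shown in the logs, but it amounts to something like the following (assuming the guest reaches the host at 192.168.122.1):

iperf -c 192.168.122.1 -t 10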

QEMU & TAP initialization

We know that QEMU uses a TAP device to send and receive Ethernet packets. Let's have a look at how the tun/tap network device is initialized in QEMU:

qemu-5.0.0/net/net.c:        
static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
    const Netdev *netdev,
    const char *name,
    NetClientState *peer, Error **errp) = {
    ...
    [NET_CLIENT_DRIVER_TAP]       = net_init_tap,
    ...
}

A short overview of net_init_tap():

qemu-5.0.0/net/tap.c:
int net_init_tap(const Netdev *netdev, const char *name,
                 NetClientState *peer, Error **errp)
{
     ...
     for (i = 0; i < queues; i++) {
         fd = net_tap_init(tap, &vnet_hdr, i >= 1 ? "no" : script,
                              ifname, sizeof ifname, queues > 1, errp);
         ...
         net_init_tap_one(tap, peer, "tap", name, ifname,
                             i >= 1 ? "no" : script,
                             i >= 1 ? "no" : downscript,
                             vhostfdname, vnet_hdr, fd, &err);
         ...
     }
     ...
}     

For every queue net_init_tap() calls net_tap_init():

qemu-5.0.0/net/tap.c:
static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr,
                        const char *setup_script, char *ifname,
                        size_t ifname_sz, int mq_required, Error **errp)
{
     ...
     TFR(fd = tap_open(ifname, ifname_sz, vnet_hdr, vnet_hdr_required,
                      mq_required, errp));
     ...
}

net_tap_init() calls tap_open(), which operates on /dev/net/tun:

qemu-5.0.0/net/tap-linux.c:
#define PATH_NET_TUN "/dev/net/tun"

int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
             int vnet_hdr_required, int mq_required, Error **errp)
{
    ...
    TFR(fd = open(PATH_NET_TUN, O_RDWR));
    ...
}
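
For the curious, here is a minimal stand-alone sketch (not QEMU code) of what tap_open() essentially does: open /dev/net/tun and configure the interface via the TUNSETIFF ioctl. The IFF_VNET_HDR flag is what later allows virtio/vhost to prepend a struct virtio_net_hdr to each frame; the function name and the reduced error handling are illustrative only, and the caller needs CAP_NET_ADMIN:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

static int open_tap(const char *name)
{
    struct ifreq ifr;
    int fd = open("/dev/net/tun", O_RDWR);

    if (fd < 0)
        return -1;

    memset(&ifr, 0, sizeof(ifr));
    /* IFF_TAP: Ethernet frames, IFF_NO_PI: no packet-info header,
     * IFF_VNET_HDR: prepend struct virtio_net_hdr to each frame */
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
    strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

    if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
        close(fd);
        return -1;
    }
    return fd; /* read()/write() on fd now carry Ethernet frames */
}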

QEMU vhost initialization and signalling

Let's go back to net_init_tap(): after the TAP device is initialized, it calls net_init_tap_one():

qemu-5.0.0/net/tap.c:
static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
                             const char *model, const char *name,
                             const char *ifname, const char *script,
                             const char *downscript, const char *vhostfdname,
                             int vnet_hdr, int fd, Error **errp)
{
     ...
     vhostfd = open("/dev/vhost-net", O_RDWR);
     ...
     s->vhost_net = vhost_net_init(&options);
     ...
}

net_init_tap_one() continues setting up the communication channel. If the backend is in the kernel (vhost-kernel enabled), it opens the /dev/vhost-net device and then calls vhost_net_init(), which in turn calls vhost_dev_init():

qemu-5.0.0/hw/net/vhost_net.c:
struct vhost_net *vhost_net_init(VhostNetOptions *options)
{
     ...
     r = vhost_dev_init(&net->dev, options->opaque,
                       options->backend_type, options->busyloop_timeout);
     ...
}
qemu-5.0.0/hw/virtio/vhost.c:
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                   VhostBackendType backend_type, uint32_t busyloop_timeout)
{   
     ...
     for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
        ...
     }
     ...
}

vhost_dev_init() initializes the virtqueues via vhost_virtqueue_init():

qemu-5.0.0/hw/virtio/vhost.c:
static int vhost_virtqueue_init(struct vhost_dev *dev,
                                struct vhost_virtqueue *vq, int n)
{
    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
    struct vhost_vring_file file = {
        .index = vhost_vq_index,
    };
    int r = event_notifier_init(&vq->masked_notifier, 0);
    ...

    file.fd = event_notifier_get_fd(&vq->masked_notifier);
    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
    ...
}

vhost_virtqueue_init() sets up an event notifier via event_notifier_init():

qemu-5.0.0/util/event_notifier-posix.c:
int event_notifier_init(EventNotifier *e, int active)
{
    ...    
#ifdef CONFIG_EVENTFD
    ret = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
    ...
}

event_notifier_init() simply creates an eventfd. Afterwards, vhost_virtqueue_init() registers the newly created eventfd with the vhost driver via vhost_kernel_set_vring_call():

qemu-5.0.0/hw/virtio/vhost-backend.c:
static int vhost_kernel_set_vring_call(struct vhost_dev *dev,
                                       struct vhost_vring_file *file)
{
    return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file);
}

static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request,
                             void *arg)
{
    ...
    return ioctl(fd, request, arg);
}

Let's have a look at how vhost handles this ioctl():

linux-5.7/drivers/vhost/vhost.c:
long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
     ...
     case VHOST_SET_VRING_CALL:
         if (copy_from_user(&f, argp, sizeof f)) {
             ...
         }
         ctx = f.fd == -1 ? NULL : eventfd_ctx_fdget(f.fd);
         ...
         swap(ctx, vq->call_ctx);
         break;
     ...
}

After registration, vhost uses this eventfd to signal the guest:

void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        /* Signal the Guest tell them we used something up. */
        if (vq->call_ctx && vhost_notify(dev, vq))
                eventfd_signal(vq->call_ctx, 1);
}
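
The mechanics behind this path are plain eventfd semantics: the writer increments a 64-bit counter, the reader consumes it. Here is a minimal stand-alone sketch; in the real setup the consumer is not a read() loop but KVM, which is handed the same eventfd as an irqfd and injects a virtual interrupt into the guest:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
    int efd = eventfd(0, EFD_CLOEXEC);
    uint64_t v = 1;

    /* producer side: roughly what eventfd_signal(ctx, 1) boils down to */
    if (write(efd, &v, sizeof(v)) != sizeof(v))
        return 1;

    /* consumer side: read() returns the counter value and resets it */
    if (read(efd, &v, sizeof(v)) == sizeof(v))
        printf("signalled, counter was %llu\n", (unsigned long long)v);

    close(efd);
    return 0;
}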

Vhost start in QEMU

QEMU starts virtio-net when the link status of the interface changes in the guest.

qemu-5.0.0/hw/net/virtio-net.c:
static NetClientInfo net_virtio_info = {
    .link_status_changed = virtio_net_set_link_status,
}

virtio_net_set_link_status() calls virtio_net_set_status(), which in turn calls virtio_net_vhost_status(), which starts vhost:

qemu-5.0.0/hw/net/virtio-net.c:
static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
{
     ...
     r = vhost_net_start(vdev, n->nic->ncs, queues);
     ...
}

vhost_net_start() calls vhost_net_start_one(), which executes vhost_dev_start():

qemu-5.0.0/hw/net/vhost_net.c
static int vhost_net_start_one(struct vhost_net *net,
                               VirtIODevice *dev)
{
     ...
     r = vhost_dev_start(&net->dev, dev);
     ...
}

vhost_dev_start() sets the memory table in vhost and copies information such as the vring descriptor address, the number of descriptors, and the used and available ring addresses from virtio-specific structures to vhost structures using vhost_virtqueue_start():

qemu-5.0.0/hw/virtio/vhost.c
static int vhost_virtqueue_start(struct vhost_dev *dev,
                                struct VirtIODevice *vdev,
                                struct vhost_virtqueue *vq,
                                unsigned idx)
{
    hwaddr s, l, a;
    ...
    a = virtio_queue_get_desc_addr(vdev, idx);
    ...
    vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
    vq->desc_phys = a;
    vq->desc = vhost_memory_map(dev, a, &l, false);
    ...
    vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
    vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
    vq->avail = vhost_memory_map(dev, a, &l, false);
    ...
    vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
    vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
    vq->used = vhost_memory_map(dev, a, &l, true);
    ...

    r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
    ...
}

Finally, vhost_virtqueue_start() calls vhost_virtqueue_set_addr(), which hands the vring addresses over to vhost via ioctl():

qemu-5.0.0/hw/virtio/vhost.c
static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
                                    struct vhost_virtqueue *vq,
                                    unsigned idx, bool enable_log)
{
    struct vhost_vring_addr addr = {
        .index = idx,
        .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
        .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
        .used_user_addr = (uint64_t)(unsigned long)vq->used,
        .log_guest_addr = vq->used_phys,
        .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
    };
    int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
    ...
}

If you're interested in who sets the virtio vring descriptor address in QEMU, please check out the last section of this article.

Back in vhost_net_start_one(), the TAP device FD is handed over to vhost by calling vhost_net_set_backend():

qemu-5.0.0/hw/net/vhost_net.c
static int vhost_net_start_one(struct vhost_net *net,
                               VirtIODevice *dev)
{
     ...
     if (net->nc->info->type == NET_CLIENT_DRIVER_TAP) {
        ...
        file.fd = net->backend; // tap FD
        for (file.index = 0; file.index < net->dev.nvqs; ++file.index)
        {
            r = vhost_net_set_backend(&net->dev, &file);
            ...
        }
     }
     ...
}
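
Putting the QEMU side together, the handover can be condensed into the following hypothetical user-space sketch of the ioctl sequence. Feature negotiation (VHOST_GET_FEATURES/VHOST_SET_FEATURES) and the per-queue VHOST_SET_VRING_NUM/VHOST_SET_VRING_BASE calls are omitted, and setup_vhost(), its parameters and the hard-coded queue index 0 are illustrative only:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

static int setup_vhost(int tap_fd, int kick_efd, int call_efd,
                       struct vhost_memory *mem,
                       struct vhost_vring_addr *addr)
{
    int vhost_fd = open("/dev/vhost-net", O_RDWR);
    struct vhost_vring_file file = { .index = 0 };

    ioctl(vhost_fd, VHOST_SET_OWNER);              /* bind the device to this process */
    ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem);     /* guest RAM layout (GPA -> HVA) */

    file.fd = call_efd;
    ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);  /* vhost -> guest notification */
    file.fd = kick_efd;
    ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file);  /* guest -> vhost notification */

    ioctl(vhost_fd, VHOST_SET_VRING_ADDR, addr);   /* desc/avail/used ring addresses */

    file.fd = tap_fd;
    ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file); /* attach the TAP device */

    return vhost_fd;
}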

Vhost start in kernel

As soon as the VHOST_NET_SET_BACKEND ioctl() arrives in the kernel, vhost handles it as follows:

linux-5.7/drivers/vhost/net.c:
static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg)
{
        switch (ioctl) {
        case VHOST_NET_SET_BACKEND:
                if (copy_from_user(&backend, argp, sizeof backend))
                        return -EFAULT;
                return vhost_net_set_backend(n, backend.index, backend.fd);
        ...
}

static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{        
      ...
      struct socket *sock = get_socket(fd);
      ...
      r = vhost_net_enable_vq(n, vq);
}

vhost_net_set_backend() first gets the socket structure backing the TAP file descriptor and eventually calls vhost_net_enable_vq():

linux-5.7/drivers/vhost/net.c:
static int vhost_net_enable_vq(struct vhost_net *n,
                                struct vhost_virtqueue *vq)
{
        ...
        sock = vhost_vq_get_backend(vq);
        ...
        return vhost_poll_start(poll, sock->file);
}

vhost_net_enable_vq() calls vhost_poll_start(), which starts polling the TAP device:

linux-5.7/drivers/vhost/vhost.c:
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
        ...
        mask = vfs_poll(file, &poll->table);
        if (mask)
                vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
        ...
}


Vhost TAP polling

If some data is available for RX/TX, vhost_poll_wakeup() wakes up its worker:

linux-5.7/drivers/vhost/vhost.c:
static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
                             void *key)
{
      struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
      ...
      vhost_poll_queue(poll);
}
void vhost_poll_queue(struct vhost_poll *poll)
{
      vhost_work_queue(poll->dev, &poll->work);
}
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
{
      if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
           llist_add(&work->node, &dev->work_list);
           wake_up_process(dev->worker);
      }
}

dev->worker is a kernel thread created in vhost_dev_set_owner():

linux-5.7/drivers/vhost/vhost.c:
long vhost_dev_set_owner(struct vhost_dev *dev)
{
     ...
     dev->worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
}

The worker thread sleeps until it gets work to handle:

linux-5.7/drivers/vhost/vhost.c:
static int vhost_worker(void *data)
{
     ...
     for (;;) {
          ...
          node = llist_del_all(&dev->work_list);
          if (!node)
             schedule();

          llist_for_each_entry_safe(work, work_next, node, node) {
             clear_bit(VHOST_WORK_QUEUED, &work->flags);
             work->fn(work);
             ...
          }
     }
}

When the QEMU process opens the /dev/vhost-net device, vhost initializes the poll structures for RX/TX as follows:

linux-5.7/drivers/vhost/net.c:
static int vhost_net_open(struct inode *inode, struct file *f)
{
      ...
      vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
      vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
      ...
}

Depending on the mask (EPOLLIN/EPOLLOUT), vhost registers the handle_rx_net/handle_tx_net callbacks in the poll structures. vhost_poll_init() associates these structures with a poll_table, and when an IN or OUT event occurs, the appropriate work structure is selected using the mask and the corresponding callback is invoked by the vhost_worker thread.
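
For reference, the vhost_poll structure is what ties the wait queue entry, the poll table and the work item together:

linux-5.7/drivers/vhost/vhost.h:
struct vhost_poll {
        poll_table                table;
        wait_queue_head_t        *wqh;
        wait_queue_entry_t        wait;
        struct vhost_work         work;
        __poll_t                  mask;
        struct vhost_dev         *dev;
};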


Vhost RX/TX processing

Before going into the details of how RX is handled in vhost, let's have a look at the vhost_virtqueue structure:

linux-5.7/drivers/vhost/vhost.h:
struct vhost_virtqueue {
        struct vhost_dev *dev;
        ...
        unsigned int num;
        struct vring_desc __user *desc; 
        struct vring_avail __user *avail;
        struct vring_used __user *used; 
        struct eventfd_ctx *call_ctx;
        ...
        struct vhost_poll poll;
        u16 last_avail_idx;
        u16 avail_idx;
        u16 last_used_idx;
        ...
        struct iovec iov[UIO_MAXIOV];
        struct iovec iotlb_iov[64];
};
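
Note that desc, avail and used are __user pointers: they point into QEMU's process address space (the *_user_addr values handed over earlier via VHOST_SET_VRING_ADDR), which is why vhost accesses the rings with copy_from_user()-style helpers.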

struct vring_desc is defined as follows:

linux-5.7/include/uapi/linux/virtio_ring.h:
struct vring_desc {
        /* Address (guest-physical). */
        __virtio64 addr;
        /* Length. */
        __virtio32 len;
        __virtio16 flags; 
        __virtio16 next;
};

struct iovec is defined as follows:

linux-5.7/include/uapi/linux/uio.h:
struct iovec
{
        void __user *iov_base; 
        __kernel_size_t iov_len;
};

Here is a short overview of the vhost handle_rx() function:

linux-5.7/drivers/vhost/net.c:
static void handle_rx(struct vhost_net *net)
{
        ...
        struct msghdr msg;
        ...
        do {
            sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
                                                          &busyloop_intr);
            sock_len += sock_hlen;
            vhost_len = sock_len + vhost_hlen;
            headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
                                            vhost_len, &in, vq_log, &log,
                                            likely(mergeable) ? UIO_MAXIOV : 1);
            ...
            iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
            ...
            err = sock->ops->recvmsg(sock, &msg,
                                             sock_len, MSG_DONTWAIT | MSG_TRUNC);
            ...
            total_len += vhost_len;
        } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
        ...                                        
out:
        vhost_net_signal_used(nvq);
}

First, vhost_net_rx_peek_head_len() is called to get the size of the data available for RX.

Then get_rx_bufs() tries to get the number of vring descriptors needed to receive the data for the virtio-net device:

linux-5.7/drivers/vhost/net.c:
static int get_rx_bufs(struct vhost_virtqueue *vq,
                       struct vring_used_elem *heads,
                       int datalen,
                       unsigned *iovcount,
                       struct vhost_log *log,
                       unsigned *log_num,
                       unsigned int quota)
{       
        ...
        while (datalen > 0 && headcount < quota) {
                ...
                r = vhost_get_vq_desc(vq, vq->iov + seg,
                                      ARRAY_SIZE(vq->iov) - seg, &out,
                                      &in, log, log_num);
                ...
                heads[headcount].id = cpu_to_vhost32(vq, d);
                len = iov_length(vq->iov + seg, in);
                heads[headcount].len = cpu_to_vhost32(vq, len);
                datalen -= len;
                ++headcount;
                seg += in;
        }       
        heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
        ...
}

vhost_get_vq_desc() looks for the first available descriptor in the virtqueue and converts it into iovec entries:

linux-5.7/drivers/vhost/vhost.c:
int vhost_get_vq_desc(struct vhost_virtqueue *vq,
                      struct iovec iov[], unsigned int iov_size,
                      unsigned int *out_num, unsigned int *in_num,
                      struct vhost_log *log, unsigned int *log_num)
{
        i = head;
        do {
                unsigned iov_count = *in_num + *out_num;
                ...
                ret = vhost_get_desc(vq, &desc, i);
                ...
                ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
                                     vhost32_to_cpu(vq, desc.len), iov + iov_count,
                                     iov_size - iov_count, access);
                ...
        } while ((i = next_desc(vq, &desc)) != -1);
        vq->last_avail_idx++;
        return head;
}

vhost_get_desc() copies the descriptor from QEMU's user-space memory, and translate_desc() translates the guest-physical address in the descriptor into an iovec entry pointing into QEMU's address space.
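
Conceptually, the translation walks the memory table that QEMU installed earlier with VHOST_SET_MEM_TABLE: find the region containing the guest-physical address and rebase the offset onto QEMU's user-space mapping. A simplified illustration (the real translate_desc() also handles IOTLB entries and splits a buffer across several iovec entries when it crosses region boundaries; gpa_to_hva() is an illustrative name, not a kernel function):

#include <stdint.h>
#include <stddef.h>
#include <linux/vhost.h>

static void *gpa_to_hva(const struct vhost_memory *mem, uint64_t gpa)
{
    for (uint32_t i = 0; i < mem->nregions; i++) {
        const struct vhost_memory_region *r = &mem->regions[i];

        if (gpa >= r->guest_phys_addr &&
            gpa - r->guest_phys_addr < r->memory_size)
            /* offset inside the region, rebased onto the user-space mapping */
            return (void *)(uintptr_t)(r->userspace_addr +
                                       (gpa - r->guest_phys_addr));
    }
    return NULL; /* address not covered by any region */
}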

Back in handle_rx(), the iov_iter in the msghdr structure is initialized with the resulting iovec array by calling iov_iter_init(), and control is passed to the sock->ops->recvmsg() callback to receive the data for the virtio-net device.

handle_tx() uses the same approach, except that it calls sock->ops->sendmsg() for data transmission.


BIOS & VIRTIO-PCI device initialization

In order to understand how the vring descriptor address is set in QEMU, let's have a look at how the virtio-pci device is initialized:

qemu-5.0.0/hw/virtio/virtio-pci.c:
static void virtio_pci_bus_class_init(ObjectClass *klass, void *data)
{
     ...
     k->device_plugged = virtio_pci_device_plugged;
     ...
}
/* This is called by virtio-bus just after the device is plugged. */
static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
{
     ...
     memory_region_init_io(&proxy->bar, OBJECT(proxy),
                              &virtio_pci_config_ops,
                              proxy, "virtio-pci", size);
     ...
}
static const MemoryRegionOps virtio_pci_config_ops = {
    .read = virtio_pci_config_read,
    .write = virtio_pci_config_write,
    ...
}

The VIRTIO_PCI_QUEUE_PFN command is used to set up the vring descriptor address when the guest writes to the legacy virtio-pci I/O region (handled by virtio_pci_config_ops):

qemu-5.0.0/hw/virtio/virtio-pci.c:
static void virtio_pci_config_write(void *opaque, hwaddr addr,
                                    uint64_t val, unsigned size)
{
    VirtIOPCIProxy *proxy = opaque;
    uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev);

    if (addr < config) {
        virtio_ioport_write(proxy, addr, val);
        return;
    }
    ...
}
static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
{
    VirtIOPCIProxy *proxy = opaque;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
    hwaddr pa;

    switch (addr) {
    case VIRTIO_PCI_QUEUE_PFN:
        pa = (hwaddr)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT;
        if (pa == 0) {
            virtio_pci_reset(DEVICE(proxy));
        }
        else
            virtio_queue_set_addr(vdev, vdev->queue_sel, pa);
        break;
    ...
}

and this is how the vring descriptor address is set in virtio_queue_set_addr():

qemu-5.0.0/hw/virtio/virtio.c:
void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr)
{
    if (!vdev->vq[n].vring.num) {
        return;
    }
    vdev->vq[n].vring.desc = addr;
    virtio_queue_update_rings(vdev, n);
}
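
For example, assuming the guest placed the vring at guest-physical address 0x0f348000 (a made-up value), it writes 0x0f348 to VIRTIO_PCI_QUEUE_PFN, and QEMU reconstructs 0x0f348 << 12 = 0x0f348000 here and stores it in vring.desc.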

You might be curious: who sets the device up? The answer is the BIOS/iPXE (U-Boot on other architectures):

qemu-5.0.0/roms/ipxe/src/drivers/bus/virtio-pci.c:
int vp_find_vq(unsigned int ioaddr, int queue_index,
               struct vring_virtqueue *vq)
{
   struct vring * vr = &vq->vring;
   u16 num;
   ...
   outw(queue_index, ioaddr + VIRTIO_PCI_QUEUE_SEL);

   num = inw(ioaddr + VIRTIO_PCI_QUEUE_NUM);
   if (!num) {
           DBG("VIRTIO-PCI ERROR: queue size is 0\n");
           return -1;
   }
   vq->queue_index = queue_index;
   rc = vp_alloc_vq(vq, num);
   ...
   vring_init(vr, num, vq->queue);
   outl((unsigned long)virt_to_phys(vr->desc) >> PAGE_SHIFT,
        ioaddr + VIRTIO_PCI_QUEUE_PFN);
   ...
}

The picture below depicts how the BIOS/iPXE, QEMU and vhost communicate:

