new file mode 100644
@@ -0,0 +1,48 @@
+# BSD LICENSE
+#
+# Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_vhost.a
+
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64
+LDFLAGS += -lfuse
+# all sources are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost-net-cdev.c virtio-net.c vhost_rxtx.c
+
+# install includes
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
+
+# this lib needs eal and mbuf
+DEPDIRS-$(CONFIG_RTE_LIBRTE_VHOST) += lib/librte_eal lib/librte_mbuf
+
+include $(RTE_SDK)/mk/rte.lib.mk
@@ -34,28 +34,25 @@
#ifndef _VIRTIO_NET_H_
#define _VIRTIO_NET_H_
+#include <stdint.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_net.h>
+#include <sys/eventfd.h>
+
+#include <rte_memory.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+
/* Used to indicate that the device is running on a data core */
#define VIRTIO_DEV_RUNNING 1
/* Backend value set by guest. */
#define VIRTIO_DEV_STOPPED -1
-#define PAGE_SIZE 4096
/* Enum for virtqueue management. */
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
-#define BUF_VECTOR_MAX 256
-
-/*
- * Structure contains buffer address, length and descriptor index
- * from vring to do scatter RX.
-*/
-struct buf_vector {
-uint64_t buf_addr;
-uint32_t buf_len;
-uint32_t desc_idx;
-};
/*
* Structure contains variables relevant to TX/RX virtqueues.
@@ -72,36 +69,8 @@ struct vhost_virtqueue
volatile uint16_t last_used_idx_res; /* Used for multiple devices reserving buffers. */
eventfd_t callfd; /* Currently unused as polling mode is enabled. */
eventfd_t kickfd; /* Used to notify the guest (trigger interrupt). */
- /* Used for scatter RX. */
- struct buf_vector buf_vec[BUF_VECTOR_MAX];
-} __rte_cache_aligned;
-
-/*
- * Device structure contains all configuration information relating to the device.
- */
-struct virtio_net
-{
- struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /* Contains all virtqueue information. */
- struct virtio_memory *mem; /* QEMU memory and memory region information. */
- struct ether_addr mac_address; /* Device MAC address (Obtained on first TX packet). */
- uint64_t features; /* Negotiated feature set. */
- uint64_t device_fh; /* device identifier. */
- uint32_t vmdq_rx_q; /* RX VMDQ queue number. */
- uint32_t flags; /* Device flags. Only used to check if device is running on data core. */
- uint32_t vlan_tag; /* Vlan tag for device. Currently set to device_id (0-63). */
- uint16_t coreid; /* Data core that the device is added to. */
- volatile uint8_t ready; /* A device is set as ready if the MAC address has been set. */
- volatile uint8_t remove; /* Device is marked for removal from the data core. */
} __rte_cache_aligned;
-/*
- * Device linked list structure for configuration.
- */
-struct virtio_net_config_ll
-{
- struct virtio_net dev; /* Virtio device. */
- struct virtio_net_config_ll *next; /* Next entry on linked list. */
-};
/*
* Information relating to memory regions including offsets to addresses in QEMUs memory file.
@@ -114,48 +83,116 @@ struct virtio_memory_regions {
uint64_t address_offset; /* Offset of region for address translation. */
};
-/*
- * Information relating to memory regions including offsets to
- * addresses in host physical space.
- */
-struct virtio_memory_regions_hpa {
- /* Base guest physical address of region. */
- uint64_t guest_phys_address;
- /* End guest physical address of region. */
- uint64_t guest_phys_address_end;
- /* Size of region. */
- uint64_t memory_size;
- /* Offset of region for gpa to hpa translation. */
- uint64_t host_phys_addr_offset;
-};
-/*
+/**
* Memory structure includes region and mapping information.
*/
struct virtio_memory {
- uint64_t base_address; /* Base QEMU userspace address of the memory file. */
- uint64_t mapped_address; /* Mapped address of memory file base in our applications memory space. */
- uint64_t mapped_size; /* Total size of memory file. */
- uint32_t nregions; /* Number of memory regions. */
- /* Number of memory regions for gpa to hpa translation. */
- uint32_t nregions_hpa;
- /* Memory region information for gpa to hpa translation. */
- struct virtio_memory_regions_hpa *regions_hpa;
- /* Memory region information. */
- struct virtio_memory_regions regions[0];
+ uint64_t base_address; /**< Base QEMU userspace address of the memory file. */
+ uint64_t mapped_address; /**< Mapped address of memory file base in our applications memory space. */
+ uint64_t mapped_size; /**< Total size of memory file. */
+ uint32_t nregions; /**< Number of memory regions. */
+ struct virtio_memory_regions regions[0]; /**< Memory region information. */
};
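Because regions[0] is a zero-length trailing array, the region table is allocated inline with the structure. A minimal allocation sketch, assuming nregions has already been taken from the VHOST_SET_MEM_TABLE message (alloc_virtio_memory is a hypothetical helper, not part of the library):

#include <stdlib.h>

/* Sketch: allocate a virtio_memory block with 'nregions' region slots
 * laid out directly after the fixed-size fields. */
static struct virtio_memory *
alloc_virtio_memory(uint32_t nregions)
{
	struct virtio_memory *mem;

	mem = calloc(1, sizeof(struct virtio_memory) +
			nregions * sizeof(struct virtio_memory_regions));
	if (mem != NULL)
		mem->nregions = nregions;
	return mem;
}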
-/*
+/**
+ * Device structure contains all configuration information relating to the device.
+ */
+struct virtio_net {
+ struct vhost_virtqueue *virtqueue[VIRTIO_QNUM]; /**< Contains all virtqueue information. */
+ struct virtio_memory *mem; /**< QEMU memory and memory region information. */
+ uint64_t features; /**< Negotiated feature set. */
+ uint64_t device_fh; /**< Device identifier. */
+ uint32_t flags; /**< Device flags. Only used to check if device is running on data core. */
+	void *priv;		/**< Private data for application use. */
+} __rte_cache_aligned;
+
+/**
* Device operations to add/remove device.
*/
struct virtio_net_device_ops {
- int (* new_device) (struct virtio_net *); /* Add device. */
- void (* destroy_device) (volatile struct virtio_net *); /* Remove device. */
+ int (*new_device)(struct virtio_net *); /**< Add device. */
+ void (*destroy_device)(struct virtio_net *); /**< Remove device. */
};
-int init_virtio_net(struct virtio_net_device_ops const * const);
-int deinit_virtio_net(void);
-struct vhost_net_device_ops const * get_virtio_net_callbacks(void);
+static inline uint16_t __attribute__((always_inline))
+rte_vring_available_entries(struct virtio_net *dev, uint16_t queue_id)
+{
+ struct vhost_virtqueue *vq = dev->virtqueue[queue_id];
+ return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx_res;
+}
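As a usage illustration, a polling data core can consult this helper to skip ring processing when the guest has posted nothing. A sketch only: poll_guest_tx and the burst size of 32 are hypothetical, and rte_vhost_dequeue_burst is declared further down in this header.

/* Hypothetical polling-loop body: dequeue from the guest TX virtqueue
 * only when buffers are actually available. 'pkts' must have room for
 * at least 32 mbuf pointers. */
static uint32_t
poll_guest_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool,
		struct rte_mbuf **pkts)
{
	if (rte_vring_available_entries(dev, VIRTIO_TXQ) == 0)
		return 0; /* nothing posted; avoid touching the ring */

	return rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts, 32);
}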
+
+/**
+ * Function to convert guest physical addresses to vhost virtual addresses.
+ * This is used to convert guest virtio buffer addresses.
+ */
+static inline uint64_t __attribute__((always_inline))
+gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
+{
+ struct virtio_memory_regions *region;
+ uint32_t regionidx;
+ uint64_t vhost_va = 0;
+
+ for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
+ region = &dev->mem->regions[regionidx];
+ if ((guest_pa >= region->guest_phys_address) &&
+ (guest_pa <= region->guest_phys_address_end)) {
+ vhost_va = region->address_offset + guest_pa;
+ break;
+ }
+ }
+ return vhost_va;
+}
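For illustration, a hedged sketch of using this translation before copying a descriptor's payload out of guest memory; copy_desc_out and its destination buffer are hypothetical, and rte_memcpy is assumed to be visible here. A return of 0 from gpa_to_vva means no region covered the address.

/* Sketch: copy one vring descriptor's payload out of guest memory.
 * 'dst' must be at least desc->len bytes long. */
static int
copy_desc_out(struct virtio_net *dev, struct vring_desc *desc, void *dst)
{
	uint64_t vva = gpa_to_vva(dev, desc->addr);

	if (vva == 0)
		return -1; /* address not covered by any guest memory region */
	rte_memcpy(dst, (const void *)(uintptr_t)vva, desc->len);
	return 0;
}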
+
+/**
+ * Disable features in feature_mask. Returns 0 on success.
+ */
+int rte_vhost_feature_disable(uint64_t feature_mask);
+
+/**
+ * Enable features in feature_mask. Returns 0 on success.
+ */
+int rte_vhost_feature_enable(uint64_t feature_mask);
+
+/** Returns the currently supported vhost features. */
+uint64_t rte_vhost_feature_get(void);
+
+/* Enable or disable guest kick notifications for the given queue. Only disabling is currently supported. */
+int rte_vhost_enable_guest_notification(struct virtio_net *dev, uint16_t queue_id, int enable);
+
+/* Register the vhost driver. dev_name can differ per call to support multiple instances. */
+int rte_vhost_driver_register(const char *dev_name);
+
+/* Register callbacks. */
+int rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const);
+
+int rte_vhost_driver_session_start(void);
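Putting the registration API together, a minimal setup sketch for an application. The callback bodies, the my_ names and the "vhost-net" device name are illustrative only; rte_vhost_driver_session_start() blocks in the CUSE session loop.

/* Hypothetical device callbacks invoked by the library on CUSE events. */
static int
my_new_device(struct virtio_net *dev)
{
	dev->flags |= VIRTIO_DEV_RUNNING;	/* hand the device to a data core */
	return 0;
}

static void
my_destroy_device(struct virtio_net *dev)
{
	dev->flags &= ~VIRTIO_DEV_RUNNING;	/* stop polling before teardown */
}

static const struct virtio_net_device_ops my_ops = {
	.new_device = my_new_device,
	.destroy_device = my_destroy_device,
};

/* Sketch: register the character device, install the callbacks, then
 * block servicing open/release/ioctl requests from QEMU. */
static int
vhost_setup(void)
{
	if (rte_vhost_driver_register("vhost-net") != 0)
		return -1;
	rte_vhost_driver_callback_register(&my_ops);
	return rte_vhost_driver_session_start();
}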
+
+/**
+ * This function adds buffers to the virtio device's RX virtqueue. Buffers can
+ * be received from the physical port or from another virtual device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue.
+ * @param dev
+ *  virtio device
+ * @param queue_id
+ *  virtio queue index in the multiqueue case
+ * @param pkts
+ *  array of mbufs to copy into the virtqueue
+ * @param count
+ *  number of packets in the array
+ * @return
+ *  number of packets enqueued
+ */
+uint32_t rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint32_t count);
+
+/**
+ * This function gets guest buffers from the virtio device's TX virtqueue,
+ * constructs host mbufs, copies the guest buffer content into them and
+ * stores them in pkts for processing.
+ * @param dev
+ *  virtio device
+ * @param queue_id
+ *  virtio queue index in the multiqueue case.
+ * @param mbuf_pool
+ *  mempool from which host mbufs are allocated.
+ * @param pkts
+ *  array that receives the dequeued mbufs.
+ * @param count
+ *  maximum number of packets to dequeue.
+ * @return
+ *  number of packets dequeued
+ */
+uint32_t rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count);
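For instance, the two burst calls slot naturally into a NIC forwarding loop. A sketch under assumptions: port 0, queue 0 and a burst of 32 are illustrative, rte_ethdev.h is assumed included, and TX error handling is elided. Enqueue copies the data, so the source mbufs can be freed immediately.

/* Hypothetical data-core iteration: NIC RX -> guest RX virtqueue,
 * guest TX virtqueue -> NIC TX. */
static void
switch_iteration(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[32];
	uint16_t nb_rx;
	uint32_t nb_tx, i;

	nb_rx = rte_eth_rx_burst(0, 0, pkts, 32);
	if (nb_rx) {
		rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts, nb_rx);
		for (i = 0; i < nb_rx; i++)
			rte_pktmbuf_free(pkts[i]);	/* data was copied */
	}

	nb_tx = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts, 32);
	if (nb_tx)
		rte_eth_tx_burst(0, 0, pkts, (uint16_t)nb_tx);
}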
#endif /* _VIRTIO_NET_H_ */
@@ -42,18 +42,18 @@
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
+#include <rte_virtio_net.h>
-#include "main.h"
#include "vhost-net-cdev.h"
#define FUSE_OPT_DUMMY "\0\0"
#define FUSE_OPT_FORE "-f\0\0"
#define FUSE_OPT_NOMULTI "-s\0\0"
-const uint32_t default_major = 231;
-const uint32_t default_minor = 1;
-const char cuse_device_name[] = "/dev/cuse";
-const char default_cdev[] = "vhost-net";
+static const uint32_t default_major = 231;
+static const uint32_t default_minor = 1;
+static const char cuse_device_name[] = "/dev/cuse";
+static const char default_cdev[] = "vhost-net";
static struct fuse_session *session;
static struct vhost_net_device_ops const *ops;
@@ -116,7 +116,7 @@ vhost_net_release(fuse_req_t req, struct fuse_file_info *fi)
#define VHOST_IOCTL(func) do { \
result = (func)(ctx); \
fuse_reply_ioctl(req, result, NULL, 0); \
-} while(0) \
+} while (0)
/*
* Boilerplate IOCTL RETRY
@@ -215,6 +215,7 @@ vhost_net_ioctl(fuse_req_t req, int cmd, void *arg,
break;
case VHOST_SET_MEM_TABLE:
+	/* TODO: fix race condition. */
LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL: VHOST_SET_MEM_TABLE\n", ctx.fh);
static struct vhost_memory mem_temp;
@@ -302,7 +303,7 @@ static const struct cuse_lowlevel_ops vhost_net_ops = {
* also passed when the device is registered in main.c.
*/
int
-register_cuse_device(const char *base_name, int index, struct vhost_net_device_ops const * const pops)
+rte_vhost_driver_register(const char *dev_name)
{
struct cuse_info cuse_info;
char device_name[PATH_MAX] = "";
@@ -321,16 +322,11 @@ register_cuse_device(const char *base_name, int index, struct vhost_net_device_o
/*
* The device name is created. This is passed to QEMU so that it can register
- * the device with our application. The index allows us to have multiple instances
+ * the device with our application. The dev_name allows us to have multiple instances
* of userspace vhost which we can then add devices to separately.
*/
- if (strncmp(base_name, default_cdev, PATH_MAX)!=0) {
- snprintf(device_name, PATH_MAX, "DEVNAME=%s-%d", base_name, index);
- snprintf(char_device_name, PATH_MAX, "/dev/%s-%d", base_name, index);
- } else {
- snprintf(device_name, PATH_MAX, "DEVNAME=%s", base_name);
- snprintf(char_device_name, PATH_MAX, "/dev/%s", base_name);
- }
+ snprintf(device_name, PATH_MAX, "DEVNAME=%s", dev_name);
+ snprintf(char_device_name, PATH_MAX, "/dev/%s", dev_name);
/* Check if device already exists. */
if (access(char_device_name, F_OK) != -1) {
@@ -340,12 +336,12 @@ register_cuse_device(const char *base_name, int index, struct vhost_net_device_o
memset(&cuse_info, 0, sizeof(cuse_info));
cuse_info.dev_major = default_major;
- cuse_info.dev_minor = default_minor + index;
+ cuse_info.dev_minor = default_minor;
cuse_info.dev_info_argc = 1;
cuse_info.dev_info_argv = device_argv;
cuse_info.flags = CUSE_UNRESTRICTED_IOCTL;
- ops = pops;
+ ops = get_virtio_net_callbacks();
session = cuse_lowlevel_setup(3, fuse_argv,
&cuse_info, &vhost_net_ops, 0, NULL);
@@ -355,11 +351,12 @@ register_cuse_device(const char *base_name, int index, struct vhost_net_device_o
return 0;
}
-/*
+
+/**
* The CUSE session is launched allowing the application to receive open, release and ioctl calls.
*/
int
-start_cuse_session_loop(void)
+rte_vhost_driver_session_start(void)
{
fuse_session_loop(session);
@@ -33,13 +33,45 @@
#ifndef _VHOST_NET_CDEV_H_
#define _VHOST_NET_CDEV_H_
-
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
#include <linux/vhost.h>
-struct vhost_memory;
-struct vhost_vring_state;
-struct vhost_vring_addr;
-struct vhost_vring_file;
+#include <rte_log.h>
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VHOST_MAX_PRINT_BUFF 6072
+#define LOG_LEVEL RTE_LOG_DEBUG
+#define LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
+#define VHOST_PRINT_PACKET(device, addr, size, header) do { \
+ char *pkt_addr = (char *)(addr); \
+ unsigned int index; \
+ char packet[VHOST_MAX_PRINT_BUFF]; \
+ \
+ if ((header)) \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
+ else \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
+ for (index = 0; index < (size); index++) { \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
+ "%02hhx ", pkt_addr[index]); \
+ } \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
+ \
+ LOG_DEBUG(VHOST_DATA, "%s", packet); \
+} while (0)
+#else
+#define LOG_LEVEL RTE_LOG_INFO
+#define LOG_DEBUG(log_type, fmt, args...) do {} while (0)
+#define VHOST_PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
/*
* Structure used to identify device context.
@@ -77,7 +109,6 @@ struct vhost_net_device_ops {
int (* reset_owner) (struct vhost_device_ctx);
};
-int register_cuse_device(const char *base_name, int index, struct vhost_net_device_ops const * const);
-int start_cuse_session_loop(void);
+struct vhost_net_device_ops const *get_virtio_net_callbacks(void);
#endif /* _VHOST_NET_CDEV_H_ */
@@ -40,17 +40,17 @@
#include "vhost-net-cdev.h"
-#define MAX_PKT_BURST 64
-#define MAX_MRG_PKT_BURST 64
+#define VHOST_MAX_PKT_BURST 64
+#define VHOST_MAX_MRG_PKT_BURST 64
-/*
+/**
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue. Both mergeable and non-mergeable RX are handled.
*/
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
+uint32_t
+rte_vhost_enqueue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count)
{
struct vhost_virtqueue *vq;
struct vring_desc *desc;
@@ -59,36 +59,28 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
uint64_t buff_addr = 0;
uint64_t buff_hdr_addr = 0;
- uint32_t head[MAX_PKT_BURST], packet_len = 0;
+ uint32_t head[VHOST_MAX_PKT_BURST], packet_len = 0;
uint32_t head_idx, packet_success = 0;
- uint32_t retry = 0;
+ uint32_t mergeable, mrg_count = 0;
uint16_t avail_idx, res_cur_idx;
uint16_t res_base_idx, res_end_idx;
uint16_t free_entries;
uint8_t success = 0;
-	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
+	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") rte_vhost_enqueue_burst()\n", dev->device_fh);
- vq = dev->virtqueue[VIRTIO_RXQ];
- count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
+ if (unlikely(queue_id != VIRTIO_RXQ)) {
+ LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
+ return 0;
+ }
+ vq = dev->virtqueue[VIRTIO_RXQ];
+ count = (count > VHOST_MAX_PKT_BURST) ? VHOST_MAX_PKT_BURST : count;
	/* Because multiple data cores may want access to available buffers, they need to be reserved. */
do {
res_base_idx = vq->last_used_idx_res;
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
free_entries = (avail_idx - res_base_idx);
- /* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
- if (enable_retry && unlikely(count > free_entries)) {
- for (retry = 0; retry < burst_rx_retry_num; retry++) {
- rte_delay_us(burst_rx_delay_time);
- avail_idx =
- *((volatile uint16_t *)&vq->avail->idx);
- free_entries = (avail_idx - res_base_idx);
- if (count <= free_entries)
- break;
- }
- }
-
/*check that we have enough buffers*/
if (unlikely(count > free_entries))
count = free_entries;
@@ -98,8 +90,10 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
res_end_idx = res_base_idx + count;
/* vq->last_used_idx_res is atomically updated. */
- success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
- res_end_idx);
+	/* TODO: allow disabling the cmpset when the application has no concurrency. */
+	success = rte_atomic16_cmpset(&vq->last_used_idx_res,
+			res_base_idx, res_end_idx);
+	/* If the cmpset failed due to contention, try again. */
} while (unlikely(success == 0));
res_cur_idx = res_base_idx;
LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
@@ -107,6 +101,9 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
/* Prefetch available ring to retrieve indexes. */
rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
+ /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
+ mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
+
/* Retrieve all of the head indexes first to avoid caching issues. */
for (head_idx = 0; head_idx < count; head_idx++)
head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
@@ -125,44 +122,60 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
/* Prefetch buffer address. */
rte_prefetch0((void*)(uintptr_t)buff_addr);
- /* Copy virtio_hdr to packet and increment buffer address */
- buff_hdr_addr = buff_addr;
- packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
-
- /*
- * If the descriptors are chained the header and data are
- * placed in separate buffers.
- */
- if (desc->flags & VRING_DESC_F_NEXT) {
- desc->len = vq->vhost_hlen;
- desc = &vq->desc[desc->next];
- /* Buffer address translation. */
- buff_addr = gpa_to_vva(dev, desc->addr);
- desc->len = rte_pktmbuf_data_len(buff);
+ if (mergeable && (mrg_count != 0)) {
+ desc->len = packet_len = rte_pktmbuf_data_len(buff);
} else {
- buff_addr += vq->vhost_hlen;
- desc->len = packet_len;
+ /* Copy virtio_hdr to packet and increment buffer address */
+ buff_hdr_addr = buff_addr;
+ packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
+
+ /*
+ * If the descriptors are chained the header and data are placed in
+ * separate buffers.
+ */
+ if (desc->flags & VRING_DESC_F_NEXT) {
+ desc->len = vq->vhost_hlen;
+ desc = &vq->desc[desc->next];
+ /* Buffer address translation. */
+ buff_addr = gpa_to_vva(dev, desc->addr);
+ desc->len = rte_pktmbuf_data_len(buff);
+ } else {
+ buff_addr += vq->vhost_hlen;
+ desc->len = packet_len;
+ }
}
+
/* Update used ring with desc information */
vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
/* Copy mbuf data to buffer */
+	/* TODO: handle scattered mbufs and descriptors too small to hold the mbuf data. */
rte_memcpy((void *)(uintptr_t)buff_addr,
(const void *)buff->pkt.data,
rte_pktmbuf_data_len(buff));
- PRINT_PACKET(dev, (uintptr_t)buff_addr,
+ VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr,
rte_pktmbuf_data_len(buff), 0);
res_cur_idx++;
packet_success++;
-
- rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
- (const void *)&virtio_hdr, vq->vhost_hlen);
-
- PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
-
+
+ /* If mergeable is disabled then a header is required per buffer. */
+ if (!mergeable) {
+ rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
+ VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
+ } else {
+ mrg_count++;
+			/* A merged packet can only span so many buffers. Write the header to tell the guest once this limit is reached or the burst ends. */
+ if ((mrg_count == VHOST_MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
+ virtio_hdr.num_buffers = mrg_count;
+ LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
+ rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
+ VHOST_PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
+ mrg_count = 0;
+ }
+ }
if (res_cur_idx < res_end_idx) {
/* Prefetch descriptor index. */
rte_prefetch0(&vq->desc[head[packet_success]]);
@@ -185,25 +198,30 @@ virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
}
-static inline void __attribute__((always_inline))
-virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
+uint32_t
+rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count)
{
- struct rte_mbuf m;
+ struct rte_mbuf *mbuf;
struct vhost_virtqueue *vq;
struct vring_desc *desc;
uint64_t buff_addr = 0;
- uint32_t head[MAX_PKT_BURST];
+ uint32_t head[VHOST_MAX_PKT_BURST];
uint32_t used_idx;
uint32_t i;
uint16_t free_entries, packet_success = 0;
uint16_t avail_idx;
+ if (unlikely(queue_id != VIRTIO_TXQ)) {
+ LOG_DEBUG(VHOST_DATA, "mq isn't supported in this version.\n");
+ return 0;
+ }
+
vq = dev->virtqueue[VIRTIO_TXQ];
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
/* If there are no available buffers then return. */
if (vq->last_used_idx == avail_idx)
- return;
+ return 0;
-	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
+	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") rte_vhost_dequeue_burst()\n", dev->device_fh);
@@ -213,9 +231,11 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
	/* Get the number of free entries in the ring. */
free_entries = (avail_idx - vq->last_used_idx);
+ if (free_entries > count)
+ free_entries = count;
-	/* Limit to MAX_PKT_BURST. */
+	/* Limit to VHOST_MAX_PKT_BURST. */
- if (free_entries > MAX_PKT_BURST)
- free_entries = MAX_PKT_BURST;
+ if (free_entries > VHOST_MAX_PKT_BURST)
+ free_entries = VHOST_MAX_PKT_BURST;
LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
/* Retrieve all of the head indexes first to avoid caching issues. */
@@ -249,23 +269,20 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
vq->used->ring[used_idx].id = head[packet_success];
vq->used->ring[used_idx].len = 0;
- /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
- m.pkt.data_len = desc->len;
- m.pkt.pkt_len = desc->len;
- m.pkt.data = (void*)(uintptr_t)buff_addr;
+ mbuf = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(mbuf == NULL)) {
+ RTE_LOG(ERR, VHOST_DATA, "Failed to allocate memory for mbuf.\n");
+ return packet_success;
+ }
+ mbuf->pkt.data_len = desc->len;
+ mbuf->pkt.pkt_len = mbuf->pkt.data_len;
+
+ rte_memcpy((void *) mbuf->pkt.data,
+ (const void *) buff_addr, mbuf->pkt.data_len);
- PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
+ pkts[packet_success] = mbuf;
- /* If this is the first received packet we need to learn the MAC and setup VMDQ */
- if (dev->ready == DEVICE_MAC_LEARNING) {
- if (dev->remove || (link_vmdq(dev, &m) == -1)) {
- /*discard frame if device is scheduled for removal or a duplicate MAC address is found. */
- packet_success += free_entries;
- vq->last_used_idx += packet_success;
- break;
- }
- }
- virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);
+ VHOST_PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
vq->last_used_idx++;
packet_success++;
@@ -276,6 +293,6 @@ virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
/* Kick guest if required. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
eventfd_write((int)vq->kickfd, 1);
-}
-
+ return packet_success;
+}
@@ -47,27 +47,32 @@
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_memory.h>
+#include <rte_virtio_net.h>
-#include "main.h"
-#include "virtio-net.h"
#include "vhost-net-cdev.h"
#include "eventfd_link/eventfd_link.h"
-const char eventfd_cdev[] = "/dev/eventfd-link";
+/**
+ * Device linked list structure for configuration.
+ */
+struct virtio_net_config_ll {
+ struct virtio_net dev; /* Virtio device. */
+ struct virtio_net_config_ll *next; /* Next entry on linked list. */
+};
-extern uint32_t num_devices;
-static uint32_t num_cur_devices = 0;
+static const char eventfd_cdev[] = "/dev/eventfd-link";
/* Device ops used to add/remove a device to/from the data core. */
static struct virtio_net_device_ops const * notify_ops;
/* Root address of the linked list in the configuration core. */
static struct virtio_net_config_ll *ll_root = NULL;
-/* Features supported by this application. RX merge buffers are disabled by default. */
-uint64_t VHOST_FEATURES = (0ULL << VIRTIO_NET_F_MRG_RXBUF);
+/* Features supported by this library. */
+#define VHOST_SUPPORTED_FEATURES (1ULL << VIRTIO_NET_F_MRG_RXBUF)
+static uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
/* Line size for reading maps file. */
-const uint32_t BUFSIZE = PATH_MAX;
+static const uint32_t BUFSIZE = PATH_MAX;
/* Size of prot char array in procmap. */
#define PROT_SZ 5
@@ -347,8 +352,6 @@ cleanup_device(struct virtio_net *dev)
/* Unmap QEMU memory file if mapped. */
if (dev->mem) {
munmap((void*)(uintptr_t)dev->mem->mapped_address, (size_t)dev->mem->mapped_size);
- if (dev->mem->regions_hpa)
- free(dev->mem->regions_hpa);
free(dev->mem);
}
@@ -434,12 +437,6 @@ new_device(struct vhost_device_ctx ctx)
struct virtio_net_config_ll *new_ll_dev;
struct vhost_virtqueue *virtqueue_rx, *virtqueue_tx;
- /*check the number of devices in the system*/
- if (num_cur_devices == num_devices) {
- RTE_LOG(ERR, VHOST_CONFIG, "() Max num devices (%u) exceeded\n", num_devices);
- return -1;
- }
-
/* Setup device and virtqueues. */
new_ll_dev = malloc(sizeof(struct virtio_net_config_ll));
if (new_ll_dev == NULL) {
@@ -473,9 +470,6 @@ new_device(struct vhost_device_ctx ctx)
/* Add entry to device configuration linked list. */
add_config_ll_entry(new_ll_dev);
- /*increment the number of devices in the system*/
- num_cur_devices++;
-
return new_ll_dev->dev.device_fh;
}
@@ -506,9 +500,6 @@ destroy_device(struct vhost_device_ctx ctx)
ll_dev_cur = ll_dev_cur->next;
}
}
-
- /*decrement the number of devices in the system*/
- num_cur_devices--;
}
/*
@@ -592,153 +583,6 @@ set_features(struct vhost_device_ctx ctx, uint64_t *pu)
return 0;
}
-/*
- * Calculate the region count of physical continous regions for one particular
- * region of whose vhost virtual address is continous. The particular region
- * start from vva_start, with size of 'size' in argument.
- */
-static uint32_t check_hpa_regions(uint64_t vva_start, uint64_t size)
-{
- uint32_t i, nregions = 0, page_size = PAGE_SIZE;
- uint64_t cur_phys_addr = 0, next_phys_addr = 0;
- if (vva_start % page_size) {
- LOG_DEBUG(VHOST_CONFIG,
- "in check_countinous: vva start(%p) mod page_size(%d) "
- "has remainder\n",
- (void *)(uintptr_t)vva_start, page_size);
- return 0;
- }
- if (size % page_size) {
- LOG_DEBUG(VHOST_CONFIG,
- "in check_countinous: "
- "size((%"PRIu64")) mod page_size(%d) has remainder\n",
- size, page_size);
- return 0;
- }
- for (i = 0; i < size - page_size; i = i + page_size) {
- cur_phys_addr
- = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
- next_phys_addr = rte_mem_virt2phy(
- (void *)(uintptr_t)(vva_start + i + page_size));
- if ((cur_phys_addr + page_size) != next_phys_addr) {
- ++nregions;
- LOG_DEBUG(VHOST_CONFIG,
- "in check_continuous: hva addr:(%p) is not "
- "continuous with hva addr:(%p), diff:%d\n",
- (void *)(uintptr_t)(vva_start + (uint64_t)i),
- (void *)(uintptr_t)(vva_start + (uint64_t)i
- + page_size), page_size);
- LOG_DEBUG(VHOST_CONFIG,
- "in check_continuous: hpa addr:(%p) is not "
- "continuous with hpa addr:(%p), "
- "diff:(%"PRIu64")\n",
- (void *)(uintptr_t)cur_phys_addr,
- (void *)(uintptr_t)next_phys_addr,
- (next_phys_addr-cur_phys_addr));
- }
- }
- return nregions;
-}
-
-/*
- * Divide each region whose vhost virtual address is continous into a few
- * sub-regions, make sure the physical address within each sub-region are
- * continous. And fill offset(to GPA) and size etc. information of each
- * sub-region into regions_hpa.
- */
-static uint32_t fill_hpa_memory_regions(void *memory)
-{
- uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = PAGE_SIZE;
- uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
- struct virtio_memory *virtio_memory = (struct virtio_memory *)memory;
- struct virtio_memory_regions_hpa *mem_region_hpa
- = virtio_memory->regions_hpa;
-
- if (mem_region_hpa == NULL)
- return 0;
-
- for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
- vva_start = virtio_memory->regions[regionidx].guest_phys_address
- + virtio_memory->regions[regionidx].address_offset;
- mem_region_hpa[regionidx_hpa].guest_phys_address
- = virtio_memory->regions[regionidx].guest_phys_address;
- mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
- rte_mem_virt2phy((void *)(uintptr_t)(vva_start))
- - mem_region_hpa[regionidx_hpa].guest_phys_address;
- LOG_DEBUG(VHOST_CONFIG,
- "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
- regionidx_hpa,
- (void *)(uintptr_t)
- (mem_region_hpa[regionidx_hpa].guest_phys_address));
- LOG_DEBUG(VHOST_CONFIG,
- "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
- regionidx_hpa,
- (void *)(uintptr_t)
- (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
- for (i = 0, k = 0;
- i < virtio_memory->regions[regionidx].memory_size
- - page_size;
- i += page_size) {
- cur_phys_addr = rte_mem_virt2phy(
- (void *)(uintptr_t)(vva_start + i));
- next_phys_addr = rte_mem_virt2phy(
- (void *)(uintptr_t)(vva_start
- + i + page_size));
- if ((cur_phys_addr + page_size) != next_phys_addr) {
- mem_region_hpa[regionidx_hpa].guest_phys_address_end =
- mem_region_hpa[regionidx_hpa].guest_phys_address
- + k + page_size;
- mem_region_hpa[regionidx_hpa].memory_size
- = k + page_size;
- LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
- "phys addr end [%d]:(%p)\n",
- regionidx_hpa,
- (void *)(uintptr_t)
- (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
- LOG_DEBUG(VHOST_CONFIG,
- "in fill_hpa_regions: guest phys addr "
- "size [%d]:(%p)\n",
- regionidx_hpa,
- (void *)(uintptr_t)
- (mem_region_hpa[regionidx_hpa].memory_size));
- mem_region_hpa[regionidx_hpa + 1].guest_phys_address
- = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
- ++regionidx_hpa;
- mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
- next_phys_addr
- - mem_region_hpa[regionidx_hpa].guest_phys_address;
- LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
- " phys addr start[%d]:(%p)\n",
- regionidx_hpa,
- (void *)(uintptr_t)
- (mem_region_hpa[regionidx_hpa].guest_phys_address));
- LOG_DEBUG(VHOST_CONFIG,
- "in fill_hpa_regions: host phys addr "
- "start[%d]:(%p)\n",
- regionidx_hpa,
- (void *)(uintptr_t)
- (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
- k = 0;
- } else {
- k += page_size;
- }
- }
- mem_region_hpa[regionidx_hpa].guest_phys_address_end
- = mem_region_hpa[regionidx_hpa].guest_phys_address
- + k + page_size;
- mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
- LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
- "[%d]:(%p)\n", regionidx_hpa,
- (void *)(uintptr_t)
- (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
- LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
- "[%d]:(%p)\n", regionidx_hpa,
- (void *)(uintptr_t)
- (mem_region_hpa[regionidx_hpa].memory_size));
- ++regionidx_hpa;
- }
- return regionidx_hpa;
-}
/*
* Called from CUSE IOCTL: VHOST_SET_MEM_TABLE
@@ -832,7 +676,6 @@ set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, uint32_
}
}
mem->nregions = valid_regions;
- mem->nregions_hpa = mem->nregions;
dev->mem = mem;
/*
@@ -843,34 +686,7 @@ set_mem_table(struct vhost_device_ctx ctx, const void *mem_regions_addr, uint32_
dev->mem->regions[regionidx].address_offset = dev->mem->regions[regionidx].userspace_address - dev->mem->base_address
+ dev->mem->mapped_address - dev->mem->regions[regionidx].guest_phys_address;
- dev->mem->nregions_hpa
- += check_hpa_regions(
- dev->mem->regions[regionidx].guest_phys_address
- + dev->mem->regions[regionidx].address_offset,
- dev->mem->regions[regionidx].memory_size);
- }
- if (dev->mem->regions_hpa != NULL) {
- free(dev->mem->regions_hpa);
- dev->mem->regions_hpa = NULL;
}
-
- dev->mem->regions_hpa = (struct virtio_memory_regions_hpa *) calloc(1,
- (sizeof(struct virtio_memory_regions_hpa)
- * dev->mem->nregions_hpa));
- if (dev->mem->regions_hpa == NULL) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "(%"PRIu64") Failed to allocate memory for "
- "dev->mem->regions_hpa.\n", dev->device_fh);
- return -1;
- }
- if (fill_hpa_memory_regions(
- (void *)dev->mem) != dev->mem->nregions_hpa) {
- RTE_LOG(ERR, VHOST_CONFIG,
- "in set_mem_table: hpa memory regions number mismatch: "
- "[%d]\n", dev->mem->nregions_hpa);
- return -1;
- }
-
return 0;
}
@@ -1144,22 +960,45 @@ get_virtio_net_callbacks(void)
return &vhost_device_ops;
}
-/*
- * Register ops so that we can add/remove device to data core.
- */
-int
-init_virtio_net(struct virtio_net_device_ops const * const ops)
+int rte_vhost_enable_guest_notification(struct virtio_net *dev, uint16_t queue_id, int enable)
{
- notify_ops = ops;
+ if (enable) {
+ RTE_LOG(ERR, VHOST_CONFIG, "guest notification isn't supported.\n");
+ return -1;
+ }
+ dev->virtqueue[queue_id]->used->flags = enable ? 0 : VRING_USED_F_NO_NOTIFY;
return 0;
}
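On the caller side, a polling application would typically turn notifications off for both queues as soon as a device appears; doing so from the new_device callback is one reasonable place. A two-line sketch:

/* Sketch: suppress guest->host kicks while the data core polls. */
rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);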
+uint64_t rte_vhost_feature_get(void)
+{
+ return VHOST_FEATURES;
+}
+
+int rte_vhost_feature_disable(uint64_t feature_mask)
+{
+ VHOST_FEATURES = VHOST_FEATURES & ~feature_mask;
+ return 0;
+}
+
+int rte_vhost_feature_enable(uint64_t feature_mask)
+{
+ if ((feature_mask & VHOST_SUPPORTED_FEATURES) == feature_mask) {
+ VHOST_FEATURES = VHOST_FEATURES | feature_mask;
+ return 0;
+ }
+ return -1;
+}
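As a usage note, an application that prefers the simpler non-mergeable RX path can mask the feature before any guest negotiates; this must run before rte_vhost_driver_session_start(). A one-line sketch:

/* Sketch: stop offering mergeable RX buffers to guests. */
rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);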
+
+
/*
- * Currently not used as we Ctrl+c to exit application.
+ * Register ops so that we can add/remove devices to/from the data core.
*/
int
-deinit_virtio_net(void)
+rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const ops)
{
+ notify_ops = ops;
+
return 0;
}