@@ -18,6 +18,7 @@
#define SFC_VDPA_MAC_ADDR "mac"
#define SFC_VDPA_DEFAULT_MCDI_IOVA 0x200000000000
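+/* Base IOVA at which mediated (SW) vrings are DMA mapped */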
+#define SFC_SW_VRING_IOVA 0x300000000000
/* Broadcast & Unicast MAC filters are supported */
#define SFC_MAX_SUPPORTED_FILTERS 3
@@ -4,10 +4,13 @@
#include <pthread.h>
#include <unistd.h>
+#include <sys/epoll.h>
#include <sys/ioctl.h>
+#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_malloc.h>
+#include <rte_memory.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_vhost.h>
@@ -33,7 +36,9 @@
*/
#define SFC_VDPA_DEFAULT_FEATURES \
((1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
- (1ULL << VIRTIO_NET_F_MQ))
+ (1ULL << VIRTIO_NET_F_MQ) | \
+ (1ULL << VHOST_F_LOG_ALL) | \
+ (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE))
#define SFC_VDPA_MSIX_IRQ_SET_BUF_LEN \
(sizeof(struct vfio_irq_set) + \
@@ -42,6 +47,142 @@
/* It will be used for target VF when calling function is not PF */
#define SFC_VDPA_VF_NULL 0xFFFF
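+
+/*
+ * Layout of the 64-bit epoll event data used by the SW relay:
+ * bit 0       - event type (0: guest kickfd, 1: device RX eventfd)
+ * bits 1..31  - virtqueue ID
+ * bits 32..63 - file descriptor
+ */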
+#define SFC_VDPA_DECODE_FD(data) ((data).u64 >> 32)
+#define SFC_VDPA_DECODE_QID(data) ((data).u32 >> 1)
+#define SFC_VDPA_DECODE_EV_TYPE(data) ((data).u32 & 1)
+
+/*
+ * Create q_num epoll events for kickfd interrupts and q_num/2
+ * events for callfd interrupts, rounding the total up to
+ * q_num * 2 events.
+ */
+#define SFC_VDPA_SW_RELAY_EVENT_NUM(q_num) ((q_num) * 2)
+
+static inline uint64_t
+sfc_vdpa_encode_ev_data(int type, uint32_t qid, int fd)
+{
+ /* fd and qid must fit in their encoded bit-fields */
+ SFC_VDPA_ASSERT(fd >= 0 && qid <= UINT32_MAX / 2);
+ return type | (qid << 1) | (uint64_t)fd << 32;
+}
+
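+/*
+ * Copy newly used descriptors from the mediated vring to the
+ * guest's used ring and notify the guest through its callfd.
+ */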
+static inline void
+sfc_vdpa_queue_relay(struct sfc_vdpa_ops_data *ops_data, uint32_t qid)
+{
+ rte_vdpa_relay_vring_used(ops_data->vid, qid, &ops_data->sw_vq[qid]);
+ rte_vhost_vring_call(ops_data->vid, qid);
+}
+
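+/*
+ * SW relay thread: waits on guest kickfds to ring the device
+ * doorbells, and on device RX eventfds to relay used entries
+ * back to the guest while live migration is in progress.
+ */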
+static void *
+sfc_vdpa_sw_relay(void *data)
+{
+ uint64_t buf;
+ uint32_t qid, q_num;
+ struct epoll_event ev;
+ struct rte_vhost_vring vring;
+ int nbytes, i, ret, fd, epfd, nfds = 0;
+ struct epoll_event events[SFC_VDPA_MAX_QUEUE_PAIRS * 2];
+ struct sfc_vdpa_ops_data *ops_data = (struct sfc_vdpa_ops_data *)data;
+
+ q_num = rte_vhost_get_vring_num(ops_data->vid);
+ epfd = epoll_create(SFC_VDPA_SW_RELAY_EVENT_NUM(q_num));
+ if (epfd < 0) {
+ sfc_vdpa_log_init(ops_data->dev_handle,
+ "failed to create epoll instance");
+ goto fail_epoll;
+ }
+ ops_data->epfd = epfd;
+
+ vring.kickfd = -1;
+ for (qid = 0; qid < q_num; qid++) {
+ ev.events = EPOLLIN | EPOLLPRI;
+ ret = rte_vhost_get_vhost_vring(ops_data->vid, qid, &vring);
+ if (ret != 0) {
+ sfc_vdpa_log_init(ops_data->dev_handle,
+ "rte_vhost_get_vhost_vring failed: %d",
+ ret);
+ goto fail_vring;
+ }
+
+ ev.data.u64 = sfc_vdpa_encode_ev_data(0, qid, vring.kickfd);
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
+ sfc_vdpa_log_init(ops_data->dev_handle,
+ "epoll add error: %s",
+ strerror(errno));
+ goto fail_epoll_add;
+ }
+ }
+
+ /*
+ * Add to epoll the intr_fds that the vDPA driver registered
+ * with VFIO in lieu of qemu's callfds, so that RX interrupts
+ * can be intercepted and rte_vdpa_relay_vring_used() issued.
+ */
+ for (qid = 0; qid < q_num; qid += 2) {
+ fd = ops_data->intr_fd[qid];
+ ev.events = EPOLLIN | EPOLLPRI;
+ ev.data.u64 = sfc_vdpa_encode_ev_data(1, qid, fd);
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+ sfc_vdpa_log_init(ops_data->dev_handle,
+ "epoll add error: %s",
+ strerror(errno));
+ goto fail_epoll_add;
+ }
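+ /* Sync used entries accumulated while the relay was being set up */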
+ sfc_vdpa_queue_relay(ops_data, qid);
+ }
+
+ /*
+ * The virtio driver in the VM may have kept sending queue
+ * notifications while the software vrings were being set up,
+ * in which case the HW missed those doorbells. Since it is
+ * safe to send a duplicate doorbell, ring each doorbell once
+ * more from the vDPA driver.
+ */
+ for (qid = 0; qid < q_num; qid++)
+ rte_write16(qid, ops_data->vq_cxt[qid].doorbell);
+
+ for (;;) {
+ nfds = epoll_wait(epfd, events, RTE_DIM(events), -1);
+ if (nfds < 0) {
+ if (errno == EINTR)
+ continue;
+ sfc_vdpa_log_init(ops_data->dev_handle,
+ "epoll_wait failed: %s", strerror(errno));
+ goto fail_epoll_wait;
+ }
+
+ for (i = 0; i < nfds; i++) {
+ fd = SFC_VDPA_DECODE_FD(events[i].data);
+ /* Drain the eventfd, retrying on transient errors */
+ for (;;) {
+ nbytes = read(fd, &buf, 8);
+ if (nbytes < 0) {
+ if (errno == EINTR ||
+ errno == EWOULDBLOCK ||
+ errno == EAGAIN)
+ continue;
+ }
+ break;
+ }
+
+ qid = SFC_VDPA_DECODE_QID(events[i].data);
+ if (SFC_VDPA_DECODE_EV_TYPE(events[i].data))
+ sfc_vdpa_queue_relay(ops_data, qid);
+ else
+ rte_write16(qid, ops_data->vq_cxt[qid].doorbell);
+ }
+ }
+
+ return NULL;
+
+fail_vring:
+fail_epoll_add:
+fail_epoll_wait:
+ close(epfd);
+fail_epoll:
+ ops_data->epfd = -1;
+ return NULL;
+}
+
static int
sfc_vdpa_get_device_features(struct sfc_vdpa_ops_data *ops_data)
{
@@ -99,7 +240,7 @@
static int
sfc_vdpa_enable_vfio_intr(struct sfc_vdpa_ops_data *ops_data)
{
- int rc;
+ int rc, fd;
int *irq_fd_ptr;
int vfio_dev_fd;
uint32_t i, num_vring;
@@ -131,6 +272,17 @@
return -1;
irq_fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
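+ /*
+ * In SW fallback mode, replace the guest callfd of each RX
+ * (even) queue with a driver-created eventfd so that the
+ * relay thread can intercept RX interrupts and relay used
+ * entries to the guest.
+ */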
+ if (ops_data->sw_fallback_mode && !(i & 1)) {
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (fd < 0) {
+ sfc_vdpa_err(ops_data->dev_handle,
+ "failed to create eventfd");
+ goto fail_eventfd;
+ }
+ ops_data->intr_fd[i] = fd;
+ irq_fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
+ } else {
+ ops_data->intr_fd[i] = -1;
+ }
}
rc = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
@@ -138,16 +290,26 @@
sfc_vdpa_err(ops_data->dev_handle,
"error enabling MSI-X interrupts: %s",
strerror(errno));
- return -1;
+ goto fail_ioctl;
}
return 0;
+
+fail_ioctl:
+fail_eventfd:
+ /* Close only the eventfds that were actually created */
+ while (i-- > 0) {
+ if (ops_data->intr_fd[i] != -1) {
+ close(ops_data->intr_fd[i]);
+ ops_data->intr_fd[i] = -1;
+ }
+ }
+ return -1;
}
static int
sfc_vdpa_disable_vfio_intr(struct sfc_vdpa_ops_data *ops_data)
{
- int rc;
+ int rc, i;
int vfio_dev_fd;
struct vfio_irq_set irq_set;
void *dev;
@@ -161,6 +323,12 @@
irq_set.index = VFIO_PCI_MSIX_IRQ_INDEX;
irq_set.start = 0;
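+
+ /* Close the per-queue relay eventfds created for SW fallback */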
+ for (i = 0; i < ops_data->vq_count; i++) {
+ if (ops_data->intr_fd[i] >= 0)
+ close(ops_data->intr_fd[i]);
+ ops_data->intr_fd[i] = -1;
+ }
+
rc = ioctl(vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set);
if (rc) {
sfc_vdpa_err(ops_data->dev_handle,
@@ -223,12 +391,15 @@
static int
sfc_vdpa_virtq_start(struct sfc_vdpa_ops_data *ops_data, int vq_num)
{
- int rc;
+ int rc, fd;
+ uint64_t size;
uint32_t doorbell;
efx_virtio_vq_t *vq;
+ void *vring_buf, *dev;
struct sfc_vdpa_vring_info vring;
efx_virtio_vq_cfg_t vq_cfg;
efx_virtio_vq_dyncfg_t vq_dyncfg;
+ uint64_t sw_vq_iova = ops_data->sw_vq_iova;
vq = ops_data->vq_cxt[vq_num].vq;
if (vq == NULL)
@@ -241,6 +412,33 @@
goto fail_vring_info;
}
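+ /*
+ * In SW fallback mode, allocate a mediated vring in host memory,
+ * DMA map it for the device and, for RX queues, point the
+ * device's used ring at it so that used entries can be relayed
+ * (and dirty pages logged) on their way back to the guest.
+ */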
+ if (ops_data->sw_fallback_mode) {
+ size = vring_size(vring.size, rte_mem_page_size());
+ size = RTE_ALIGN_CEIL(size, rte_mem_page_size());
+ vring_buf = rte_zmalloc("vdpa", size, rte_mem_page_size());
+ if (vring_buf == NULL)
+ goto fail_vring_info;
+
+ vring_init(&ops_data->sw_vq[vq_num], vring.size, vring_buf,
+ rte_mem_page_size());
+
+ dev = ops_data->dev_handle;
+ fd = sfc_vdpa_adapter_by_dev_handle(dev)->vfio_container_fd;
+ rc = rte_vfio_container_dma_map(fd,
+ (uint64_t)(uintptr_t)vring_buf,
+ sw_vq_iova, size);
+ if (rc != 0) {
+ sfc_vdpa_err(dev, "failed to DMA map SW vring");
+ rte_free(vring_buf);
+ goto fail_vring_info;
+ }
+
+ /* Direct I/O for Tx queue, relay for Rx queue */
+ if (!(vq_num & 1))
+ vring.used = sw_vq_iova +
+ (char *)ops_data->sw_vq[vq_num].used -
+ (char *)ops_data->sw_vq[vq_num].desc;
+
+ ops_data->sw_vq[vq_num].used->idx = vring.last_used_idx;
+ ops_data->sw_vq[vq_num].avail->idx = vring.last_avail_idx;
+
+ ops_data->vq_cxt[vq_num].sw_vq_iova = sw_vq_iova;
+ ops_data->vq_cxt[vq_num].sw_vq_size = size;
+ ops_data->sw_vq_iova += size;
+ }
+
vq_cfg.evvc_target_vf = SFC_VDPA_VF_NULL;
/* even virtqueue for RX and odd for TX */
@@ -309,9 +507,12 @@
static int
sfc_vdpa_virtq_stop(struct sfc_vdpa_ops_data *ops_data, int vq_num)
{
- int rc;
+ int rc, fd;
+ void *dev, *buf;
+ uint64_t size, len, iova;
efx_virtio_vq_dyncfg_t vq_idx;
efx_virtio_vq_t *vq;
+ struct rte_vhost_vring vring;
if (ops_data->vq_cxt[vq_num].enable != B_TRUE)
return -1;
@@ -319,13 +520,34 @@
vq = ops_data->vq_cxt[vq_num].vq;
if (vq == NULL)
return -1;
+ if (ops_data->sw_fallback_mode) {
+ dev = ops_data->dev_handle;
+ fd = sfc_vdpa_adapter_by_dev_handle(dev)->vfio_container_fd;
+ /* synchronize remaining new used entries if any */
+ if (!(vq_num & 1))
+ sfc_vdpa_queue_relay(ops_data, vq_num);
+
+ rte_vhost_get_vhost_vring(ops_data->vid, vq_num, &vring);
+ len = SFC_VDPA_USED_RING_LEN(vring.size);
+ rte_vhost_log_used_vring(ops_data->vid, vq_num, 0, len);
+
+ buf = ops_data->sw_vq[vq_num].desc;
+ size = ops_data->vq_cxt[vq_num].sw_vq_size;
+ iova = ops_data->vq_cxt[vq_num].sw_vq_iova;
+ rte_vfio_container_dma_unmap(fd, (uint64_t)(uintptr_t)buf,
+ iova, size);
+ /* Free the mediated vring allocated in sfc_vdpa_virtq_start() */
+ rte_free(buf);
+ }
/* stop the vq */
rc = efx_virtio_qstop(vq, &vq_idx);
if (rc == 0) {
- ops_data->vq_cxt[vq_num].cidx = vq_idx.evvd_vq_cidx;
- ops_data->vq_cxt[vq_num].pidx = vq_idx.evvd_vq_pidx;
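+ /*
+ * In SW relay mode the guest consumes completions from its
+ * used ring, so resume the avail index from the device's
+ * used index as well.
+ */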
+ if (ops_data->sw_fallback_mode)
+ vq_idx.evvd_vq_avail_idx = vq_idx.evvd_vq_used_idx;
+ rte_vhost_set_vring_base(ops_data->vid, vq_num,
+ vq_idx.evvd_vq_avail_idx,
+ vq_idx.evvd_vq_used_idx);
}
+
ops_data->vq_cxt[vq_num].enable = B_FALSE;
return rc;
@@ -450,7 +672,11 @@
SFC_EFX_ASSERT(ops_data->state == SFC_VDPA_STATE_CONFIGURED);
- sfc_vdpa_log_init(ops_data->dev_handle, "entry");
+ if (ops_data->sw_fallback_mode) {
+ sfc_vdpa_log_init(ops_data->dev_handle,
+ "Trying to start VDPA with SW I/O relay");
+ ops_data->sw_vq_iova = SFC_SW_VRING_IOVA;
+ }
ops_data->state = SFC_VDPA_STATE_STARTING;
@@ -675,6 +901,7 @@
sfc_vdpa_dev_close(int vid)
{
int ret;
+ void *status;
struct rte_vdpa_device *vdpa_dev;
struct sfc_vdpa_ops_data *ops_data;
@@ -707,7 +934,23 @@
}
ops_data->is_notify_thread_started = false;
+ if (ops_data->sw_fallback_mode) {
+ ret = pthread_cancel(ops_data->sw_relay_thread_id);
+ if (ret != 0)
+ sfc_vdpa_err(ops_data->dev_handle,
+ "failed to cancel LM relay thread: %s",
+ rte_strerror(ret));
+
+ ret = pthread_join(ops_data->sw_relay_thread_id, &status);
+ if (ret != 0)
+ sfc_vdpa_err(ops_data->dev_handle,
+ "failed to join LM relay thread: %s",
+ rte_strerror(ret));
+ }
+
sfc_vdpa_stop(ops_data);
+ ops_data->sw_fallback_mode = false;
+
sfc_vdpa_close(ops_data);
sfc_vdpa_adapter_unlock(ops_data->dev_handle);
@@ -774,9 +1017,49 @@
static int
sfc_vdpa_set_features(int vid)
{
- RTE_SET_USED(vid);
+ int ret;
+ uint64_t features = 0;
+ struct rte_vdpa_device *vdpa_dev;
+ struct sfc_vdpa_ops_data *ops_data;
- return -1;
+ vdpa_dev = rte_vhost_get_vdpa_device(vid);
+ ops_data = sfc_vdpa_get_data_by_dev(vdpa_dev);
+ if (ops_data == NULL)
+ return -1;
+
+ rte_vhost_get_negotiated_features(vid, &features);
+
+ if (!RTE_VHOST_NEED_LOG(features))
+ return 0;
+
+ sfc_vdpa_info(ops_data->dev_handle, "live-migration triggered");
+
+ sfc_vdpa_adapter_lock(ops_data->dev_handle);
+
+ /* Stop HW Offload and unset host notifier */
+ sfc_vdpa_stop(ops_data);
+ if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false) != 0)
+ sfc_vdpa_info(ops_data->dev_handle,
+ "vDPA (%s): Failed to clear host notifier",
+ ops_data->vdpa_dev->device->name);
+
+ /* Restart vDPA with SW relay on RX queue */
+ ops_data->sw_fallback_mode = true;
+ sfc_vdpa_start(ops_data);
+ ret = pthread_create(&ops_data->sw_relay_thread_id, NULL,
+ sfc_vdpa_sw_relay, (void *)ops_data);
+ if (ret != 0)
+ sfc_vdpa_err(ops_data->dev_handle,
+ "failed to create rx_relay thread: %s",
+ rte_strerror(ret));
+
+ if (rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true) != 0)
+ sfc_vdpa_info(ops_data->dev_handle, "notifier setup failed!");
+
+ sfc_vdpa_adapter_unlock(ops_data->dev_handle);
+ sfc_vdpa_info(ops_data->dev_handle, "SW fallback setup done!");
+
+ return 0;
}
static int
@@ -860,17 +1143,28 @@
sfc_vdpa_info(dev, "vDPA ops get_notify_area :: offset : 0x%" PRIx64,
*offset);
- pci_dev = sfc_vdpa_adapter_by_dev_handle(dev)->pdev;
- doorbell = (uint8_t *)pci_dev->mem_resource[reg.index].addr + *offset;
+ if (!ops_data->sw_fallback_mode) {
+ pci_dev = sfc_vdpa_adapter_by_dev_handle(dev)->pdev;
+ doorbell = (uint8_t *)pci_dev->mem_resource[reg.index].addr +
+ *offset;
+ /*
+ * The virtio-net driver in the VM may send queue notifications
+ * before vDPA has had a chance to set up the queues and the
+ * notification area, and hence the HW misses these doorbell
+ * notifications. Since it is safe to send a duplicate doorbell,
+ * send another one from the vDPA driver as a workaround for
+ * this timing issue.
+ */
+ rte_write16(qid, doorbell);
+
+ /*
+ * Save the doorbell address; it is needed to ring the doorbell
+ * from the SW relay thread during live migration.
+ */
+ ops_data->vq_cxt[qid].doorbell = doorbell;
+ }
- /*
- * virtio-net driver in VM sends queue notifications before
- * vDPA has a chance to setup the queues and notification area,
- * and hence the HW misses these doorbell notifications.
- * Since, it is safe to send duplicate doorbell, send another
- * doorbell from vDPA driver as workaround for this timing issue.
- */
- rte_write16(qid, doorbell);
return 0;
}
@@ -6,8 +6,11 @@
#define _SFC_VDPA_OPS_H
#include <rte_vdpa.h>
+#include <vdpa_driver.h>
#define SFC_VDPA_MAX_QUEUE_PAIRS 8
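+
+/* Used ring length: 2B flags + 2B idx + used elems + 2B used_event */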
+#define SFC_VDPA_USED_RING_LEN(size) \
+ ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)
enum sfc_vdpa_context {
SFC_VDPA_AS_VF
@@ -37,9 +40,10 @@ struct sfc_vdpa_vring_info {
typedef struct sfc_vdpa_vq_context_s {
volatile void *doorbell;
uint8_t enable;
- uint32_t pidx;
- uint32_t cidx;
efx_virtio_vq_t *vq;
+
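+ /* IOVA and size of this VQ's mediated (SW) vring mapping */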
+ uint64_t sw_vq_iova;
+ uint64_t sw_vq_size;
} sfc_vdpa_vq_context_t;
struct sfc_vdpa_ops_data {
@@ -57,6 +61,13 @@ struct sfc_vdpa_ops_data {
uint16_t vq_count;
struct sfc_vdpa_vq_context_s vq_cxt[SFC_VDPA_MAX_QUEUE_PAIRS * 2];
+
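+ /* SW assisted live migration (fallback mode) state */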
+ int epfd;
+ uint64_t sw_vq_iova;
+ bool sw_fallback_mode;
+ pthread_t sw_relay_thread_id;
+ struct vring sw_vq[SFC_VDPA_MAX_QUEUE_PAIRS * 2];
+ int intr_fd[SFC_VDPA_MAX_QUEUE_PAIRS * 2];
};
struct sfc_vdpa_ops_data *