[dpdk-dev,3/3] vhost: support VFIO based accelerator

Message ID 20180306104327.14470-4-tiwei.bie@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply patch file failure

Commit Message

Tiwei Bie March 6, 2018, 10:43 a.m. UTC
  This commit adds VFIO-based accelerator support to
vhost. A new API is provided to ask QEMU to do further
setup so that notifications and interrupts can be
delivered directly between the driver in the guest
and the vDPA device in the host.

Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
---
 lib/librte_vhost/rte_vhost.h           |  28 ++++++
 lib/librte_vhost/rte_vhost_version.map |   1 +
 lib/librte_vhost/vhost_user.c          | 166 +++++++++++++++++++++++++++++++++
 lib/librte_vhost/vhost_user.h          |   9 ++
 4 files changed, 204 insertions(+)
  

Comments

Maxime Coquelin March 6, 2018, 2:24 p.m. UTC | #1
On 03/06/2018 11:43 AM, Tiwei Bie wrote:
> This commit adds the VFIO based accelerator support to
> vhost. A new API is provided to support asking QEMU to
> do further setup to allow notifications and interrupts
> being delivered directly between the driver in guest
> and the vDPA device in host.
> 
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>   lib/librte_vhost/rte_vhost.h           |  28 ++++++
>   lib/librte_vhost/rte_vhost_version.map |   1 +
>   lib/librte_vhost/vhost_user.c          | 166 +++++++++++++++++++++++++++++++++
>   lib/librte_vhost/vhost_user.h          |   9 ++
>   4 files changed, 204 insertions(+)
> 
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index d5589c543..68842e908 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -35,6 +35,7 @@ extern "C" {
>   #define RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK	3
>   #define RTE_VHOST_USER_PROTOCOL_F_NET_MTU	4
>   #define RTE_VHOST_USER_PROTOCOL_F_SLAVE_REQ	5
> +#define RTE_VHOST_USER_PROTOCOL_F_VFIO		8
>   #define RTE_VHOST_USER_F_PROTOCOL_FEATURES	30
>   
>   /**
> @@ -591,6 +592,33 @@ rte_vhost_get_vdpa_eid(int vid);
>   int __rte_experimental
>   rte_vhost_get_vdpa_did(int vid);
>   
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Enable or disable the VFIO based accelerator for vhost-user.
> + *
> + * This function is to ask QEMU to do further setup to better
> + * support the vDPA device at vhost user backend. With this
> + * setup, the notifications and interrupts will be delivered
> + * directly between the driver in guest and the vDPA device
> + * in host if platform supports e.g. EPT and Posted interrupt.
> + * It's nice to have, and not mandatory.
> + *
> + * @param vid
> + *  vhost device ID
> + * @param int
> + *  Enable or disable
> + *
> + * @return
> + *   0: success
> + *   -ENODEV: no such vhost device
> + *   -ENOTSUP: device does not support VFIO based accelerator feature
> + *   -EINVAL: there is no accelerator assigned to this vhost device
> + *   -EFAULT: failed to talk with QEMU
> + */
> +int rte_vhost_vfio_accelerator_ctrl(int vid, int enable);
> +
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
> index 36257e51b..ca970170f 100644
> --- a/lib/librte_vhost/rte_vhost_version.map
> +++ b/lib/librte_vhost/rte_vhost_version.map
> @@ -72,6 +72,7 @@ EXPERIMENTAL {
>   	rte_vhost_set_vring_base;
>   	rte_vhost_get_vdpa_eid;
>   	rte_vhost_get_vdpa_did;
> +	rte_vhost_vfio_accelerator_ctrl;
>   	rte_vdpa_register_engine;
>   	rte_vdpa_unregister_engine;
>   	rte_vdpa_find_engine_id;
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index e3a1dfbfb..a65598d80 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -35,6 +35,7 @@
>   #include <rte_common.h>
>   #include <rte_malloc.h>
>   #include <rte_log.h>
> +#include <rte_vhost.h>
>   
>   #include "iotlb.h"
>   #include "vhost.h"
> @@ -1628,6 +1629,27 @@ vhost_user_msg_handler(int vid, int fd)
>   	return 0;
>   }
>   
> +static int process_slave_message_reply(struct virtio_net *dev,
> +				       const VhostUserMsg *msg)
> +{
> +	VhostUserMsg msg_reply;
> +
> +	if ((msg->flags & VHOST_USER_NEED_REPLY) == 0)
> +		return 0;
> +
> +	if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0)
> +		return -1;
> +
> +	if (msg_reply.request.slave != msg->request.slave) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"received unexpected msg type (%u), expected %u\n",
> +			msg_reply.request.slave, msg->request.slave);
> +		return -1;
> +	}
> +
> +	return msg_reply.payload.u64;
> +}
> +
>   int
>   vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
>   {
> @@ -1653,3 +1675,147 @@ vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
>   
>   	return 0;
>   }
> +
> +static int vhost_user_slave_set_vring_file(struct virtio_net *dev,
> +					   uint32_t request,
> +					   struct vhost_vring_file *file)
Why pass the request as an argument?
It seems to be called only with the same request ID.

> +{
> +	int *fdp = NULL;
> +	size_t fd_num = 0;
> +	int ret;
> +	struct VhostUserMsg msg = {
> +		.request.slave = request,
> +		.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
> +		.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK,
> +		.size = sizeof(msg.payload.u64),
> +	};
> +
> +	if (file->fd < 0)
> +		msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
> +	else {
> +		fdp = &file->fd;
> +		fd_num = 1;
> +	}
> +
> +	ret = send_vhost_message(dev->slave_req_fd, &msg, fdp, fd_num);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +			"Failed to send slave message %u (%d)\n",
> +			request, ret);
> +		return ret;
> +	}
> +
> +	return process_slave_message_reply(dev, &msg);

Maybe not needed right now, but we'll need a lock to avoid concurrent
requests sending and waiting for reply.

Thanks,
Maxime
  
Tiwei Bie March 7, 2018, 8:59 a.m. UTC | #2
On Tue, Mar 06, 2018 at 03:24:27PM +0100, Maxime Coquelin wrote:
> On 03/06/2018 11:43 AM, Tiwei Bie wrote:
[...]
> > +
> > +static int vhost_user_slave_set_vring_file(struct virtio_net *dev,
> > +					   uint32_t request,
> > +					   struct vhost_vring_file *file)
> Why passing the request as an argument?
> It seems to be called only with the same request ID.

I thought there may be other requests that also need to
send a file descriptor for a ring in the future. So I
made this a common routine. Maybe it's not really helpful.
I won't pass the request as an argument in next version.

> 
> > +{
> > +	int *fdp = NULL;
> > +	size_t fd_num = 0;
> > +	int ret;
> > +	struct VhostUserMsg msg = {
> > +		.request.slave = request,
> > +		.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
> > +		.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK,
> > +		.size = sizeof(msg.payload.u64),
> > +	};
> > +
> > +	if (file->fd < 0)
> > +		msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
> > +	else {
> > +		fdp = &file->fd;
> > +		fd_num = 1;
> > +	}
> > +
> > +	ret = send_vhost_message(dev->slave_req_fd, &msg, fdp, fd_num);
> > +	if (ret < 0) {
> > +		RTE_LOG(ERR, VHOST_CONFIG,
> > +			"Failed to send slave message %u (%d)\n",
> > +			request, ret);
> > +		return ret;
> > +	}
> > +
> > +	return process_slave_message_reply(dev, &msg);
> 
> Maybe not needed right now, but we'll need a lock to avoid concurrent
> requests sending and waiting for reply.

Yeah, we probably need a lock for each slave channel. I haven't
checked the Linux code, but it may cause problems when two
threads send, e.g., the messages below at the same time:

thread A:
 IOTLB miss message

thread B:
 VFIO group message which has a file descriptor

Thanks for the comments! :)

Best regards,
Tiwei Bie
  

Patch

diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d5589c543..68842e908 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -35,6 +35,7 @@  extern "C" {
 #define RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK	3
 #define RTE_VHOST_USER_PROTOCOL_F_NET_MTU	4
 #define RTE_VHOST_USER_PROTOCOL_F_SLAVE_REQ	5
+#define RTE_VHOST_USER_PROTOCOL_F_VFIO		8
 #define RTE_VHOST_USER_F_PROTOCOL_FEATURES	30
 
 /**
@@ -591,6 +592,33 @@  rte_vhost_get_vdpa_eid(int vid);
 int __rte_experimental
 rte_vhost_get_vdpa_did(int vid);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Enable or disable the VFIO based accelerator for vhost-user.
+ *
+ * This function asks QEMU to do further setup to better
+ * support the vDPA device at the vhost-user backend. With this
+ * setup, notifications and interrupts are delivered
+ * directly between the driver in the guest and the vDPA device
+ * in the host, if the platform supports e.g. EPT and posted interrupts.
+ * It is nice to have, but not mandatory.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param enable
+ *  Enable or disable
+ *
+ * @return
+ *   0: success
+ *   -ENODEV: no such vhost device
+ *   -ENOTSUP: device does not support VFIO based accelerator feature
+ *   -EINVAL: there is no accelerator assigned to this vhost device
+ *   -EFAULT: failed to talk with QEMU
+ */
+int rte_vhost_vfio_accelerator_ctrl(int vid, int enable);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_vhost/rte_vhost_version.map b/lib/librte_vhost/rte_vhost_version.map
index 36257e51b..ca970170f 100644
--- a/lib/librte_vhost/rte_vhost_version.map
+++ b/lib/librte_vhost/rte_vhost_version.map
@@ -72,6 +72,7 @@  EXPERIMENTAL {
 	rte_vhost_set_vring_base;
 	rte_vhost_get_vdpa_eid;
 	rte_vhost_get_vdpa_did;
+	rte_vhost_vfio_accelerator_ctrl;
 	rte_vdpa_register_engine;
 	rte_vdpa_unregister_engine;
 	rte_vdpa_find_engine_id;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index e3a1dfbfb..a65598d80 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -35,6 +35,7 @@ 
 #include <rte_common.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
+#include <rte_vhost.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -1628,6 +1629,27 @@  vhost_user_msg_handler(int vid, int fd)
 	return 0;
 }
 
+static int process_slave_message_reply(struct virtio_net *dev,
+				       const VhostUserMsg *msg)
+{
+	VhostUserMsg msg_reply;
+
+	if ((msg->flags & VHOST_USER_NEED_REPLY) == 0)
+		return 0;
+
+	if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0)
+		return -1;
+
+	if (msg_reply.request.slave != msg->request.slave) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"received unexpected msg type (%u), expected %u\n",
+			msg_reply.request.slave, msg->request.slave);
+		return -1;
+	}
+
+	return msg_reply.payload.u64;
+}
+
 int
 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
 {
@@ -1653,3 +1675,147 @@  vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
 
 	return 0;
 }
+
+static int vhost_user_slave_set_vring_file(struct virtio_net *dev,
+					   uint32_t request,
+					   struct vhost_vring_file *file)
+{
+	int *fdp = NULL;
+	size_t fd_num = 0;
+	int ret;
+	struct VhostUserMsg msg = {
+		.request.slave = request,
+		.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
+		.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK,
+		.size = sizeof(msg.payload.u64),
+	};
+
+	if (file->fd < 0)
+		msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
+	else {
+		fdp = &file->fd;
+		fd_num = 1;
+	}
+
+	ret = send_vhost_message(dev->slave_req_fd, &msg, fdp, fd_num);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to send slave message %u (%d)\n",
+			request, ret);
+		return ret;
+	}
+
+	return process_slave_message_reply(dev, &msg);
+}
+
+static int vhost_user_slave_set_vring_notify_area(struct virtio_net *dev,
+						  int index, int fd,
+						  uint64_t offset,
+						  uint64_t size)
+{
+	int *fdp = NULL;
+	size_t fd_num = 0;
+	int ret;
+	struct VhostUserMsg msg = {
+		.request.slave = VHOST_USER_SLAVE_VRING_NOTIFY_AREA_MSG,
+		.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
+		.payload.area = {
+			.u64 = index & VHOST_USER_VRING_IDX_MASK,
+			.size = size,
+			.offset = offset,
+		},
+		.size = sizeof(msg.payload.area),
+	};
+
+	if (fd < 0)
+		msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
+	else {
+		fdp = &fd;
+		fd_num = 1;
+	}
+
+	ret = send_vhost_message(dev->slave_req_fd, &msg, fdp, fd_num);
+	if (ret < 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"Failed to set vring notify area (%d)\n", ret);
+		return ret;
+	}
+
+	return process_slave_message_reply(dev, &msg);
+}
+
+int __rte_experimental
+rte_vhost_vfio_accelerator_ctrl(int vid, int enable)
+{
+	struct virtio_net *dev = get_device(vid);
+	int groupfd, devicefd, eid, ret = 0;
+	struct rte_vdpa_eng_driver *drv;
+	struct vhost_vring_file file;
+	uint64_t offset, size;
+	unsigned int i;
+
+	if (!dev)
+		return -ENODEV;
+
+	eid = dev->eid;
+	if (eid < 0)
+		return -EINVAL;
+
+	if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ||
+	    !(dev->features & (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES)) ||
+	    !(dev->protocol_features &
+			(1ULL << RTE_VHOST_USER_PROTOCOL_F_VFIO)))
+		return -ENOTSUP;
+
+	drv = vdpa_engines[eid]->eng_drv;
+
+	RTE_FUNC_PTR_OR_ERR_RET(drv->dev_ops.get_vfio_device_fd, -ENOTSUP);
+	RTE_FUNC_PTR_OR_ERR_RET(drv->dev_ops.get_vfio_group_fd, -ENOTSUP);
+	RTE_FUNC_PTR_OR_ERR_RET(drv->dev_ops.get_notify_area, -ENOTSUP);
+
+	devicefd = drv->dev_ops.get_vfio_device_fd(vid);
+	if (devicefd < 0)
+		return -ENOTSUP;
+
+	groupfd = drv->dev_ops.get_vfio_group_fd(vid);
+	if (groupfd < 0)
+		return -ENOTSUP;
+
+	if (enable) {
+		for (i = 0; i < dev->nr_vring * 2; i++) {
+			file.index = i;
+			file.fd = groupfd;
+
+			if (drv->dev_ops.get_notify_area(vid, i, &offset,
+					&size) < 0) {
+				ret = -ENOTSUP;
+				goto disable;
+			}
+
+			if (vhost_user_slave_set_vring_file(dev,
+					VHOST_USER_SLAVE_VRING_VFIO_GROUP_MSG,
+					&file) < 0) {
+				ret = -EFAULT;
+				goto disable;
+			}
+			if (vhost_user_slave_set_vring_notify_area(dev, i,
+					devicefd, offset, size) < 0) {
+				ret = -EFAULT;
+				goto disable;
+			}
+		}
+	} else {
+disable:
+		for (i = 0; i < dev->nr_vring * 2; i++) {
+			file.index = i;
+			file.fd = -1;
+			vhost_user_slave_set_vring_file(dev,
+					VHOST_USER_SLAVE_VRING_VFIO_GROUP_MSG,
+					&file);
+			vhost_user_slave_set_vring_notify_area(dev, i, -1,
+					0, 0);
+		}
+	}
+
+	return ret;
+}
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index 066e772dd..c74d288d4 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -52,6 +52,8 @@  typedef enum VhostUserRequest {
 typedef enum VhostUserSlaveRequest {
 	VHOST_USER_SLAVE_NONE = 0,
 	VHOST_USER_SLAVE_IOTLB_MSG = 1,
+	VHOST_USER_SLAVE_VRING_VFIO_GROUP_MSG = 3,
+	VHOST_USER_SLAVE_VRING_NOTIFY_AREA_MSG = 4,
 	VHOST_USER_SLAVE_MAX
 } VhostUserSlaveRequest;
 
@@ -73,6 +75,12 @@  typedef struct VhostUserLog {
 	uint64_t mmap_offset;
 } VhostUserLog;
 
+typedef struct VhostUserVringArea {
+	uint64_t u64;
+	uint64_t size;
+	uint64_t offset;
+} VhostUserVringArea;
+
 typedef struct VhostUserMsg {
 	union {
 		uint32_t master; /* a VhostUserRequest value */
@@ -93,6 +101,7 @@  typedef struct VhostUserMsg {
 		VhostUserMemory memory;
 		VhostUserLog    log;
 		struct vhost_iotlb_msg iotlb;
+		VhostUserVringArea area;
 	} payload;
 	int fds[VHOST_MEMORY_MAX_NREGIONS];
 } __attribute((packed)) VhostUserMsg;