[dpdk-dev,10/12] lib/librte_vhost: vhost user support
Commit Message
In rte_vhost_driver_register(), vhost unix domain socket listener fd is created
and added to the selected fdset.
In rte_vhost_driver_session_start(), fds in the fdset are checked for processing.
If there is new connection on listener fd from qemu, connection fd accepted is
added to the selected fdset. The listener and connection fds in the fdset are
then both checked there. When there is message on the connection fd, its
callback vserver_message_handler is called to process the vhost messages.
To support identifying which virtio is from which guest VM, rte_vhost_driver_register
is allowed to be called multiple times to specify different socket path for different
virtio device. The socket path is then set in the virtio_net device.
Signed-off-by: Huawei Xie <huawei.xie@intel.com>
---
lib/librte_vhost/Makefile | 8 +-
lib/librte_vhost/rte_virtio_net.h | 2 +
lib/librte_vhost/vhost-net.h | 4 +-
lib/librte_vhost/vhost_user/vhost-net-user.c | 455 ++++++++++++++++++++++++++
lib/librte_vhost/vhost_user/vhost-net-user.h | 106 ++++++
lib/librte_vhost/vhost_user/virtio-net-user.c | 322 ++++++++++++++++++
lib/librte_vhost/vhost_user/virtio-net-user.h | 49 +++
lib/librte_vhost/virtio-net.c | 26 +-
8 files changed, 957 insertions(+), 15 deletions(-)
create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.c
create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.h
create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.c
create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.h
Comments
Hi Xie,
On 2015/01/30 15:36, Huawei Xie wrote:
> In rte_vhost_driver_register(), vhost unix domain socket listener fd is created
> and added to the selected fdset.
>
> In rte_vhost_driver_session_start(), fds in the fdset are checked for processing.
> If there is new connection on listener fd from qemu, connection fd accepted is
> added to the selected fdset. The listener and connection fds in the fdset are
> then both checked there. When there is message on the connection fd, its
> callback vserver_message_handler is called to process the vhost messages.
>
> To support identifying which virtio is from which guest VM, rte_vhost_driver_register
> is allowed to be called multiple times to specify different socket path for different
> virtio device. The socket path is then set in the virtio_net device.
>
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> ---
> lib/librte_vhost/Makefile | 8 +-
> lib/librte_vhost/rte_virtio_net.h | 2 +
> lib/librte_vhost/vhost-net.h | 4 +-
> lib/librte_vhost/vhost_user/vhost-net-user.c | 455 ++++++++++++++++++++++++++
> lib/librte_vhost/vhost_user/vhost-net-user.h | 106 ++++++
> lib/librte_vhost/vhost_user/virtio-net-user.c | 322 ++++++++++++++++++
> lib/librte_vhost/vhost_user/virtio-net-user.h | 49 +++
> lib/librte_vhost/virtio-net.c | 26 +-
> 8 files changed, 957 insertions(+), 15 deletions(-)
> create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.c
> create mode 100644 lib/librte_vhost/vhost_user/vhost-net-user.h
> create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.c
> create mode 100644 lib/librte_vhost/vhost_user/virtio-net-user.h
>
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index 92ab9a6..22319b8 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -34,10 +34,14 @@ include $(RTE_SDK)/mk/rte.vars.mk
> # library name
> LIB = librte_vhost.a
>
> -CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse
> +CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64
> +CFLAGS += -I vhost_cuse -lfuse
> +CFLAGS += -I vhost_user
> LDFLAGS += -lfuse
I have been rethinking the idea of an abstraction layer over vhost-user and cuse.
A few months ago, I thought that some users still used cuse, so we should
not obsolete the cuse implementation in DPDK-2.0.
Once the major Linux distributions adopt QEMU-2.1, we will be able to
obsolete it.
As long as we need to maintain cuse and vhost-user in parallel, I figured an
abstraction layer would be good.
And now we have your vhost implementation.
With your implementation, we can easily choose between vhost-user and
cuse in the Makefile.
It only takes changing a few lines.
Even if we implemented the abstraction, we would still need to change the
parameters of the functions below.
- int rte_vhost_driver_register();
- int rte_vhost_driver_session_start();
So now, the only advantage of the abstraction is that we could use
vhost-user and cuse at the same time.
I guess not many users want to use vhost that way,
so I think the abstraction is no longer needed.
We can probably say that your implementation already provides some kind
of abstraction.
Anyway, how about adding a new config option like the one below?
In "config/common_linux":
CONFIG_RTE_LIBRTE_VHOST_CUSE=n
Then check it in the Makefile like this:
-----------------------------------------------
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64
ifeq ($(CONFIG_RTE_LIBRTE_VHOST_CUSE), y)
CFLAGS += -I vhost_cuse -lfuse
LDFLAGS += -lfuse
SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c vhost_cuse/eventfd_copy.c
else
CFLAGS += -I vhost_user
SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c
endif
# all source are stored in SRCS-y
SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += virtio-net.c vhost_rxtx.c
-----------------------------------------------
Then, once cuse is obsoleted, we can simply remove this option and the cuse files.
What do you think?
Thanks,
Tetsuya
> # all source are stored in SRCS-y
> -SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c vhost_cuse/eventfd_copy.c virtio-net.c vhost_rxtx.c
> +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := virtio-net.c vhost_rxtx.c
> +#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c vhost_cuse/eventfd_copy.c
> +SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c
>
> # install includes
> SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
> diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
> index 0bf07c7..46c2072 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -50,6 +50,8 @@
> #include <rte_mempool.h>
> #include <rte_mbuf.h>
>
> +#define VHOST_MEMORY_MAX_NREGIONS 8
> +
> /* Used to indicate that the device is running on a data core */
> #define VIRTIO_DEV_RUNNING 1
>
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index 11737cc..94b359f 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -41,7 +41,9 @@
>
> #include <rte_log.h>
>
> -#define VHOST_MEMORY_MAX_NREGIONS 8
> +#include "rte_virtio_net.h"
> +
> +extern struct vhost_net_device_ops const *ops;
>
> /* Macros for printing using RTE_LOG */
> #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c b/lib/librte_vhost/vhost_user/vhost-net-user.c
> new file mode 100644
> index 0000000..ff83511
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
> @@ -0,0 +1,455 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <limits.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <string.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +#include <errno.h>
> +
> +#include <rte_log.h>
> +#include <rte_virtio_net.h>
> +
> +#include "fd_man.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +#include "virtio-net-user.h"
> +
> +static void vserver_new_vq_conn(int fd, void *data);
> +static void vserver_message_handler(int fd, void *dat);
> +struct vhost_net_device_ops const *ops;
> +
> +struct connfd_ctx {
> + struct vhost_server *vserver;
> + uint32_t fh;
> +};
> +
> +#define MAX_VHOST_SERVER 1024
> +static struct {
> + struct vhost_server *server[MAX_VHOST_SERVER];
> + struct fdset fdset; /**< The fd list this vhost server manages. */
> +} g_vhost_server;
> +
> +static int vserver_idx;
> +
> +static const char *vhost_message_str[VHOST_USER_MAX] = {
> + [VHOST_USER_NONE] = "VHOST_USER_NONE",
> + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
> + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
> + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
> + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
> + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
> + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
> + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
> + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
> + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
> + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
> + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
> + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
> + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
> + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR"
> +};
> +
> +/**
> + * Create a unix domain socket, bind to path and listen for connection.
> + * @return
> + * socket fd or -1 on failure
> + */
> +static int
> +uds_socket(const char *path)
> +{
> + struct sockaddr_un un;
> + int sockfd;
> + int ret;
> +
> + if (path == NULL)
> + return -1;
> +
> + sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
> + if (sockfd < 0)
> + return -1;
> + RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
> +
> + memset(&un, 0, sizeof(un));
> + un.sun_family = AF_UNIX;
> + snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
> + ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
> + if (ret == -1)
> + goto err;
> + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
> +
> + ret = listen(sockfd, 10);
> + if (ret == -1)
> + goto err;
> +
> + return sockfd;
> +
> +err:
> + close(sockfd);
> + return -1;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> + struct iovec iov;
> + struct msghdr msgh = { 0 };
> + size_t fdsize = fd_num * sizeof(int);
> + char control[CMSG_SPACE(fdsize)];
> + struct cmsghdr *cmsg;
> + int ret;
> +
> + iov.iov_base = buf;
> + iov.iov_len = buflen;
> +
> + msgh.msg_iov = &iov;
> + msgh.msg_iovlen = 1;
> + msgh.msg_control = control;
> + msgh.msg_controllen = sizeof(control);
> +
> + ret = recvmsg(sockfd, &msgh, 0);
> + if (ret <= 0) {
> + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
> + return ret;
> + }
> +
> + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
> + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
> + return -1;
> + }
> +
> + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
> + cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
> + if ((cmsg->cmsg_level == SOL_SOCKET) &&
> + (cmsg->cmsg_type == SCM_RIGHTS)) {
> + memcpy(fds, CMSG_DATA(cmsg), fdsize);
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> +/* return bytes# of read on success or negative val on failure. */
> +static int
> +read_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> + int ret;
> +
> + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
> + msg->fds, VHOST_MEMORY_MAX_NREGIONS);
> + if (ret <= 0)
> + return ret;
> +
> + if (msg && msg->size) {
> + if (msg->size > sizeof(msg->payload)) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "invalid msg size: %d\n", msg->size);
> + return -1;
> + }
> + ret = read(sockfd, &msg->payload, msg->size);
> + if (ret <= 0)
> + return ret;
> + if (ret != (int)msg->size) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "read control message failed\n");
> + return -1;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static int
> +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
> +{
> +
> + struct iovec iov;
> + struct msghdr msgh = { 0 };
> + size_t fdsize = fd_num * sizeof(int);
> + char control[CMSG_SPACE(fdsize)];
> + struct cmsghdr *cmsg;
> + int ret;
> +
> + iov.iov_base = buf;
> + iov.iov_len = buflen;
> +
> + msgh.msg_iov = &iov;
> + msgh.msg_iovlen = 1;
> +
> + if (fds && fd_num > 0) {
> + msgh.msg_control = control;
> + msgh.msg_controllen = sizeof(control);
> + cmsg = CMSG_FIRSTHDR(&msgh);
> + cmsg->cmsg_len = CMSG_LEN(fdsize);
> + cmsg->cmsg_level = SOL_SOCKET;
> + cmsg->cmsg_type = SCM_RIGHTS;
> + memcpy(CMSG_DATA(cmsg), fds, fdsize);
> + } else {
> + msgh.msg_control = NULL;
> + msgh.msg_controllen = 0;
> + }
> +
> + do {
> + ret = sendmsg(sockfd, &msgh, 0);
> + } while (ret < 0 && errno == EINTR);
> +
> + if (ret < 0) {
> + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n");
> + return ret;
> + }
> +
> + return ret;
> +}
> +
> +static int
> +send_vhost_message(int sockfd, struct VhostUserMsg *msg)
> +{
> + int ret;
> +
> + if (!msg)
> + return 0;
> +
> + msg->flags &= ~VHOST_USER_VERSION_MASK;
> + msg->flags |= VHOST_USER_VERSION;
> + msg->flags |= VHOST_USER_REPLY_MASK;
> +
> + ret = send_fd_message(sockfd, (char *)msg,
> + VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
> +
> + return ret;
> +}
> +
> +/* call back when there is new virtio connection. */
> +static void
> +vserver_new_vq_conn(int fd, void *dat)
> +{
> + struct vhost_server *vserver = (struct vhost_server *)dat;
> + int conn_fd;
> + struct connfd_ctx *ctx;
> + int fh;
> + struct vhost_device_ctx vdev_ctx = { 0 };
> +
> + conn_fd = accept(fd, NULL, NULL);
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "new virtio connection is %d\n", conn_fd);
> + if (conn_fd < 0)
> + return;
> +
> + ctx = calloc(1, sizeof(*ctx));
> + if (ctx == NULL) {
> + close(conn_fd);
> + return;
> + }
> +
> + fh = ops->new_device(vdev_ctx);
> + if (fh == -1) {
> + free(ctx);
> + close(conn_fd);
> + return;
> + }
> + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", fh);
> +
> + ctx->vserver = vserver;
> + ctx->fh = fh;
> + fdset_add(&g_vhost_server.fdset,
> + conn_fd, vserver_message_handler, NULL, ctx);
> +}
> +
> +/* callback when there is message on the connfd */
> +static void
> +vserver_message_handler(int connfd, void *dat)
> +{
> + struct vhost_device_ctx ctx;
> + struct connfd_ctx *cfd_ctx = (struct connfd_ctx *)dat;
> + struct VhostUserMsg msg;
> + uint64_t features;
> + int ret;
> +
> + ctx.fh = cfd_ctx->fh;
> + ret = read_vhost_message(connfd, &msg);
> + if (ret < 0) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "vhost read message failed\n");
> +
> + close(connfd);
> + fdset_del(&g_vhost_server.fdset, connfd);
> + free(cfd_ctx);
> + user_destroy_device(ctx);
> + ops->destroy_device(ctx);
> +
> + return;
> + } else if (ret == 0) {
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "vhost peer closed\n");
> +
> + close(connfd);
> + fdset_del(&g_vhost_server.fdset, connfd);
> + free(cfd_ctx);
> + user_destroy_device(ctx);
> + ops->destroy_device(ctx);
> +
> + return;
> + }
> + if (msg.request > VHOST_USER_MAX) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "vhost read incorrect message\n");
> +
> + close(connfd);
> + fdset_del(&g_vhost_server.fdset, connfd);
> + free(cfd_ctx);
> + user_destroy_device(ctx);
> + ops->destroy_device(ctx);
> +
> + return;
> + }
> +
> + RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
> + vhost_message_str[msg.request]);
> + switch (msg.request) {
> + case VHOST_USER_GET_FEATURES:
> + ret = ops->get_features(ctx, &features);
> + msg.payload.u64 = features;
> + msg.size = sizeof(msg.payload.u64);
> + send_vhost_message(connfd, &msg);
> + break;
> + case VHOST_USER_SET_FEATURES:
> + features = msg.payload.u64;
> + ops->set_features(ctx, &features);
> + break;
> +
> + case VHOST_USER_SET_OWNER:
> + ops->set_owner(ctx);
> + break;
> + case VHOST_USER_RESET_OWNER:
> + ops->reset_owner(ctx);
> + break;
> +
> + case VHOST_USER_SET_MEM_TABLE:
> + user_set_mem_table(ctx, &msg);
> + break;
> +
> + case VHOST_USER_SET_LOG_BASE:
> + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
> + case VHOST_USER_SET_LOG_FD:
> + close(msg.fds[0]);
> + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
> + break;
> +
> + case VHOST_USER_SET_VRING_NUM:
> + ops->set_vring_num(ctx, &msg.payload.state);
> + break;
> + case VHOST_USER_SET_VRING_ADDR:
> + ops->set_vring_addr(ctx, &msg.payload.addr);
> + break;
> + case VHOST_USER_SET_VRING_BASE:
> + ops->set_vring_base(ctx, &msg.payload.state);
> + break;
> +
> + case VHOST_USER_GET_VRING_BASE:
> + ret = user_get_vring_base(ctx, &msg.payload.state);
> + msg.size = sizeof(msg.payload.state);
> + send_vhost_message(connfd, &msg);
> + break;
> +
> + case VHOST_USER_SET_VRING_KICK:
> + user_set_vring_kick(ctx, &msg);
> + break;
> + case VHOST_USER_SET_VRING_CALL:
> + user_set_vring_call(ctx, &msg);
> + break;
> +
> + case VHOST_USER_SET_VRING_ERR:
> + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
> + close(msg.fds[0]);
> + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
> + break;
> +
> + default:
> + break;
> +
> + }
> +}
> +
> +
> +/**
> + * Creates and initialise the vhost server.
> + */
> +int
> +rte_vhost_driver_register(const char *path)
> +{
> + struct vhost_server *vserver;
> +
> + if (vserver_idx == 0) {
> + fdset_init(&g_vhost_server.fdset);
> + ops = get_virtio_net_callbacks();
> + }
> + if (vserver_idx == MAX_VHOST_SERVER)
> + return -1;
> +
> + vserver = calloc(sizeof(struct vhost_server), 1);
> + if (vserver == NULL)
> + return -1;
> +
> + unlink(path);
> +
> + vserver->listenfd = uds_socket(path);
> + if (vserver->listenfd < 0) {
> + free(vserver);
> + return -1;
> + }
> + vserver->path = path;
> +
> + fdset_add(&g_vhost_server.fdset, vserver->listenfd,
> + vserver_new_vq_conn, NULL,
> + vserver);
> +
> + g_vhost_server.server[vserver_idx++] = vserver;
> +
> + return 0;
> +}
> +
> +
> +int
> +rte_vhost_driver_session_start(void)
> +{
> + fdset_event_dispatch(&g_vhost_server.fdset);
> + return 0;
> +}
> +
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
> new file mode 100644
> index 0000000..e2a91a9
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
> @@ -0,0 +1,106 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VHOST_NET_USER_H
> +#define _VHOST_NET_USER_H
> +
> +#include <stdint.h>
> +#include <linux/vhost.h>
> +
> +#include "rte_virtio_net.h"
> +#include "fd_man.h"
> +
> +struct vhost_server {
> + const char *path; /**< The path the uds is bind to. */
> + int listenfd; /**< The listener sockfd. */
> +};
> +
> +/* refer to hw/virtio/vhost-user.c */
> +
> +typedef enum VhostUserRequest {
> + VHOST_USER_NONE = 0,
> + VHOST_USER_GET_FEATURES = 1,
> + VHOST_USER_SET_FEATURES = 2,
> + VHOST_USER_SET_OWNER = 3,
> + VHOST_USER_RESET_OWNER = 4,
> + VHOST_USER_SET_MEM_TABLE = 5,
> + VHOST_USER_SET_LOG_BASE = 6,
> + VHOST_USER_SET_LOG_FD = 7,
> + VHOST_USER_SET_VRING_NUM = 8,
> + VHOST_USER_SET_VRING_ADDR = 9,
> + VHOST_USER_SET_VRING_BASE = 10,
> + VHOST_USER_GET_VRING_BASE = 11,
> + VHOST_USER_SET_VRING_KICK = 12,
> + VHOST_USER_SET_VRING_CALL = 13,
> + VHOST_USER_SET_VRING_ERR = 14,
> + VHOST_USER_MAX
> +} VhostUserRequest;
> +
> +typedef struct VhostUserMemoryRegion {
> + uint64_t guest_phys_addr;
> + uint64_t memory_size;
> + uint64_t userspace_addr;
> + uint64_t mmap_offset;
> +} VhostUserMemoryRegion;
> +
> +typedef struct VhostUserMemory {
> + uint32_t nregions;
> + uint32_t padding;
> + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
> +} VhostUserMemory;
> +
> +typedef struct VhostUserMsg {
> + VhostUserRequest request;
> +
> +#define VHOST_USER_VERSION_MASK (0x3)
> +#define VHOST_USER_REPLY_MASK (0x1 << 2)
> + uint32_t flags;
> + uint32_t size; /* the following payload size */
> + union {
> +#define VHOST_USER_VRING_IDX_MASK (0xff)
> +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
> + uint64_t u64;
> + struct vhost_vring_state state;
> + struct vhost_vring_addr addr;
> + VhostUserMemory memory;
> + } payload;
> + int fds[VHOST_MEMORY_MAX_NREGIONS];
> +} __attribute((packed)) VhostUserMsg;
> +
> +#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64)
> +
> +/* The version of the protocol we support */
> +#define VHOST_USER_VERSION (0x1)
> +
> +/*****************************************************************************/
> +#endif
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> new file mode 100644
> index 0000000..8e6d580
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -0,0 +1,322 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <unistd.h>
> +
> +#include <rte_common.h>
> +#include <rte_log.h>
> +
> +#include "virtio-net.h"
> +#include "virtio-net-user.h"
> +#include "vhost-net-user.h"
> +#include "vhost-net.h"
> +
> +struct orig_region_map {
> + int fd;
> + uint64_t mapped_address;
> + uint64_t mapped_size;
> + uint64_t blksz;
> +};
> +
> +#define orig_region(ptr, nregions) ((struct orig_region_map *)RTE_PTR_ADD(ptr, \
> + sizeof(struct virtio_memory) + \
> + sizeof(struct virtio_memory_regions) * (nregions)))
> +
> +static uint64_t
> +get_blk_size(int fd)
> +{
> + struct stat stat;
> +
> + fstat(fd, &stat);
> + return (uint64_t)stat.st_blksize;
> +}
> +
> +static void
> +free_mem_region(struct virtio_net *dev)
> +{
> + struct orig_region_map *region;
> + unsigned int idx;
> + uint64_t alignment;
> +
> + if (!dev || !dev->mem)
> + return;
> +
> + region = orig_region(dev->mem, dev->mem->nregions);
> + for (idx = 0; idx < dev->mem->nregions; idx++) {
> + if (region[idx].mapped_address) {
> + alignment = region[idx].blksz;
> + munmap((void *)
> + RTE_ALIGN_FLOOR(
> + region[idx].mapped_address, alignment),
> + RTE_ALIGN_CEIL(
> + region[idx].mapped_size, alignment));
> + close(region[idx].fd);
> + }
> + }
> +}
> +
> +int
> +user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> + struct VhostUserMemory memory = pmsg->payload.memory;
> + struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
> + uint64_t mapped_address, mapped_size, base_address = 0;
> + struct virtio_net *dev;
> + unsigned int idx = 0;
> + struct orig_region_map tmp[VHOST_MEMORY_MAX_NREGIONS] = {
> + [0 ... VHOST_MEMORY_MAX_NREGIONS - 1] = { 0 } };
> + struct orig_region_map *region;
> + uint64_t alignment;
> +
> + /* unmap old memory regions one by one*/
> + dev = get_device(ctx);
> + if (dev && dev->mem) {
> + free_mem_region(dev);
> + free(dev->mem);
> + dev->mem = NULL;
> + }
> +
> + for (idx = 0; idx < memory.nregions; idx++) {
> + if (memory.regions[idx].guest_phys_addr == 0)
> + base_address = memory.regions[idx].userspace_addr;
> + }
> + if (base_address == 0) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "couldn't find the mem region whose GPA is 0.\n");
> + return -1;
> + }
> +
> + for (idx = 0; idx < memory.nregions; idx++) {
> + regions[idx].guest_phys_address =
> + memory.regions[idx].guest_phys_addr;
> + regions[idx].guest_phys_address_end =
> + memory.regions[idx].guest_phys_addr +
> + memory.regions[idx].memory_size;
> + regions[idx].memory_size = memory.regions[idx].memory_size;
> + regions[idx].userspace_address =
> + memory.regions[idx].userspace_addr;
> +
> + /* This is ugly */
> + mapped_size = regions[idx].memory_size +
> + memory.regions[idx].mmap_offset;
> + mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
> + mapped_size,
> + PROT_READ | PROT_WRITE, MAP_SHARED,
> + pmsg->fds[idx],
> + 0);
> +
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "mapped region %d fd:%d to %p sz:0x%"PRIx64" off:0x%"PRIx64"\n",
> + idx, pmsg->fds[idx], (void *)mapped_address,
> + mapped_size, memory.regions[idx].mmap_offset);
> +
> + if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
> + RTE_LOG(ERR, VHOST_CONFIG,
> + "mmap qemu guest failed.\n");
> + goto err;
> + }
> +
> + tmp[idx].mapped_address = mapped_address;
> + tmp[idx].mapped_size = mapped_size;
> + tmp[idx].blksz = get_blk_size(pmsg->fds[idx]);
> + tmp[idx].fd = pmsg->fds[idx];
> +
> + mapped_address += memory.regions[idx].mmap_offset;
> +
> + regions[idx].address_offset = mapped_address -
> + regions[idx].guest_phys_address;
> + LOG_DEBUG(VHOST_CONFIG,
> + "REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
> + idx,
> + (void *)(uintptr_t)regions[idx].guest_phys_address,
> + (void *)(uintptr_t)regions[idx].userspace_address,
> + regions[idx].memory_size);
> + }
> +
> + ops->set_mem_table(ctx, regions, memory.nregions);
> +
> + if (dev->mem) {
> + void *tmp_mem;
> + tmp_mem = realloc(dev->mem,
> + sizeof(struct virtio_memory) +
> + sizeof(struct virtio_memory_regions) * memory.nregions +
> + sizeof(struct orig_region_map) * memory.nregions);
> + if (tmp_mem == NULL)
> + goto err_realloc;
> +
> + dev->mem = tmp_mem;
> + region = orig_region(dev->mem, memory.nregions);
> + for (idx = 0; idx < memory.nregions; idx++) {
> + region[idx].mapped_address = tmp[idx].mapped_address;
> + region[idx].mapped_size = tmp[idx].mapped_size;
> + region[idx].blksz = tmp[idx].blksz;
> + region[idx].fd = tmp[idx].fd;
> + }
> + } else
> + goto err_set_mem_table;
> +
> + return 0;
> +
> +err_realloc:
> + free(dev->mem);
> +err_set_mem_table:
> +err:
> + while (idx--) {
> + alignment = tmp[idx].blksz;
> + munmap((void *)RTE_ALIGN_FLOOR(
> + tmp[idx].mapped_address, alignment),
> + RTE_ALIGN_CEIL(tmp[idx].mapped_size, alignment));
> + close(tmp[idx].fd);
> + }
> + dev->mem = NULL;
> + return -1;
> +}
> +
> +static int
> +virtio_is_ready(struct virtio_net *dev)
> +{
> + struct vhost_virtqueue *rvq, *tvq;
> +
> + /* mq support in future.*/
> + rvq = dev->virtqueue[VIRTIO_RXQ];
> + tvq = dev->virtqueue[VIRTIO_TXQ];
> + if (rvq && tvq && rvq->desc && tvq->desc &&
> + (rvq->kickfd != (eventfd_t)-1) &&
> + (rvq->callfd != (eventfd_t)-1) &&
> + (tvq->kickfd != (eventfd_t)-1) &&
> + (tvq->callfd != (eventfd_t)-1)) {
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "virtio is now ready for processing.\n");
> + return 1;
> + }
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "virtio isn't ready for processing.\n");
> + return 0;
> +}
> +
> +void
> +user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> + struct vhost_vring_file file;
> +
> + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
> + file.fd = -1;
> + else
> + file.fd = pmsg->fds[0];
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "vring call idx:%d file:%d\n", file.index, file.fd);
> + ops->set_vring_call(ctx, &file);
> +}
> +
> +
> +/*
> + * In vhost-user, when we receive kick message, will test whether virtio
> + * device is ready for packet processing.
> + */
> +void
> +user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> +{
> + struct vhost_vring_file file;
> + struct virtio_net *dev = get_device(ctx);
> +
> + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
> + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
> + file.fd = -1;
> + else
> + file.fd = pmsg->fds[0];
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "vring kick idx:%d file:%d\n", file.index, file.fd);
> + ops->set_vring_kick(ctx, &file);
> +
> + if (virtio_is_ready(dev) &&
> + !(dev->flags & VIRTIO_DEV_RUNNING))
> + notify_ops->new_device(dev);
> +}
> +
> +/*
> + * when virtio is stopped, qemu will send us the GET_VRING_BASE message.
> + */
> +int
> +user_get_vring_base(struct vhost_device_ctx ctx,
> + struct vhost_vring_state *state)
> +{
> + struct virtio_net *dev = get_device(ctx);
> +
> + /* We have to stop the queue (virtio) if it is running. */
> + if (dev->flags & VIRTIO_DEV_RUNNING)
> + notify_ops->destroy_device(dev);
> +
> + /* Here we are safe to get the last used index */
> + ops->get_vring_base(ctx, state->index, state);
> +
> + RTE_LOG(INFO, VHOST_CONFIG,
> + "vring base idx:%d file:%d\n", state->index, state->num);
> + /*
> + * Based on current qemu vhost-user implementation, this message is
> + * sent and only sent in vhost_vring_stop.
> + * TODO: cleanup the vring, it isn't usable since here.
> + */
> + if (((int)dev->virtqueue[VIRTIO_RXQ]->callfd) >= 0) {
> + close(dev->virtqueue[VIRTIO_RXQ]->callfd);
> + dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
> + }
> + if (((int)dev->virtqueue[VIRTIO_TXQ]->callfd) >= 0) {
> + close(dev->virtqueue[VIRTIO_TXQ]->callfd);
> + dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
> + }
> +
> + return 0;
> +}
> +
> +void
> +user_destroy_device(struct vhost_device_ctx ctx)
> +{
> + struct virtio_net *dev = get_device(ctx);
> +
> + if (dev && (dev->flags & VIRTIO_DEV_RUNNING))
> + notify_ops->destroy_device(dev);
> +
> + if (dev && dev->mem) {
> + free_mem_region(dev);
> + free(dev->mem);
> + dev->mem = NULL;
> + }
> +}
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h b/lib/librte_vhost/vhost_user/virtio-net-user.h
> new file mode 100644
> index 0000000..df24860
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
> @@ -0,0 +1,49 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _VIRTIO_NET_USER_H
> +#define _VIRTIO_NET_USER_H
> +
> +#include "vhost-net.h"
> +#include "vhost-net-user.h"
> +
> +int user_set_mem_table(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_call(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +void user_set_vring_kick(struct vhost_device_ctx, struct VhostUserMsg *);
> +
> +int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
> +
> +void user_destroy_device(struct vhost_device_ctx);
> +#endif
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index 57a5801..c458ed9 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -50,6 +50,7 @@
> #include <rte_virtio_net.h>
>
> #include "vhost-net.h"
> +#include "virtio-net.h"
>
> /*
> * Device linked list structure for configuration.
> @@ -60,7 +61,7 @@ struct virtio_net_config_ll {
> };
>
> /* device ops to add/remove device to/from data core. */
> -static struct virtio_net_device_ops const *notify_ops;
> +struct virtio_net_device_ops const *notify_ops;
> /* root address of the linked list of managed virtio devices */
> static struct virtio_net_config_ll *ll_root;
>
> @@ -88,8 +89,9 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
> if ((qemu_va >= region->userspace_address) &&
> (qemu_va <= region->userspace_address +
> region->memory_size)) {
> - vhost_va = dev->mem->mapped_address + qemu_va -
> - dev->mem->base_address;
> + vhost_va = qemu_va + region->guest_phys_address +
> + region->address_offset -
> + region->userspace_address;
> break;
> }
> }
> @@ -119,7 +121,7 @@ get_config_ll_entry(struct vhost_device_ctx ctx)
> * Searches the configuration core linked list and
> * retrieves the device if it exists.
> */
> -static struct virtio_net *
> +struct virtio_net *
> get_device(struct vhost_device_ctx ctx)
> {
> struct virtio_net_config_ll *ll_dev;
> @@ -256,6 +258,11 @@ init_device(struct virtio_net *dev)
> memset(dev->virtqueue[VIRTIO_RXQ], 0, sizeof(struct vhost_virtqueue));
> memset(dev->virtqueue[VIRTIO_TXQ], 0, sizeof(struct vhost_virtqueue));
>
> + dev->virtqueue[VIRTIO_RXQ]->kickfd = (eventfd_t)-1;
> + dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
> + dev->virtqueue[VIRTIO_TXQ]->kickfd = (eventfd_t)-1;
> + dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
> +
> /* Backends are set to -1 indicating an inactive device. */
> dev->virtqueue[VIRTIO_RXQ]->backend = VIRTIO_DEV_STOPPED;
> dev->virtqueue[VIRTIO_TXQ]->backend = VIRTIO_DEV_STOPPED;
> @@ -455,12 +462,6 @@ set_mem_table(struct vhost_device_ctx ctx,
> if (dev == NULL)
> return -1;
>
> - if (dev->mem) {
> - munmap((void *)(uintptr_t)dev->mem->mapped_address,
> - (size_t)dev->mem->mapped_size);
> - free(dev->mem);
> - }
> -
> /* Malloc the memory structure depending on the number of regions. */
> mem = calloc(1, sizeof(struct virtio_memory) +
> (sizeof(struct virtio_memory_regions) * nregions));
> @@ -624,7 +625,7 @@ set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file)
> /* file->index refers to the queue index. The txq is 1, rxq is 0. */
> vq = dev->virtqueue[file->index];
>
> - if (vq->kickfd)
> + if ((int)vq->kickfd >= 0)
> close((int)vq->kickfd);
>
> vq->kickfd = file->fd;
> @@ -650,8 +651,9 @@ set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file)
> /* file->index refers to the queue index. The txq is 1, rxq is 0. */
> vq = dev->virtqueue[file->index];
>
> - if (vq->callfd)
> + if ((int)vq->callfd >= 0)
> close((int)vq->callfd);
> +
> vq->callfd = file->fd;
>
> return 0;
@@ -34,10 +34,14 @@ include $(RTE_SDK)/mk/rte.vars.mk
# library name
LIB = librte_vhost.a
-CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -I vhost_cuse -O3 -D_FILE_OFFSET_BITS=64 -lfuse
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -D_FILE_OFFSET_BITS=64
+CFLAGS += -I vhost_cuse -lfuse
+CFLAGS += -I vhost_user
LDFLAGS += -lfuse
# all source are stored in SRCS-y
-SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c vhost_cuse/eventfd_copy.c virtio-net.c vhost_rxtx.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := virtio-net.c vhost_rxtx.c
+#SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_cuse/vhost-net-cdev.c vhost_cuse/virtio-net-cdev.c vhost_cuse/eventfd_copy.c
+SRCS-$(CONFIG_RTE_LIBRTE_VHOST) += vhost_user/vhost-net-user.c vhost_user/virtio-net-user.c vhost_user/fd_man.c
# install includes
SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_virtio_net.h
@@ -50,6 +50,8 @@
#include <rte_mempool.h>
#include <rte_mbuf.h>
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
/* Used to indicate that the device is running on a data core */
#define VIRTIO_DEV_RUNNING 1
@@ -41,7 +41,9 @@
#include <rte_log.h>
-#define VHOST_MEMORY_MAX_NREGIONS 8
+#include "rte_virtio_net.h"
+
+extern struct vhost_net_device_ops const *ops;
/* Macros for printing using RTE_LOG */
#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
new file mode 100644
@@ -0,0 +1,455 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <errno.h>
+
+#include <rte_log.h>
+#include <rte_virtio_net.h>
+
+#include "fd_man.h"
+#include "vhost-net-user.h"
+#include "vhost-net.h"
+#include "virtio-net-user.h"
+
+/* fdset callbacks: new connection on a listener fd / message on a connection fd. */
+static void vserver_new_vq_conn(int fd, void *data);
+static void vserver_message_handler(int fd, void *dat);
+/* Device operation table; set by the first rte_vhost_driver_register() call. */
+struct vhost_net_device_ops const *ops;
+
+/* Per-connection context handed to vserver_message_handler() as its data. */
+struct connfd_ctx {
+ struct vhost_server *vserver; /* server whose listener accepted the connection */
+ uint32_t fh; /* device handle returned by ops->new_device() */
+};
+
+/* Upper bound on the number of sockets that can be registered. */
+#define MAX_VHOST_SERVER 1024
+/* All registered servers plus the single fd set shared by every one of them. */
+static struct {
+ struct vhost_server *server[MAX_VHOST_SERVER];
+ struct fdset fdset; /**< The fd list this vhost server manages. */
+} g_vhost_server;
+
+/* Number of servers registered so far; also the next free slot in server[]. */
+static int vserver_idx;
+
+/*
+ * Printable name of each vhost-user request, indexed by VhostUserRequest
+ * value. Used only for logging in vserver_message_handler(). Valid indexes
+ * are 0 .. VHOST_USER_MAX - 1.
+ */
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+ [VHOST_USER_NONE] = "VHOST_USER_NONE",
+ [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+ [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+ [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+ [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+ [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+ [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+ [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+ [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+ [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+ [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+ [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+ [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+ [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+ [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR"
+};
+
+/**
+ * Create a unix domain stream socket, bind it to path and start listening.
+ *
+ * @param path
+ *  Filesystem path to bind to. Must be non-NULL and must fit, terminating
+ *  NUL included, in sockaddr_un.sun_path.
+ * @return
+ *  The listening socket fd, or -1 on failure.
+ */
+static int
+uds_socket(const char *path)
+{
+	struct sockaddr_un un;
+	int sockfd;
+	int ret;
+
+	if (path == NULL)
+		return -1;
+
+	/*
+	 * Build the address before creating the socket: a path that does not
+	 * fit would otherwise be truncated silently and the socket bound to a
+	 * different name than the one the caller asked for.
+	 */
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	ret = snprintf(un.sun_path, sizeof(un.sun_path), "%s", path);
+	if (ret < 0 || (size_t)ret >= sizeof(un.sun_path)) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"socket path %s is too long\n", path);
+		return -1;
+	}
+
+	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sockfd < 0)
+		return -1;
+	RTE_LOG(INFO, VHOST_CONFIG, "socket created, fd:%d\n", sockfd);
+
+	ret = bind(sockfd, (struct sockaddr *)&un, sizeof(un));
+	if (ret == -1) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to bind fd:%d to %s: %s\n",
+			sockfd, path, strerror(errno));
+		goto err;
+	}
+	RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+	ret = listen(sockfd, 10);
+	if (ret == -1) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"failed to listen on fd:%d: %s\n",
+			sockfd, strerror(errno));
+		goto err;
+	}
+
+	return sockfd;
+
+err:
+	close(sockfd);
+	return -1;
+}
+
+/*
+ * Receive up to buflen bytes into buf, together with any file descriptors
+ * the peer attached as SCM_RIGHTS ancillary data.
+ *
+ * fds must have room for fd_num entries. Every entry is first set to -1 and
+ * only the descriptors actually received are stored, so the caller can tell
+ * a passed fd from an absent one.
+ *
+ * Returns the number of bytes read on success, the non-positive recvmsg()
+ * result on failure or peer close, or -1 if the message was truncated.
+ */
+static int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+	struct iovec iov;
+	struct msghdr msgh = { 0 };
+	size_t fdsize = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct cmsghdr *cmsg;
+	size_t rcvd;
+	int ret;
+	int i;
+
+	/* Slots the peer does not fill must not look like valid descriptors. */
+	for (i = 0; i < fd_num; i++)
+		fds[i] = -1;
+
+	iov.iov_base = buf;
+	iov.iov_len = buflen;
+
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	msgh.msg_control = control;
+	msgh.msg_controllen = sizeof(control);
+
+	ret = recvmsg(sockfd, &msgh, 0);
+	if (ret <= 0) {
+		RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n");
+		return ret;
+	}
+
+	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+		RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
+		return -1;
+	}
+
+	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+			(cmsg->cmsg_type == SCM_RIGHTS)) {
+			/*
+			 * Copy only the descriptors that were received,
+			 * never more than the caller's array can hold.
+			 */
+			rcvd = cmsg->cmsg_len - CMSG_LEN(0);
+			if (rcvd > fdsize)
+				rcvd = fdsize;
+			memcpy(fds, CMSG_DATA(cmsg), rcvd);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Read one vhost-user message from sockfd into msg: first the fixed-size
+ * header (with any passed fds stored in msg->fds), then msg->size bytes of
+ * payload.
+ *
+ * msg must not be NULL.
+ *
+ * Returns a positive byte count on success (the header size when there is
+ * no payload, otherwise the payload size), 0 when the peer closed the
+ * connection, or a negative value on failure.
+ */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+	int ret;
+
+	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+		msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+	if (ret <= 0)
+		return ret;
+
+	/* A short header leaves request/flags/size partly unset: reject it. */
+	if (ret != (int)VHOST_USER_HDR_SIZE) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"incomplete msg header: %d bytes\n", ret);
+		return -1;
+	}
+
+	if (msg->size) {
+		/* size comes from the peer; never read past the payload union. */
+		if (msg->size > sizeof(msg->payload)) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"invalid msg size: %d\n", msg->size);
+			return -1;
+		}
+		ret = read(sockfd, &msg->payload, msg->size);
+		if (ret <= 0)
+			return ret;
+		if (ret != (int)msg->size) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"read control message failed\n");
+			return -1;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Send buflen bytes from buf on sockfd. When fds is non-NULL and fd_num is
+ * positive, the descriptors are attached as SCM_RIGHTS ancillary data.
+ * The send is retried while it is interrupted by a signal.
+ *
+ * Returns the sendmsg() result: bytes sent, or a negative value on error.
+ */
+static int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+	size_t fdsize = fd_num * sizeof(int);
+	char control[CMSG_SPACE(fdsize)];
+	struct msghdr msgh = { 0 };
+	struct cmsghdr *cmsg;
+	struct iovec iov;
+	int ret;
+
+	iov.iov_base = buf;
+	iov.iov_len = buflen;
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+
+	/* Without fds the control fields keep their zero-initialised values. */
+	msgh.msg_control = NULL;
+	msgh.msg_controllen = 0;
+	if (fds && fd_num > 0) {
+		msgh.msg_control = control;
+		msgh.msg_controllen = sizeof(control);
+		cmsg = CMSG_FIRSTHDR(&msgh);
+		cmsg->cmsg_len = CMSG_LEN(fdsize);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		memcpy(CMSG_DATA(cmsg), fds, fdsize);
+	}
+
+	for (;;) {
+		ret = sendmsg(sockfd, &msgh, 0);
+		if (ret >= 0 || errno != EINTR)
+			break;
+	}
+
+	if (ret < 0)
+		RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n");
+
+	return ret;
+}
+
+/*
+ * Send msg back to the peer as a reply: the version bits in flags are
+ * replaced by VHOST_USER_VERSION, the reply bit is set, and the header plus
+ * msg->size bytes of payload are sent without any fds.
+ *
+ * Returns 0 when msg is NULL, otherwise the send_fd_message() result.
+ */
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+	if (msg == NULL)
+		return 0;
+
+	msg->flags = (msg->flags & ~VHOST_USER_VERSION_MASK) |
+		VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
+
+	return send_fd_message(sockfd, (char *)msg,
+		VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+}
+
+/*
+ * fdset callback for a listener fd: accept the pending connection, create a
+ * device for it through ops->new_device(), and register the connection fd
+ * with vserver_message_handler() as its read callback.
+ *
+ * dat is the struct vhost_server that owns the listener.
+ */
+static void
+vserver_new_vq_conn(int fd, void *dat)
+{
+	struct vhost_device_ctx vdev_ctx = { 0 };
+	struct vhost_server *vserver = (struct vhost_server *)dat;
+	struct connfd_ctx *conn;
+	int conn_fd;
+	int handle;
+
+	conn_fd = accept(fd, NULL, NULL);
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"new virtio connection is %d\n", conn_fd);
+	if (conn_fd < 0)
+		return;
+
+	conn = calloc(1, sizeof(*conn));
+	if (conn == NULL)
+		goto close_conn;
+
+	handle = ops->new_device(vdev_ctx);
+	if (handle == -1)
+		goto free_conn;
+	RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", handle);
+
+	conn->vserver = vserver;
+	conn->fh = handle;
+	fdset_add(&g_vhost_server.fdset,
+		conn_fd, vserver_message_handler, NULL, conn);
+	return;
+
+free_conn:
+	free(conn);
+close_conn:
+	close(conn_fd);
+}
+
+/*
+ * Tear down one connection: close and unregister its fd, release its
+ * context and destroy the device that was created for it.
+ */
+static void
+vserver_close_conn(int connfd, struct connfd_ctx *cfd_ctx,
+	struct vhost_device_ctx ctx)
+{
+	close(connfd);
+	fdset_del(&g_vhost_server.fdset, connfd);
+	free(cfd_ctx);
+	user_destroy_device(ctx);
+	ops->destroy_device(ctx);
+}
+
+/*
+ * fdset callback for a connection fd: read one vhost-user message and
+ * dispatch it. dat is the struct connfd_ctx allocated when the connection
+ * was accepted.
+ *
+ * A read error, a closed peer or an out-of-range request code tears the
+ * connection down. Only GET_FEATURES and GET_VRING_BASE send a reply.
+ */
+static void
+vserver_message_handler(int connfd, void *dat)
+{
+	struct vhost_device_ctx ctx = { 0 };
+	struct connfd_ctx *cfd_ctx = (struct connfd_ctx *)dat;
+	struct VhostUserMsg msg;
+	uint64_t features = 0;
+	int ret;
+	int i;
+
+	ctx.fh = cfd_ctx->fh;
+
+	/*
+	 * Several requests close msg.fds[0]; make sure a descriptor the peer
+	 * never sent is -1 rather than whatever happened to be on the stack.
+	 */
+	for (i = 0; i < VHOST_MEMORY_MAX_NREGIONS; i++)
+		msg.fds[i] = -1;
+
+	ret = read_vhost_message(connfd, &msg);
+	if (ret <= 0) {
+		if (ret < 0)
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"vhost read message failed\n");
+		else
+			RTE_LOG(INFO, VHOST_CONFIG,
+				"vhost peer closed\n");
+
+		vserver_close_conn(connfd, cfd_ctx, ctx);
+		return;
+	}
+
+	/*
+	 * vhost_message_str[] has VHOST_USER_MAX entries, so VHOST_USER_MAX
+	 * itself is already out of range.
+	 */
+	if (msg.request >= VHOST_USER_MAX) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"vhost read incorrect message\n");
+
+		vserver_close_conn(connfd, cfd_ctx, ctx);
+		return;
+	}
+
+	RTE_LOG(INFO, VHOST_CONFIG, "read message %s\n",
+		vhost_message_str[msg.request]);
+	switch (msg.request) {
+	case VHOST_USER_GET_FEATURES:
+		ret = ops->get_features(ctx, &features);
+		msg.payload.u64 = features;
+		msg.size = sizeof(msg.payload.u64);
+		send_vhost_message(connfd, &msg);
+		break;
+	case VHOST_USER_SET_FEATURES:
+		features = msg.payload.u64;
+		ops->set_features(ctx, &features);
+		break;
+
+	case VHOST_USER_SET_OWNER:
+		ops->set_owner(ctx);
+		break;
+	case VHOST_USER_RESET_OWNER:
+		ops->reset_owner(ctx);
+		break;
+
+	case VHOST_USER_SET_MEM_TABLE:
+		user_set_mem_table(ctx, &msg);
+		break;
+
+	case VHOST_USER_SET_LOG_BASE:
+		/* Carries an address, not an fd: nothing to close here. */
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+		break;
+	case VHOST_USER_SET_LOG_FD:
+		/* Unused, but the passed fd must not leak. */
+		close(msg.fds[0]);
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+		break;
+
+	case VHOST_USER_SET_VRING_NUM:
+		ops->set_vring_num(ctx, &msg.payload.state);
+		break;
+	case VHOST_USER_SET_VRING_ADDR:
+		ops->set_vring_addr(ctx, &msg.payload.addr);
+		break;
+	case VHOST_USER_SET_VRING_BASE:
+		ops->set_vring_base(ctx, &msg.payload.state);
+		break;
+
+	case VHOST_USER_GET_VRING_BASE:
+		ret = user_get_vring_base(ctx, &msg.payload.state);
+		msg.size = sizeof(msg.payload.state);
+		send_vhost_message(connfd, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_KICK:
+		user_set_vring_kick(ctx, &msg);
+		break;
+	case VHOST_USER_SET_VRING_CALL:
+		user_set_vring_call(ctx, &msg);
+		break;
+
+	case VHOST_USER_SET_VRING_ERR:
+		if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+			close(msg.fds[0]);
+		RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+		break;
+
+	default:
+		break;
+
+	}
+}
+
+
+/**
+ * Create a vhost server listening on the unix domain socket at path and add
+ * its listener fd to the global fd set. May be called once per socket path,
+ * up to MAX_VHOST_SERVER times. The first call also initialises the fd set
+ * and fetches the device operation table.
+ *
+ * Any existing file at path is removed before binding. The path string is
+ * copied, so the caller's buffer need not outlive the call.
+ *
+ * @param path
+ *  Socket path to listen on; must not be NULL.
+ * @return
+ *  0 on success, -1 on failure.
+ */
+int
+rte_vhost_driver_register(const char *path)
+{
+	struct vhost_server *vserver;
+	char *path_copy;
+
+	if (path == NULL)
+		return -1;
+
+	if (vserver_idx == 0) {
+		fdset_init(&g_vhost_server.fdset);
+		ops = get_virtio_net_callbacks();
+	}
+	if (vserver_idx == MAX_VHOST_SERVER)
+		return -1;
+
+	vserver = calloc(1, sizeof(*vserver));
+	if (vserver == NULL)
+		return -1;
+
+	/* The server keeps the path for its whole lifetime: own a copy. */
+	path_copy = strdup(path);
+	if (path_copy == NULL) {
+		free(vserver);
+		return -1;
+	}
+
+	unlink(path);
+
+	vserver->listenfd = uds_socket(path);
+	if (vserver->listenfd < 0) {
+		free(path_copy);
+		free(vserver);
+		return -1;
+	}
+	vserver->path = path_copy;
+
+	fdset_add(&g_vhost_server.fdset, vserver->listenfd,
+		vserver_new_vq_conn, NULL,
+		vserver);
+
+	g_vhost_server.server[vserver_idx++] = vserver;
+
+	return 0;
+}
+
+
+/*
+ * Hand control to the fd set dispatcher, which invokes the registered
+ * listener and connection callbacks. Always returns 0 once
+ * fdset_event_dispatch() returns.
+ * NOTE(review): presumably the dispatcher loops forever - confirm in fd_man.c.
+ */
+int
+rte_vhost_driver_session_start(void)
+{
+ fdset_event_dispatch(&g_vhost_server.fdset);
+ return 0;
+}
+
new file mode 100644
@@ -0,0 +1,106 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_virtio_net.h"
+#include "fd_man.h"
+
+/* One registered vhost-user listening socket. */
+struct vhost_server {
+ const char *path; /**< The path the unix domain socket is bound to. */
+ int listenfd; /**< The listener sockfd. */
+};
+
+/* refer to hw/virtio/vhost-user.c */
+
+/*
+ * Request codes carried in VhostUserMsg.request. VHOST_USER_MAX is one past
+ * the last valid code; it sizes the vhost_message_str[] name table.
+ */
+typedef enum VhostUserRequest {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_MAX
+} VhostUserRequest;
+
+/* One guest memory region as described in a SET_MEM_TABLE payload. */
+typedef struct VhostUserMemoryRegion {
+ uint64_t guest_phys_addr; /* guest physical address of the region start */
+ uint64_t memory_size; /* region size in bytes */
+ uint64_t userspace_addr; /* address of the region in the peer's address space */
+ uint64_t mmap_offset; /* offset of the region from the start of the mapped fd */
+} VhostUserMemoryRegion;
+
+/* SET_MEM_TABLE payload: the first nregions entries of regions[] are used. */
+typedef struct VhostUserMemory {
+ uint32_t nregions;
+ uint32_t padding;
+ VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+/*
+ * One vhost-user message. request, flags and size form the header
+ * (VHOST_USER_HDR_SIZE bytes); size bytes of payload follow. fds[] is not
+ * read from the byte stream: it receives the descriptors passed as
+ * ancillary data with the header.
+ */
+typedef struct VhostUserMsg {
+ VhostUserRequest request;
+
+/* flags: low two bits hold the protocol version, bit 2 marks a reply. */
+#define VHOST_USER_VERSION_MASK (0x3)
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+ uint32_t flags;
+ uint32_t size; /* the following payload size */
+ union {
+/* u64 in vring kick/call/err: low byte is the ring index, bit 8 means no fd. */
+#define VHOST_USER_VRING_IDX_MASK (0xff)
+#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ VhostUserMemory memory;
+ } payload;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+/* Size of the message header: the byte offset of the payload in VhostUserMsg. */
+#define VHOST_USER_HDR_SIZE (intptr_t)(&((VhostUserMsg *)0)->payload.u64)
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION (0x1)
+
+/*****************************************************************************/
+#endif
new file mode 100644
@@ -0,0 +1,322 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "virtio-net.h"
+#include "virtio-net-user.h"
+#include "vhost-net-user.h"
+#include "vhost-net.h"
+
+/*
+ * Bookkeeping for one mmap()ed guest memory region, kept so the mapping can
+ * be undone and its fd closed later.
+ */
+struct orig_region_map {
+ int fd; /* fd the region was mapped from */
+ uint64_t mapped_address; /* address returned by mmap() */
+ uint64_t mapped_size; /* length passed to mmap() */
+ uint64_t blksz; /* st_blksize of fd; used as the munmap() alignment */
+};
+
+/*
+ * Locate the orig_region_map array stored after the struct virtio_memory
+ * header and its nregions struct virtio_memory_regions entries in the
+ * allocation that ptr points to.
+ */
+#define orig_region(ptr, nregions) ((struct orig_region_map *)RTE_PTR_ADD(ptr, \
+ sizeof(struct virtio_memory) + \
+ sizeof(struct virtio_memory_regions) * (nregions)))
+
+/*
+ * Return the preferred I/O block size (st_blksize) of the file behind fd.
+ * Callers use it as the alignment for munmap().
+ *
+ * If fstat() fails the system page size is returned instead, so callers
+ * never align with an uninitialised value.
+ */
+static uint64_t
+get_blk_size(int fd)
+{
+	struct stat stat;
+
+	if (fstat(fd, &stat) == -1)
+		return (uint64_t)sysconf(_SC_PAGESIZE);
+
+	return (uint64_t)stat.st_blksize;
+}
+
+/*
+ * Unmap every guest memory region recorded behind dev->mem and close the fd
+ * it was mapped from. Entries whose mapped_address is 0 are skipped.
+ * dev->mem itself is not freed. Does nothing when dev or dev->mem is NULL.
+ */
+static void
+free_mem_region(struct virtio_net *dev)
+{
+	struct orig_region_map *map;
+	unsigned int i;
+	uint64_t blksz;
+
+	if (dev == NULL || dev->mem == NULL)
+		return;
+
+	map = orig_region(dev->mem, dev->mem->nregions);
+	for (i = 0; i < dev->mem->nregions; i++) {
+		if (map[i].mapped_address == 0)
+			continue;
+
+		/* Widen the range to block-size boundaries before unmapping. */
+		blksz = map[i].blksz;
+		munmap((void *)
+			RTE_ALIGN_FLOOR(map[i].mapped_address, blksz),
+			RTE_ALIGN_CEIL(map[i].mapped_size, blksz));
+		close(map[i].fd);
+	}
+}
+
+/*
+ * Handle VHOST_USER_SET_MEM_TABLE: drop the device's previous guest memory
+ * mappings, mmap() every region described in the message from the fd passed
+ * with it, hand the resulting region table to ops->set_mem_table(), and
+ * append the mapping bookkeeping (struct orig_region_map) to dev->mem.
+ *
+ * Returns 0 on success. Returns -1 when there is no device for ctx, the
+ * region count is out of range, no region starts at guest physical address
+ * 0, a mapping fails, or memory cannot be allocated; regions mapped so far
+ * are unmapped again and dev->mem is left NULL.
+ */
+int
+user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	struct VhostUserMemory memory = pmsg->payload.memory;
+	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
+	uint64_t mapped_address, mapped_size, base_address = 0;
+	struct virtio_net *dev;
+	unsigned int idx = 0;
+	struct orig_region_map tmp[VHOST_MEMORY_MAX_NREGIONS] = {
+		[0 ... VHOST_MEMORY_MAX_NREGIONS - 1] = { 0 } };
+	struct orig_region_map *region;
+	uint64_t alignment;
+
+	/* Everything below dereferences dev; bail out if it is gone. */
+	dev = get_device(ctx);
+	if (dev == NULL) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"set mem table: device not found.\n");
+		return -1;
+	}
+
+	/*
+	 * nregions comes from the peer and indexes the fixed-size regions[],
+	 * tmp[] and pmsg->fds[] arrays: refuse anything that does not fit.
+	 */
+	if (memory.nregions > VHOST_MEMORY_MAX_NREGIONS) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"too many memory regions (%u).\n", memory.nregions);
+		return -1;
+	}
+
+	/* unmap old memory regions one by one */
+	if (dev->mem) {
+		free_mem_region(dev);
+		free(dev->mem);
+		dev->mem = NULL;
+	}
+
+	for (idx = 0; idx < memory.nregions; idx++) {
+		if (memory.regions[idx].guest_phys_addr == 0)
+			base_address = memory.regions[idx].userspace_addr;
+	}
+	if (base_address == 0) {
+		RTE_LOG(ERR, VHOST_CONFIG,
+			"couldn't find the mem region whose GPA is 0.\n");
+		return -1;
+	}
+
+	for (idx = 0; idx < memory.nregions; idx++) {
+		regions[idx].guest_phys_address =
+			memory.regions[idx].guest_phys_addr;
+		regions[idx].guest_phys_address_end =
+			memory.regions[idx].guest_phys_addr +
+			memory.regions[idx].memory_size;
+		regions[idx].memory_size = memory.regions[idx].memory_size;
+		regions[idx].userspace_address =
+			memory.regions[idx].userspace_addr;
+
+		/*
+		 * The fd is mapped from offset 0, so the mapping has to cover
+		 * mmap_offset bytes in front of the region itself.
+		 */
+		mapped_size = regions[idx].memory_size +
+			memory.regions[idx].mmap_offset;
+		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
+			mapped_size,
+			PROT_READ | PROT_WRITE, MAP_SHARED,
+			pmsg->fds[idx],
+			0);
+
+		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"mmap qemu guest failed.\n");
+			goto err;
+		}
+
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"mapped region %d fd:%d to %p sz:0x%"PRIx64" off:0x%"PRIx64"\n",
+			idx, pmsg->fds[idx],
+			(void *)(uintptr_t)mapped_address,
+			mapped_size, memory.regions[idx].mmap_offset);
+
+		tmp[idx].mapped_address = mapped_address;
+		tmp[idx].mapped_size = mapped_size;
+		tmp[idx].blksz = get_blk_size(pmsg->fds[idx]);
+		tmp[idx].fd = pmsg->fds[idx];
+
+		mapped_address += memory.regions[idx].mmap_offset;
+
+		/* address_offset turns a guest physical address into a local one. */
+		regions[idx].address_offset = mapped_address -
+			regions[idx].guest_phys_address;
+		LOG_DEBUG(VHOST_CONFIG,
+			"REGION: %u GPA: %p QEMU VA: %p SIZE (%"PRIu64")\n",
+			idx,
+			(void *)(uintptr_t)regions[idx].guest_phys_address,
+			(void *)(uintptr_t)regions[idx].userspace_address,
+			regions[idx].memory_size);
+	}
+
+	ops->set_mem_table(ctx, regions, memory.nregions);
+
+	/* set_mem_table() is expected to have allocated dev->mem. */
+	if (dev->mem) {
+		void *tmp_mem;
+
+		/* Grow the table so the mapping records can follow the regions. */
+		tmp_mem = realloc(dev->mem,
+			sizeof(struct virtio_memory) +
+			sizeof(struct virtio_memory_regions) * memory.nregions +
+			sizeof(struct orig_region_map) * memory.nregions);
+		if (tmp_mem == NULL)
+			goto err_realloc;
+
+		dev->mem = tmp_mem;
+		region = orig_region(dev->mem, memory.nregions);
+		for (idx = 0; idx < memory.nregions; idx++) {
+			region[idx].mapped_address = tmp[idx].mapped_address;
+			region[idx].mapped_size = tmp[idx].mapped_size;
+			region[idx].blksz = tmp[idx].blksz;
+			region[idx].fd = tmp[idx].fd;
+		}
+	} else
+		goto err_set_mem_table;
+
+	return 0;
+
+err_realloc:
+	/* A failed realloc() leaves the old block allocated. */
+	free(dev->mem);
+err_set_mem_table:
+err:
+	/* idx is the number of regions mapped so far; undo them in reverse. */
+	while (idx--) {
+		alignment = tmp[idx].blksz;
+		munmap((void *)RTE_ALIGN_FLOOR(
+			tmp[idx].mapped_address, alignment),
+			RTE_ALIGN_CEIL(tmp[idx].mapped_size, alignment));
+		close(tmp[idx].fd);
+	}
+	dev->mem = NULL;
+	return -1;
+}
+
+/*
+ * Report whether the device can start processing: both the RX and the TX
+ * virtqueue must exist, have a descriptor table, and have their kickfd and
+ * callfd set to something other than -1.
+ *
+ * Returns 1 when ready, 0 otherwise; the outcome is logged either way.
+ */
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+	struct vhost_virtqueue *rvq, *tvq;
+	int ready;
+
+	/* mq support in future.*/
+	rvq = dev->virtqueue[VIRTIO_RXQ];
+	tvq = dev->virtqueue[VIRTIO_TXQ];
+
+	ready = rvq && tvq && rvq->desc && tvq->desc &&
+		(rvq->kickfd != (eventfd_t)-1) &&
+		(rvq->callfd != (eventfd_t)-1) &&
+		(tvq->kickfd != (eventfd_t)-1) &&
+		(tvq->callfd != (eventfd_t)-1);
+
+	if (!ready) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"virtio isn't ready for processing.\n");
+		return 0;
+	}
+
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"virtio is now ready for processing.\n");
+	return 1;
+}
+
+/*
+ * Handle VHOST_USER_SET_VRING_CALL: decode the ring index and the optional
+ * fd from the message and pass them to ops->set_vring_call(). The fd is -1
+ * when the message carries the "no fd" flag.
+ */
+void
+user_set_vring_call(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	struct vhost_vring_file file;
+	uint64_t arg = pmsg->payload.u64;
+
+	file.index = arg & VHOST_USER_VRING_IDX_MASK;
+	file.fd = (arg & VHOST_USER_VRING_NOFD_MASK) ? -1 : pmsg->fds[0];
+
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring call idx:%d file:%d\n", file.index, file.fd);
+	ops->set_vring_call(ctx, &file);
+}
+
+
+/*
+ * Handle VHOST_USER_SET_VRING_KICK: decode the ring index and the optional
+ * fd from the message and pass them to ops->set_vring_kick(). The fd is -1
+ * when the message carries the "no fd" flag.
+ *
+ * In vhost-user the kick message is also the point where the device is
+ * tested for readiness: once it is ready and not yet running, the
+ * application is notified through notify_ops->new_device().
+ */
+void
+user_set_vring_kick(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
+{
+	struct vhost_vring_file file;
+	struct virtio_net *dev = get_device(ctx);
+
+	file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+	if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+		file.fd = -1;
+	else
+		file.fd = pmsg->fds[0];
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring kick idx:%d file:%d\n", file.index, file.fd);
+	ops->set_vring_kick(ctx, &file);
+
+	/* Without a device there is nothing to test or to start. */
+	if (dev == NULL)
+		return;
+
+	if (virtio_is_ready(dev) &&
+		!(dev->flags & VIRTIO_DEV_RUNNING))
+		notify_ops->new_device(dev);
+}
+
+/*
+ * Handle VHOST_USER_GET_VRING_BASE, which qemu sends when virtio is
+ * stopped. A running device is stopped through notify_ops->destroy_device()
+ * first, then the ring base is fetched into state via ops->get_vring_base(),
+ * and finally the callfd of both virtqueues is closed and reset to -1.
+ *
+ * Returns 0 on success, -1 when there is no device for ctx.
+ */
+int
+user_get_vring_base(struct vhost_device_ctx ctx,
+	struct vhost_vring_state *state)
+{
+	struct virtio_net *dev = get_device(ctx);
+
+	if (dev == NULL)
+		return -1;
+
+	/* We have to stop the queue (virtio) if it is running. */
+	if (dev->flags & VIRTIO_DEV_RUNNING)
+		notify_ops->destroy_device(dev);
+
+	/* Here we are safe to get the last used index */
+	ops->get_vring_base(ctx, state->index, state);
+
+	/* The second value is state->num, the ring base - not an fd. */
+	RTE_LOG(INFO, VHOST_CONFIG,
+		"vring base idx:%d num:%d\n", state->index, state->num);
+	/*
+	 * Based on current qemu vhost-user implementation, this message is
+	 * sent and only sent in vhost_vring_stop.
+	 * TODO: cleanup the vring, it isn't usable since here.
+	 */
+	if (((int)dev->virtqueue[VIRTIO_RXQ]->callfd) >= 0) {
+		close(dev->virtqueue[VIRTIO_RXQ]->callfd);
+		dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
+	}
+	if (((int)dev->virtqueue[VIRTIO_TXQ]->callfd) >= 0) {
+		close(dev->virtqueue[VIRTIO_TXQ]->callfd);
+		dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
+	}
+
+	return 0;
+}
+
+/*
+ * vhost-user part of device teardown: stop the device if it is running,
+ * then unmap its guest memory regions and free the memory table.
+ * Does nothing when no device exists for ctx.
+ */
+void
+user_destroy_device(struct vhost_device_ctx ctx)
+{
+	struct virtio_net *dev = get_device(ctx);
+
+	if (dev == NULL)
+		return;
+
+	if (dev->flags & VIRTIO_DEV_RUNNING)
+		notify_ops->destroy_device(dev);
+
+	if (dev->mem == NULL)
+		return;
+
+	free_mem_region(dev);
+	free(dev->mem);
+	dev->mem = NULL;
+}
new file mode 100644
@@ -0,0 +1,49 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_NET_USER_H
+#define _VIRTIO_NET_USER_H
+
+#include "vhost-net.h"
+#include "vhost-net-user.h"
+
+/*
+ * NOTE(review): presumably handles the vhost-user memory table message for
+ * the device matching ctx; the definition is not visible here - confirm.
+ */
+int user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg);
+
+/*
+ * NOTE(review): presumably handles the vhost-user vring call message for
+ * the device matching ctx; the definition is not visible here - confirm.
+ */
+void user_set_vring_call(struct vhost_device_ctx ctx,
+	struct VhostUserMsg *pmsg);
+
+/*
+ * Passes the kick descriptor carried by pmsg to ops->set_vring_kick() and
+ * calls new_device() when the device is ready and not yet running.
+ */
+void user_set_vring_kick(struct vhost_device_ctx ctx,
+	struct VhostUserMsg *pmsg);
+
+/*
+ * Stops the device if it is running, fills *state with the vring base and
+ * closes the callfd of both virtqueues.
+ */
+int user_get_vring_base(struct vhost_device_ctx ctx,
+	struct vhost_vring_state *state);
+
+/* Stops the device if it is running and frees its memory table. */
+void user_destroy_device(struct vhost_device_ctx ctx);
+#endif
@@ -50,6 +50,7 @@
#include <rte_virtio_net.h>
#include "vhost-net.h"
+#include "virtio-net.h"
/*
* Device linked list structure for configuration.
@@ -60,7 +61,7 @@ struct virtio_net_config_ll {
};
/* device ops to add/remove device to/from data core. */
-static struct virtio_net_device_ops const *notify_ops;
+struct virtio_net_device_ops const *notify_ops;
/* root address of the linked list of managed virtio devices */
static struct virtio_net_config_ll *ll_root;
@@ -88,8 +89,9 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
if ((qemu_va >= region->userspace_address) &&
(qemu_va <= region->userspace_address +
region->memory_size)) {
- vhost_va = dev->mem->mapped_address + qemu_va -
- dev->mem->base_address;
+ vhost_va = qemu_va + region->guest_phys_address +
+ region->address_offset -
+ region->userspace_address;
break;
}
}
@@ -119,7 +121,7 @@ get_config_ll_entry(struct vhost_device_ctx ctx)
* Searches the configuration core linked list and
* retrieves the device if it exists.
*/
-static struct virtio_net *
+struct virtio_net *
get_device(struct vhost_device_ctx ctx)
{
struct virtio_net_config_ll *ll_dev;
@@ -256,6 +258,11 @@ init_device(struct virtio_net *dev)
memset(dev->virtqueue[VIRTIO_RXQ], 0, sizeof(struct vhost_virtqueue));
memset(dev->virtqueue[VIRTIO_TXQ], 0, sizeof(struct vhost_virtqueue));
+ dev->virtqueue[VIRTIO_RXQ]->kickfd = (eventfd_t)-1;
+ dev->virtqueue[VIRTIO_RXQ]->callfd = (eventfd_t)-1;
+ dev->virtqueue[VIRTIO_TXQ]->kickfd = (eventfd_t)-1;
+ dev->virtqueue[VIRTIO_TXQ]->callfd = (eventfd_t)-1;
+
/* Backends are set to -1 indicating an inactive device. */
dev->virtqueue[VIRTIO_RXQ]->backend = VIRTIO_DEV_STOPPED;
dev->virtqueue[VIRTIO_TXQ]->backend = VIRTIO_DEV_STOPPED;
@@ -455,12 +462,6 @@ set_mem_table(struct vhost_device_ctx ctx,
if (dev == NULL)
return -1;
- if (dev->mem) {
- munmap((void *)(uintptr_t)dev->mem->mapped_address,
- (size_t)dev->mem->mapped_size);
- free(dev->mem);
- }
-
/* Malloc the memory structure depending on the number of regions. */
mem = calloc(1, sizeof(struct virtio_memory) +
(sizeof(struct virtio_memory_regions) * nregions));
@@ -624,7 +625,7 @@ set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file)
/* file->index refers to the queue index. The txq is 1, rxq is 0. */
vq = dev->virtqueue[file->index];
- if (vq->kickfd)
+ if ((int)vq->kickfd >= 0)
close((int)vq->kickfd);
vq->kickfd = file->fd;
@@ -650,8 +651,9 @@ set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file)
/* file->index refers to the queue index. The txq is 1, rxq is 0. */
vq = dev->virtqueue[file->index];
- if (vq->callfd)
+ if ((int)vq->callfd >= 0)
close((int)vq->callfd);
+
vq->callfd = file->fd;
return 0;