[dpdk-dev,RFC,v2,10/12] lib/librte_vhost: vhost-user memory region map
Commit Message
Handle alignment when mapping and unmapping vhost-user memory regions.
Signed-off-by: Huawei Xie <huawei.xie@intel.com>
---
lib/librte_vhost/rte_virtio_net.h | 2 +
lib/librte_vhost/vhost-net.h | 2 -
lib/librte_vhost/vhost_user/vhost-net-user.h | 3 +-
lib/librte_vhost/vhost_user/virtio-net-user.c | 105 ++++++++++++++++++++++++--
4 files changed, 100 insertions(+), 12 deletions(-)
Comments
(2014/12/11 6:37), Huawei Xie wrote:
> deals with vhost user memory map/unmap alignment
>
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> ---
> lib/librte_vhost/rte_virtio_net.h | 2 +
> lib/librte_vhost/vhost-net.h | 2 -
> lib/librte_vhost/vhost_user/vhost-net-user.h | 3 +-
> lib/librte_vhost/vhost_user/virtio-net-user.c | 105 ++++++++++++++++++++++++--
> 4 files changed, 100 insertions(+), 12 deletions(-)
>
> diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
> index 00b1328..77db80b 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -48,6 +48,8 @@
> #include <rte_mempool.h>
> #include <rte_mbuf.h>
>
> +#define VHOST_MEMORY_MAX_NREGIONS 8
> +
> /* Used to indicate that the device is running on a data core */
> #define VIRTIO_DEV_RUNNING 1
>
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index f9ec40b..ec2584f 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -43,8 +43,6 @@
>
> #include "rte_virtio_net.h"
>
> -#define VHOST_MEMORY_MAX_NREGIONS 8
> -
> extern struct vhost_net_device_ops const *ops;
>
> /* Macros for printing using RTE_LOG */
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
> index c138844..f4c9d01 100644
> --- a/lib/librte_vhost/vhost_user/vhost-net-user.h
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
> @@ -37,6 +37,7 @@
> #include <stdint.h>
> #include <linux/vhost.h>
>
> +#include "rte_virtio_net.h"
> #include "fd_man.h"
>
> struct vhost_server {
> @@ -47,8 +48,6 @@ struct vhost_server {
>
> /* refer to hw/virtio/vhost-user.c */
>
> -#define VHOST_MEMORY_MAX_NREGIONS 8
> -
> typedef enum VhostUserRequest {
> VHOST_USER_NONE = 0,
> VHOST_USER_GET_FEATURES = 1,
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> index ad59fcc..3aecb17 100644
> --- a/lib/librte_vhost/vhost_user/virtio-net-user.c
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -36,7 +36,11 @@
> #include <stdlib.h>
> #include <unistd.h>
> #include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <unistd.h>
>
> +#include <rte_common.h>
> #include <rte_log.h>
>
> #include "virtio-net.h"
> @@ -44,13 +48,56 @@
> #include "vhost-net-user.h"
> #include "vhost-net.h"
>
> +struct orig_region_map {
> + int fd;
> + uint64_t mapped_address;
> + uint64_t mapped_size;
> + uint64_t blksz;
> +};
> +
> +#define orig_region(ptr, nregions) (struct orig_region_map *)RTE_PTR_ADD(ptr, sizeof(struct virtio_memory) + sizeof(struct virtio_memory_regions) * (nregions))
> +
> +static uint64_t
> +get_blk_size(int fd)
> +{
> + struct stat stat;
> + fstat(fd, &stat);
> + return (uint64_t)stat.st_blksize;
> +}
I've also confirmed that we can get the hugepage size of the fd using st_blksize.
Even if someone runs QEMU on 2MB hugepages while the DPDK backend uses
1GB hugepages, we will still be able to mmap and munmap the QEMU
backend memory correctly.
So I think using st_blksize is a smart workaround to avoid hitting the munmap issue.
> +
> int
> user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> {
> - unsigned int idx;
> struct VhostUserMemory memory = pmsg->payload.memory;
> struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
> - uint64_t mapped_address, base_address = 0;
> + uint64_t mapped_address, mapped_size, base_address = 0;
> + struct virtio_net *dev;
> + unsigned int idx = 0;
> + struct orig_region_map tmp[VHOST_MEMORY_MAX_NREGIONS] =
> + { [0 ... VHOST_MEMORY_MAX_NREGIONS - 1] = { 0 } };
> + struct orig_region_map *region;
> + uint64_t alignment;
> + int ret;
> +
> + /* unmap old memory regions one by one*/
> + dev = get_device(ctx);
> + if (dev->mem) {
> + region = orig_region(dev->mem, dev->mem->nregions);
> + for (idx = 0; idx < dev->mem->nregions; idx++) {
> + if (region[idx].mapped_address) {
> + alignment = region[idx].blksz;
> + printf("Freeing %p\n",
> + (void *)(uintptr_t)region[idx].mapped_address);
> + ret = munmap((void *)RTE_ALIGN_FLOOR(region[idx].mapped_address, alignment),
> + RTE_ALIGN_CEIL(region[idx].mapped_size, alignment));
> + printf("munmap ret= %d\n", ret);
> + printf("close file %d\n", region[idx].fd);
> + close(region[idx].fd);
> + }
> + }
> + free(dev->mem);
> + dev->mem = NULL;
> + }
>
> for (idx = 0; idx < memory.nregions; idx++) {
> if (memory.regions[idx].guest_phys_addr == 0)
> @@ -73,22 +120,30 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> memory.regions[idx].userspace_addr;
>
> /* This is ugly */
> + mapped_size = regions[idx].memory_size +
> + memory.regions[idx].mmap_offset;
> mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
> - regions[idx].memory_size +
> - memory.regions[idx].mmap_offset,
> + mapped_size,
> PROT_READ | PROT_WRITE, MAP_SHARED,
> pmsg->fds[idx],
> 0);
> +
> RTE_LOG(INFO, VHOST_CONFIG,
> - "mapped region %d to %p\n",
> - idx, (void *)mapped_address);
> + "mapped region %d fd:%d to %p sz:0x%"PRIx64" off:0x%"PRIx64"\n",
> + idx, pmsg->fds[idx], (void *)mapped_address,
> + mapped_size, memory.regions[idx].mmap_offset);
>
> if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
> RTE_LOG(ERR, VHOST_CONFIG,
> "mmap qemu guest failed.\n");
> - return -1;
> + goto err;
> }
>
> + tmp[idx].mapped_address = mapped_address;
> + tmp[idx].mapped_size = mapped_size;
> + tmp[idx].blksz = get_blk_size(pmsg->fds[idx]);
> + tmp[idx].fd = pmsg->fds[idx];
> +
> mapped_address += memory.regions[idx].mmap_offset;
>
> regions[idx].address_offset = mapped_address -
> @@ -100,10 +155,44 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
> (void *)(uintptr_t)regions[idx].userspace_address,
> regions[idx].memory_size);
> }
> +
> ops->set_mem_table(ctx, regions, memory.nregions);
> +
> + if (dev->mem) {
> + void *tmp_mem;
> + tmp_mem = realloc(dev->mem,
> + sizeof(struct virtio_memory) +
> + sizeof(struct virtio_memory_regions) * memory.nregions +
> + sizeof(struct orig_region_map) * memory.nregions);
> + if (tmp_mem == NULL) {
> + goto err_realloc;
> + }
> + dev->mem = tmp_mem;
> + region = orig_region(dev->mem, memory.nregions);
> + for (idx = 0; idx < memory.nregions; idx++) {
> + region[idx].mapped_address = tmp[idx].mapped_address;
> + region[idx].mapped_size = tmp[idx].mapped_size;
> + region[idx].blksz = tmp[idx].blksz;
> + region[idx].fd = tmp[idx].fd;
> + }
> + } else
> + goto err_set_mem_table;
> +
> return 0;
> -}
>
> +err_realloc:
> + free(dev->mem);
> +err_set_mem_table:
> +err:
> + while (idx--) {
> + alignment = tmp[idx].blksz;
> + munmap((void *)RTE_ALIGN_FLOOR(tmp[idx].mapped_address, alignment),
> + RTE_ALIGN_CEIL(tmp[idx].mapped_size, alignment));
> + close(tmp[idx].fd);
> + }
> + dev->mem = NULL;
> + return -1;
> +}
>
> static int
> virtio_is_ready(struct virtio_net *dev)
@@ -48,6 +48,8 @@
#include <rte_mempool.h>
#include <rte_mbuf.h>
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
/* Used to indicate that the device is running on a data core */
#define VIRTIO_DEV_RUNNING 1
@@ -43,8 +43,6 @@
#include "rte_virtio_net.h"
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
extern struct vhost_net_device_ops const *ops;
/* Macros for printing using RTE_LOG */
@@ -37,6 +37,7 @@
#include <stdint.h>
#include <linux/vhost.h>
+#include "rte_virtio_net.h"
#include "fd_man.h"
struct vhost_server {
@@ -47,8 +48,6 @@ struct vhost_server {
/* refer to hw/virtio/vhost-user.c */
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
typedef enum VhostUserRequest {
VHOST_USER_NONE = 0,
VHOST_USER_GET_FEATURES = 1,
@@ -36,7 +36,11 @@
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <rte_common.h>
#include <rte_log.h>
#include "virtio-net.h"
@@ -44,13 +48,56 @@
#include "vhost-net-user.h"
#include "vhost-net.h"
+struct orig_region_map {
+ int fd;
+ uint64_t mapped_address;
+ uint64_t mapped_size;
+ uint64_t blksz;
+};
+
+#define orig_region(ptr, nregions) (struct orig_region_map *)RTE_PTR_ADD(ptr, sizeof(struct virtio_memory) + sizeof(struct virtio_memory_regions) * (nregions))
+
+static uint64_t
+get_blk_size(int fd)
+{
+ struct stat stat;
+ fstat(fd, &stat);
+ return (uint64_t)stat.st_blksize;
+}
+
int
user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
{
- unsigned int idx;
struct VhostUserMemory memory = pmsg->payload.memory;
struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
- uint64_t mapped_address, base_address = 0;
+ uint64_t mapped_address, mapped_size, base_address = 0;
+ struct virtio_net *dev;
+ unsigned int idx = 0;
+ struct orig_region_map tmp[VHOST_MEMORY_MAX_NREGIONS] =
+ { [0 ... VHOST_MEMORY_MAX_NREGIONS - 1] = { 0 } };
+ struct orig_region_map *region;
+ uint64_t alignment;
+ int ret;
+
+ /* unmap old memory regions one by one*/
+ dev = get_device(ctx);
+ if (dev->mem) {
+ region = orig_region(dev->mem, dev->mem->nregions);
+ for (idx = 0; idx < dev->mem->nregions; idx++) {
+ if (region[idx].mapped_address) {
+ alignment = region[idx].blksz;
+ printf("Freeing %p\n",
+ (void *)(uintptr_t)region[idx].mapped_address);
+ ret = munmap((void *)RTE_ALIGN_FLOOR(region[idx].mapped_address, alignment),
+ RTE_ALIGN_CEIL(region[idx].mapped_size, alignment));
+ printf("munmap ret= %d\n", ret);
+ printf("close file %d\n", region[idx].fd);
+ close(region[idx].fd);
+ }
+ }
+ free(dev->mem);
+ dev->mem = NULL;
+ }
for (idx = 0; idx < memory.nregions; idx++) {
if (memory.regions[idx].guest_phys_addr == 0)
@@ -73,22 +120,30 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
memory.regions[idx].userspace_addr;
/* This is ugly */
+ mapped_size = regions[idx].memory_size +
+ memory.regions[idx].mmap_offset;
mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
- regions[idx].memory_size +
- memory.regions[idx].mmap_offset,
+ mapped_size,
PROT_READ | PROT_WRITE, MAP_SHARED,
pmsg->fds[idx],
0);
+
RTE_LOG(INFO, VHOST_CONFIG,
- "mapped region %d to %p\n",
- idx, (void *)mapped_address);
+ "mapped region %d fd:%d to %p sz:0x%"PRIx64" off:0x%"PRIx64"\n",
+ idx, pmsg->fds[idx], (void *)mapped_address,
+ mapped_size, memory.regions[idx].mmap_offset);
if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
RTE_LOG(ERR, VHOST_CONFIG,
"mmap qemu guest failed.\n");
- return -1;
+ goto err;
}
+ tmp[idx].mapped_address = mapped_address;
+ tmp[idx].mapped_size = mapped_size;
+ tmp[idx].blksz = get_blk_size(pmsg->fds[idx]);
+ tmp[idx].fd = pmsg->fds[idx];
+
mapped_address += memory.regions[idx].mmap_offset;
regions[idx].address_offset = mapped_address -
@@ -100,10 +155,44 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
(void *)(uintptr_t)regions[idx].userspace_address,
regions[idx].memory_size);
}
+
ops->set_mem_table(ctx, regions, memory.nregions);
+
+ if (dev->mem) {
+ void *tmp_mem;
+ tmp_mem = realloc(dev->mem,
+ sizeof(struct virtio_memory) +
+ sizeof(struct virtio_memory_regions) * memory.nregions +
+ sizeof(struct orig_region_map) * memory.nregions);
+ if (tmp_mem == NULL) {
+ goto err_realloc;
+ }
+ dev->mem = tmp_mem;
+ region = orig_region(dev->mem, memory.nregions);
+ for (idx = 0; idx < memory.nregions; idx++) {
+ region[idx].mapped_address = tmp[idx].mapped_address;
+ region[idx].mapped_size = tmp[idx].mapped_size;
+ region[idx].blksz = tmp[idx].blksz;
+ region[idx].fd = tmp[idx].fd;
+ }
+ } else
+ goto err_set_mem_table;
+
return 0;
-}
+err_realloc:
+ free(dev->mem);
+err_set_mem_table:
+err:
+ while (idx--) {
+ alignment = tmp[idx].blksz;
+ munmap((void *)RTE_ALIGN_FLOOR(tmp[idx].mapped_address, alignment),
+ RTE_ALIGN_CEIL(tmp[idx].mapped_size, alignment));
+ close(tmp[idx].fd);
+ }
+ dev->mem = NULL;
+ return -1;
+}
static int
virtio_is_ready(struct virtio_net *dev)