[dpdk-dev,RFC,v2,10/12] lib/librte_vhost: vhost-user memory region map

Message ID 1418247477-13920-11-git-send-email-huawei.xie@intel.com (mailing list archive)
State RFC, archived
Headers

Commit Message

Huawei Xie Dec. 10, 2014, 9:37 p.m. UTC
deals with vhost user memory map/unmap alignment

Signed-off-by: Huawei Xie <huawei.xie@intel.com>
---
 lib/librte_vhost/rte_virtio_net.h             |   2 +
 lib/librte_vhost/vhost-net.h                  |   2 -
 lib/librte_vhost/vhost_user/vhost-net-user.h  |   3 +-
 lib/librte_vhost/vhost_user/virtio-net-user.c | 105 ++++++++++++++++++++++++--
 4 files changed, 100 insertions(+), 12 deletions(-)
  

Comments

Tetsuya Mukawa Dec. 16, 2014, 2:38 a.m. UTC | #1
(2014/12/11 6:37), Huawei Xie wrote:
> deals with vhost user memory map/unmap alignment
>
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> ---
>  lib/librte_vhost/rte_virtio_net.h             |   2 +
>  lib/librte_vhost/vhost-net.h                  |   2 -
>  lib/librte_vhost/vhost_user/vhost-net-user.h  |   3 +-
>  lib/librte_vhost/vhost_user/virtio-net-user.c | 105 ++++++++++++++++++++++++--
>  4 files changed, 100 insertions(+), 12 deletions(-)
>
> diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
> index 00b1328..77db80b 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -48,6 +48,8 @@
>  #include <rte_mempool.h>
>  #include <rte_mbuf.h>
>  
> +#define VHOST_MEMORY_MAX_NREGIONS 8
> +
>  /* Used to indicate that the device is running on a data core */
>  #define VIRTIO_DEV_RUNNING 1
>  
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index f9ec40b..ec2584f 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -43,8 +43,6 @@
>  
>  #include "rte_virtio_net.h"
>  
> -#define VHOST_MEMORY_MAX_NREGIONS 8
> -
>  extern struct vhost_net_device_ops const *ops;
>  
>  /* Macros for printing using RTE_LOG */
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
> index c138844..f4c9d01 100644
> --- a/lib/librte_vhost/vhost_user/vhost-net-user.h
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
> @@ -37,6 +37,7 @@
>  #include <stdint.h>
>  #include <linux/vhost.h>
>  
> +#include "rte_virtio_net.h"
>  #include "fd_man.h"
>  
>  struct vhost_server {
> @@ -47,8 +48,6 @@ struct vhost_server {
>  
>  /* refer to hw/virtio/vhost-user.c */
>  
> -#define VHOST_MEMORY_MAX_NREGIONS    8
> -
>  typedef enum VhostUserRequest {
>  	VHOST_USER_NONE = 0,
>  	VHOST_USER_GET_FEATURES = 1,
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
> index ad59fcc..3aecb17 100644
> --- a/lib/librte_vhost/vhost_user/virtio-net-user.c
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -36,7 +36,11 @@
>  #include <stdlib.h>
>  #include <unistd.h>
>  #include <sys/mman.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <unistd.h>
>  
> +#include <rte_common.h>
>  #include <rte_log.h>
>  
>  #include "virtio-net.h"
> @@ -44,13 +48,56 @@
>  #include "vhost-net-user.h"
>  #include "vhost-net.h"
>  
> +struct orig_region_map {
> +	int fd;
> +	uint64_t mapped_address;
> +	uint64_t mapped_size;
> +	uint64_t blksz;
> +};
> +
> +#define orig_region(ptr, nregions) (struct orig_region_map *)RTE_PTR_ADD(ptr, sizeof(struct virtio_memory) + sizeof(struct virtio_memory_regions) * (nregions))
> +
> +static uint64_t
> +get_blk_size(int fd)
> +{
> +	struct stat stat;
> +	fstat(fd, &stat);
> +	return (uint64_t)stat.st_blksize;
> +}

I've also confirmed that we can get the hugepage size of the fd using
st_blksize. Even if someone runs QEMU on 2MB hugepages while the DPDK
backend is on 1GB hugepages, we will still be able to mmap and munmap
the QEMU backend memory correctly.
So I guess using st_blksize is a smart workaround to avoid hitting the
munmap issue.

> +
>  int
>  user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
>  {
> -	unsigned int idx;
>  	struct VhostUserMemory memory = pmsg->payload.memory;
>  	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
> -	uint64_t mapped_address, base_address = 0;
> +	uint64_t mapped_address, mapped_size, base_address = 0;
> +	struct virtio_net *dev;
> +	unsigned int idx = 0;
> +	struct orig_region_map tmp[VHOST_MEMORY_MAX_NREGIONS] =
> +		{ [0 ... VHOST_MEMORY_MAX_NREGIONS - 1] = { 0 } };
> +	struct orig_region_map *region;
> +	uint64_t alignment;
> +	int ret;
> +
> +	/* unmap old memory regions one by one*/
> +	dev = get_device(ctx);
> +	if (dev->mem) {
> +		region = orig_region(dev->mem, dev->mem->nregions);
> +		for (idx = 0; idx < dev->mem->nregions; idx++) {
> +			if (region[idx].mapped_address) {
> +				alignment = region[idx].blksz;
> +				printf("Freeing %p\n",
> +					(void *)(uintptr_t)region[idx].mapped_address);
> +				ret = munmap((void *)RTE_ALIGN_FLOOR(region[idx].mapped_address, alignment),
> +					RTE_ALIGN_CEIL(region[idx].mapped_size, alignment));
> +				printf("munmap ret= %d\n", ret);
> +				printf("close file %d\n", region[idx].fd);
> +				close(region[idx].fd);
> +			}
> +		}
> +		free(dev->mem);
> +		dev->mem = NULL;
> +	}
>  
>  	for (idx = 0; idx < memory.nregions; idx++) {
>  		if (memory.regions[idx].guest_phys_addr == 0)
> @@ -73,22 +120,30 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
>  			memory.regions[idx].userspace_addr;
>  
>  		/* This is ugly */
> +		mapped_size = regions[idx].memory_size +
> +			memory.regions[idx].mmap_offset;
>  		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
> -			regions[idx].memory_size +
> -				memory.regions[idx].mmap_offset,
> +			mapped_size,
>  			PROT_READ | PROT_WRITE, MAP_SHARED,
>  			pmsg->fds[idx],
>  			0);
> +
>  		RTE_LOG(INFO, VHOST_CONFIG,
> -			"mapped region %d to %p\n",
> -			idx, (void *)mapped_address);
> +			"mapped region %d fd:%d to %p sz:0x%"PRIx64" off:0x%"PRIx64"\n",
> +			idx, pmsg->fds[idx], (void *)mapped_address,
> +			mapped_size, memory.regions[idx].mmap_offset);
>  
>  		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
>  			RTE_LOG(ERR, VHOST_CONFIG,
>  				"mmap qemu guest failed.\n");
> -			return -1;
> +			goto err;
>  		}
>  
> +		tmp[idx].mapped_address = mapped_address;
> +		tmp[idx].mapped_size = mapped_size;
> +		tmp[idx].blksz = get_blk_size(pmsg->fds[idx]);
> +		tmp[idx].fd = pmsg->fds[idx];
> +
>  		mapped_address +=  memory.regions[idx].mmap_offset;
>  
>  		regions[idx].address_offset = mapped_address -
> @@ -100,10 +155,44 @@ user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
>  			(void *)(uintptr_t)regions[idx].userspace_address,
>  			 regions[idx].memory_size);
>  	}
> +
>  	ops->set_mem_table(ctx, regions, memory.nregions);
> +
> +	if (dev->mem) {
> +		void *tmp_mem;
> +		tmp_mem = realloc(dev->mem,
> +			sizeof(struct virtio_memory) +
> +			sizeof(struct virtio_memory_regions) * memory.nregions +
> +			sizeof(struct orig_region_map) * memory.nregions);
> +		if (tmp_mem == NULL) {
> +			goto err_realloc;
> +		}
> +		dev->mem = tmp_mem;
> +		region = orig_region(dev->mem, memory.nregions);
> +		for (idx = 0; idx < memory.nregions; idx++) {
> +			region[idx].mapped_address = tmp[idx].mapped_address;
> +			region[idx].mapped_size = tmp[idx].mapped_size;
> +			region[idx].blksz = tmp[idx].blksz;
> +			region[idx].fd = tmp[idx].fd;
> +		}
> +	} else
> +		goto err_set_mem_table;
> +
>  	return 0;
> -}
>  
> +err_realloc:
> +	free(dev->mem);
> +err_set_mem_table:
> +err:
> +	while (idx--) {
> +		alignment = tmp[idx].blksz;
> +		munmap((void *)RTE_ALIGN_FLOOR(tmp[idx].mapped_address, alignment),
> +			RTE_ALIGN_CEIL(tmp[idx].mapped_size, alignment));
> +		close(tmp[idx].fd);
> +	}
> +	dev->mem = NULL;
> +	return -1;
> +}
>  
>  static int
>  virtio_is_ready(struct virtio_net *dev)
  

Patch

diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
index 00b1328..77db80b 100644
--- a/lib/librte_vhost/rte_virtio_net.h
+++ b/lib/librte_vhost/rte_virtio_net.h
@@ -48,6 +48,8 @@ 
 #include <rte_mempool.h>
 #include <rte_mbuf.h>
 
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
 /* Used to indicate that the device is running on a data core */
 #define VIRTIO_DEV_RUNNING 1
 
diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
index f9ec40b..ec2584f 100644
--- a/lib/librte_vhost/vhost-net.h
+++ b/lib/librte_vhost/vhost-net.h
@@ -43,8 +43,6 @@ 
 
 #include "rte_virtio_net.h"
 
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
 extern struct vhost_net_device_ops const *ops;
 
 /* Macros for printing using RTE_LOG */
diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.h b/lib/librte_vhost/vhost_user/vhost-net-user.h
index c138844..f4c9d01 100644
--- a/lib/librte_vhost/vhost_user/vhost-net-user.h
+++ b/lib/librte_vhost/vhost_user/vhost-net-user.h
@@ -37,6 +37,7 @@ 
 #include <stdint.h>
 #include <linux/vhost.h>
 
+#include "rte_virtio_net.h"
 #include "fd_man.h"
 
 struct vhost_server {
@@ -47,8 +48,6 @@  struct vhost_server {
 
 /* refer to hw/virtio/vhost-user.c */
 
-#define VHOST_MEMORY_MAX_NREGIONS    8
-
 typedef enum VhostUserRequest {
 	VHOST_USER_NONE = 0,
 	VHOST_USER_GET_FEATURES = 1,
diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c b/lib/librte_vhost/vhost_user/virtio-net-user.c
index ad59fcc..3aecb17 100644
--- a/lib/librte_vhost/vhost_user/virtio-net-user.c
+++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
@@ -36,7 +36,11 @@ 
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
+#include <rte_common.h>
 #include <rte_log.h>
 
 #include "virtio-net.h"
@@ -44,13 +48,56 @@ 
 #include "vhost-net-user.h"
 #include "vhost-net.h"
 
+struct orig_region_map {
+	int fd;
+	uint64_t mapped_address;
+	uint64_t mapped_size;
+	uint64_t blksz;
+};
+
+#define orig_region(ptr, nregions) (struct orig_region_map *)RTE_PTR_ADD(ptr, sizeof(struct virtio_memory) + sizeof(struct virtio_memory_regions) * (nregions))
+
+static uint64_t
+get_blk_size(int fd)
+{
+	struct stat stat;
+	fstat(fd, &stat);
+	return (uint64_t)stat.st_blksize;
+}
+
 int
 user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
 {
-	unsigned int idx;
 	struct VhostUserMemory memory = pmsg->payload.memory;
 	struct virtio_memory_regions regions[VHOST_MEMORY_MAX_NREGIONS];
-	uint64_t mapped_address, base_address = 0;
+	uint64_t mapped_address, mapped_size, base_address = 0;
+	struct virtio_net *dev;
+	unsigned int idx = 0;
+	struct orig_region_map tmp[VHOST_MEMORY_MAX_NREGIONS] =
+		{ [0 ... VHOST_MEMORY_MAX_NREGIONS - 1] = { 0 } };
+	struct orig_region_map *region;
+	uint64_t alignment;
+	int ret;
+
+	/* unmap old memory regions one by one*/
+	dev = get_device(ctx);
+	if (dev->mem) {
+		region = orig_region(dev->mem, dev->mem->nregions);
+		for (idx = 0; idx < dev->mem->nregions; idx++) {
+			if (region[idx].mapped_address) {
+				alignment = region[idx].blksz;
+				printf("Freeing %p\n",
+					(void *)(uintptr_t)region[idx].mapped_address);
+				ret = munmap((void *)RTE_ALIGN_FLOOR(region[idx].mapped_address, alignment),
+					RTE_ALIGN_CEIL(region[idx].mapped_size, alignment));
+				printf("munmap ret= %d\n", ret);
+				printf("close file %d\n", region[idx].fd);
+				close(region[idx].fd);
+			}
+		}
+		free(dev->mem);
+		dev->mem = NULL;
+	}
 
 	for (idx = 0; idx < memory.nregions; idx++) {
 		if (memory.regions[idx].guest_phys_addr == 0)
@@ -73,22 +120,30 @@  user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
 			memory.regions[idx].userspace_addr;
 
 		/* This is ugly */
+		mapped_size = regions[idx].memory_size +
+			memory.regions[idx].mmap_offset;
 		mapped_address = (uint64_t)(uintptr_t)mmap(NULL,
-			regions[idx].memory_size +
-				memory.regions[idx].mmap_offset,
+			mapped_size,
 			PROT_READ | PROT_WRITE, MAP_SHARED,
 			pmsg->fds[idx],
 			0);
+
 		RTE_LOG(INFO, VHOST_CONFIG,
-			"mapped region %d to %p\n",
-			idx, (void *)mapped_address);
+			"mapped region %d fd:%d to %p sz:0x%"PRIx64" off:0x%"PRIx64"\n",
+			idx, pmsg->fds[idx], (void *)mapped_address,
+			mapped_size, memory.regions[idx].mmap_offset);
 
 		if (mapped_address == (uint64_t)(uintptr_t)MAP_FAILED) {
 			RTE_LOG(ERR, VHOST_CONFIG,
 				"mmap qemu guest failed.\n");
-			return -1;
+			goto err;
 		}
 
+		tmp[idx].mapped_address = mapped_address;
+		tmp[idx].mapped_size = mapped_size;
+		tmp[idx].blksz = get_blk_size(pmsg->fds[idx]);
+		tmp[idx].fd = pmsg->fds[idx];
+
 		mapped_address +=  memory.regions[idx].mmap_offset;
 
 		regions[idx].address_offset = mapped_address -
@@ -100,10 +155,44 @@  user_set_mem_table(struct vhost_device_ctx ctx, struct VhostUserMsg *pmsg)
 			(void *)(uintptr_t)regions[idx].userspace_address,
 			 regions[idx].memory_size);
 	}
+
 	ops->set_mem_table(ctx, regions, memory.nregions);
+
+	if (dev->mem) {
+		void *tmp_mem;
+		tmp_mem = realloc(dev->mem,
+			sizeof(struct virtio_memory) +
+			sizeof(struct virtio_memory_regions) * memory.nregions +
+			sizeof(struct orig_region_map) * memory.nregions);
+		if (tmp_mem == NULL) {
+			goto err_realloc;
+		}
+		dev->mem = tmp_mem;
+		region = orig_region(dev->mem, memory.nregions);
+		for (idx = 0; idx < memory.nregions; idx++) {
+			region[idx].mapped_address = tmp[idx].mapped_address;
+			region[idx].mapped_size = tmp[idx].mapped_size;
+			region[idx].blksz = tmp[idx].blksz;
+			region[idx].fd = tmp[idx].fd;
+		}
+	} else
+		goto err_set_mem_table;
+
 	return 0;
-}
 
+err_realloc:
+	free(dev->mem);
+err_set_mem_table:
+err:
+	while (idx--) {
+		alignment = tmp[idx].blksz;
+		munmap((void *)RTE_ALIGN_FLOOR(tmp[idx].mapped_address, alignment),
+			RTE_ALIGN_CEIL(tmp[idx].mapped_size, alignment));
+		close(tmp[idx].fd);
+	}
+	dev->mem = NULL;
+	return -1;
+}
 
 static int
 virtio_is_ready(struct virtio_net *dev)