[dpdk-dev,RFC,4/5] virtio/container: adjust memory initialization process

Message ID 1446748276-132087-5-git-send-email-jianfeng.tan@intel.com (mailing list archive)
State RFC, archived

Commit Message

Jianfeng Tan Nov. 5, 2015, 6:31 p.m. UTC
  When using virtio for containers, we should specify --no-huge so
that, during memory initialization, shm_open() is used to allocate
memory from the tmpfs filesystem mounted at /dev/shm.

Signed-off-by: Huawei Xie <huawei.xie@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/include/rte_memory.h |  5 +++
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 58 ++++++++++++++++++++++++++++--
 lib/librte_mempool/rte_mempool.c           | 16 ++++-----
 3 files changed, 69 insertions(+), 10 deletions(-)
  

Comments

Ananyev, Konstantin Nov. 6, 2015, 4:21 p.m. UTC | #1
Hi,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jianfeng Tan
> Sent: Thursday, November 05, 2015 6:31 PM
> To: dev@dpdk.org
> Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com; mst@redhat.com; gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com; guohongzhen@huawei.com
> Subject: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory initialization process
> 
> When using virtio for container, we should specify --no-huge so
> that in memory initialization, shm_open() is used to alloc memory
> from tmpfs filesystem /dev/shm/.
> 
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/include/rte_memory.h |  5 +++
>  lib/librte_eal/linuxapp/eal/eal_memory.c   | 58 ++++++++++++++++++++++++++++--
>  lib/librte_mempool/rte_mempool.c           | 16 ++++-----
>  3 files changed, 69 insertions(+), 10 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
> index 1bed415..9c1effc 100644
> --- a/lib/librte_eal/common/include/rte_memory.h
> +++ b/lib/librte_eal/common/include/rte_memory.h
> @@ -100,6 +100,7 @@ struct rte_memseg {
>  	int32_t socket_id;          /**< NUMA socket ID. */
>  	uint32_t nchannel;          /**< Number of channels. */
>  	uint32_t nrank;             /**< Number of ranks. */
> +	int fd;                     /**< fd used to share this memory */
>  #ifdef RTE_LIBRTE_XEN_DOM0
>  	 /**< store segment MFNs */
>  	uint64_t mfn[DOM0_NUM_MEMBLOCK];
> @@ -128,6 +129,10 @@ int rte_mem_lock_page(const void *virt);
>   */
>  phys_addr_t rte_mem_virt2phy(const void *virt);
> 
> +
> +int
> +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void **paddr);
> +
>  /**
>   * Get the layout of the available physical memory.
>   *
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index ac2745e..9abbfc6 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -80,6 +80,9 @@
>  #include <errno.h>
>  #include <sys/ioctl.h>
>  #include <sys/time.h>
> +#include <mntent.h>
> +#include <sys/mman.h>
> +#include <sys/file.h>
> 
>  #include <rte_log.h>
>  #include <rte_memory.h>
> @@ -143,6 +146,18 @@ rte_mem_lock_page(const void *virt)
>  	return mlock((void*)aligned, page_size);
>  }
> 
> +int
> +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void **paddr)
> +{
> +	struct rte_mem_config *mcfg;
> +	mcfg = rte_eal_get_configuration()->mem_config;
> +
> +	*pfd = mcfg->memseg[index].fd;
> +	*psize = (uint64_t)mcfg->memseg[index].len;
> +	*paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
> +	return 0;
> +}

Wonder who will use that function?
Can't see any references to that function in that patch or next. 

> +
>  /*
>   * Get physical address of any mapped virtual address in the current process.
>   */
> @@ -1044,6 +1059,42 @@ calc_num_pages_per_socket(uint64_t * memory,
>  	return total_num_pages;
>  }
> 
> +static void *
> +rte_eal_shm_create(int *pfd)
> +{
> +	int ret, fd;
> +	char filepath[256];
> +	void *vaddr;
> +	uint64_t size = internal_config.memory;
> +
> +	sprintf(filepath, "/%s_cvio", internal_config.hugefile_prefix);
> +
> +	fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> +	if (fd < 0) {
> +		rte_panic("shm_open %s failed: %s\n", filepath, strerror(errno));
> +	}
> +	ret = flock(fd, LOCK_EX);
> +	if (ret < 0) {
> +		close(fd);
> +		rte_panic("flock %s failed: %s\n", filepath, strerror(errno));
> +	}
> +
> +	ret = ftruncate(fd, size);
> +	if (ret < 0) {
> +		rte_panic("ftruncate failed: %s\n", strerror(errno));
> +	}
> +	/* flag: MAP_HUGETLB */

Any explanation of what that comment means here?
Do you plan to use MAP_HUGETLB in the call below or ...?

> +	vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> +	if (vaddr == MAP_FAILED) {
> +		rte_panic("mmap failed: %s\n", strerror(errno));
> +	}
> +	memset(vaddr, 0, size);
> +	*pfd = fd;
> +
> +	return vaddr;
> +}
> +
> +
>  /*
>   * Prepare physical memory mapping: fill configuration structure with
>   * these infos, return 0 on success.
> @@ -1072,7 +1123,9 @@ rte_eal_hugepage_init(void)
>  	int new_pages_count[MAX_HUGEPAGE_SIZES];
>  #endif
> 
> +#ifndef RTE_VIRTIO_VDEV
>  	test_proc_pagemap_readable();
> +#endif
> 
>  	memset(used_hp, 0, sizeof(used_hp));
> 
> @@ -1081,8 +1134,8 @@ rte_eal_hugepage_init(void)
> 
>  	/* hugetlbfs can be disabled */
>  	if (internal_config.no_hugetlbfs) {
> -		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
> -				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> +		int fd;
> +		addr = rte_eal_shm_create(&fd);

Why do you remove the ability to mmap(/dev/zero) here?
Probably not everyone plans to use --no-hugepages only inside containers.


>  		if (addr == MAP_FAILED) {
>  			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
>  					strerror(errno));
> @@ -1093,6 +1146,7 @@ rte_eal_hugepage_init(void)
>  		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
>  		mcfg->memseg[0].len = internal_config.memory;
>  		mcfg->memseg[0].socket_id = 0;
> +		mcfg->memseg[0].fd = fd;
>  		return 0;
>  	}
> 
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index e57cbbd..8f8852b 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -453,13 +453,6 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
>  		rte_errno = EINVAL;
>  		return NULL;
>  	}
> -
> -	/* check that we have both VA and PA */
> -	if (vaddr != NULL && paddr == NULL) {
> -		rte_errno = EINVAL;
> -		return NULL;
> -	}
> -
>  	/* Check that pg_num and pg_shift parameters are valid. */
>  	if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift > MEMPOOL_PG_SHIFT_MAX) {
>  		rte_errno = EINVAL;
> @@ -596,8 +589,15 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
> 
>  	/* mempool elements in a separate chunk of memory. */
>  	} else {
> +		/* when VA is specified, PA should be specified? */
> +		if (rte_eal_has_hugepages()) {
> +			if (paddr == NULL) {
> +				rte_errno = EINVAL;
> +				return NULL;
> +			}
> +			memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);
> +		}
>  		mp->elt_va_start = (uintptr_t)vaddr;
> -		memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);

Could you explain the reason for that change?
Specifically, why is a mempool over external memory now only allowed for the
hugepages config?
Konstantin

>  	}
> 
>  	mp->elt_va_end = mp->elt_va_start;
> --
> 2.1.4
  
Jianfeng Tan Nov. 8, 2015, 11:18 a.m. UTC | #2
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Saturday, November 7, 2015 12:22 AM
> To: Tan, Jianfeng; dev@dpdk.org
> Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com; mst@redhat.com;
> gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com;
> guohongzhen@huawei.com
> Subject: RE: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory
> initialization process
> 
> Hi,
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jianfeng Tan
> > Sent: Thursday, November 05, 2015 6:31 PM
> > To: dev@dpdk.org
> > Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com;
> > mst@redhat.com; gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> > ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com;
> > guohongzhen@huawei.com
> > Subject: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory
> > initialization process
> >
> > When using virtio for container, we should specify --no-huge so that
> > in memory initialization, shm_open() is used to alloc memory from
> > tmpfs filesystem /dev/shm/.
> >
> > Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> > Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> > ---
......
> > +int
> > +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void
> > +**paddr) {
> > +	struct rte_mem_config *mcfg;
> > +	mcfg = rte_eal_get_configuration()->mem_config;
> > +
> > +	*pfd = mcfg->memseg[index].fd;
> > +	*psize = (uint64_t)mcfg->memseg[index].len;
> > +	*paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
> > +	return 0;
> > +}
> 
> Wonder who will use that function?
> Can't see any references to that function in that patch or next.

This function is used in patch 1/5, when the virtio front end needs to send VHOST_USER_SET_MEM_TABLE to the back end.
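
For reference, a minimal sketch of how such a helper can feed that message. The region layout below mirrors the vhost-user spec; the fill_mem_table() helper and the use of virtual addresses as guest physical addresses are illustrative assumptions, not code from this series:

#include <stdint.h>
#include <rte_memory.h>  /* rte_memseg_info_get(), as added by this patch */

#define VHOST_MEMORY_MAX_NREGIONS 8

/* one region of the VHOST_USER_SET_MEM_TABLE payload (per the vhost-user spec) */
struct vhost_memory_region {
	uint64_t guest_phys_addr;  /* what the backend treats as guest PA */
	uint64_t memory_size;      /* length of the region */
	uint64_t userspace_addr;   /* VA of the region in this process */
	uint64_t mmap_offset;      /* offset into the fd sent alongside */
};

static int
fill_mem_table(struct vhost_memory_region *regions, int *fds)
{
	int i, fd;
	uint64_t size;
	void *addr;

	for (i = 0; i < VHOST_MEMORY_MAX_NREGIONS; i++) {
		if (rte_memseg_info_get(i, &fd, &size, &addr) < 0 || fd < 0)
			break;  /* stop at the first segment without a backing fd */
		/* without hugepages there is no PA, so reuse the VA */
		regions[i].guest_phys_addr = (uint64_t)(uintptr_t)addr;
		regions[i].userspace_addr  = (uint64_t)(uintptr_t)addr;
		regions[i].memory_size     = size;
		regions[i].mmap_offset     = 0;
		fds[i] = fd;  /* the fds travel out of band, via SCM_RIGHTS */
	}
	return i;  /* number of regions filled */
}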

> > +
> >  /*
> >   * Get physical address of any mapped virtual address in the current
> process.
> >   */
> > @@ -1044,6 +1059,42 @@ calc_num_pages_per_socket(uint64_t *
> memory,
> >  	return total_num_pages;
> >  }
> >
> > +static void *
> > +rte_eal_shm_create(int *pfd)
> > +{
> > +	int ret, fd;
> > +	char filepath[256];
> > +	void *vaddr;
> > +	uint64_t size = internal_config.memory;
> > +
> > +	sprintf(filepath, "/%s_cvio", internal_config.hugefile_prefix);
> > +
> > +	fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> > +	if (fd < 0) {
> > +		rte_panic("shm_open %s failed: %s\n", filepath,
> strerror(errno));
> > +	}
> > +	ret = flock(fd, LOCK_EX);
> > +	if (ret < 0) {
> > +		close(fd);
> > +		rte_panic("flock %s failed: %s\n", filepath, strerror(errno));
> > +	}
> > +
> > +	ret = ftruncate(fd, size);
> > +	if (ret < 0) {
> > +		rte_panic("ftruncate failed: %s\n", strerror(errno));
> > +	}
> > +	/* flag: MAP_HUGETLB */
> 
> Any explanation of what that comment means here?
> Do you plan to use MAP_HUGETLB in the call below or ...?

Yes, it's a todo item. shm_open() just uses a tmpfs mounted at /dev/shm, so I wonder whether we can use this flag to make
sure the OS allocates hugepages here if the user would like to use hugepages.
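
A note on that todo: MAP_HUGETLB is meant for anonymous mappings; to get a mapping that is both hugepage-backed and still has an fd to pass over a socket, the file itself has to live on a hugetlbfs mount rather than on /dev/shm. A minimal sketch of that route (the /dev/hugepages mount point and the file name are assumptions):

#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

/* hugepage-backed memory that still has an fd we can pass over a socket */
static void *
huge_shm_create(int *pfd, uint64_t size)
{
	/* /dev/hugepages is the conventional hugetlbfs mount point */
	int fd = open("/dev/hugepages/cvio", O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		return MAP_FAILED;
	/* size must be a multiple of the hugepage size */
	if (ftruncate(fd, size) < 0) {
		close(fd);
		return MAP_FAILED;
	}
	/* no MAP_HUGETLB needed: the backing file is already on hugetlbfs */
	void *vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE,
			MAP_SHARED, fd, 0);
	if (vaddr == MAP_FAILED)
		close(fd);
	else
		*pfd = fd;
	return vaddr;
}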

>>
......
> > @@ -1081,8 +1134,8 @@ rte_eal_hugepage_init(void)
> >
> >  	/* hugetlbfs can be disabled */
> >  	if (internal_config.no_hugetlbfs) {
> > -		addr = mmap(NULL, internal_config.memory, PROT_READ |
> PROT_WRITE,
> > -				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> > +		int fd;
> > +		addr = rte_eal_shm_create(&fd);
> 
> Why do you remove the ability to mmap(/dev/zero) here?
> Probably not everyone plans to use --no-hugepages only inside containers.

From my understanding, the mmap() here is just to allocate some memory, initialized to all zeros; I don't see the relationship
with /dev/zero. rte_eal_shm_create() does everything the original mmap() did, and in addition produces an fd referring to this
chunk of memory. That fd is indispensable in the vhost protocol when VHOST_USER_SET_MEM_TABLE is sent using sendmsg().
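
For reference, that fd travels in SCM_RIGHTS ancillary data next to the normal message payload. A minimal, self-contained sketch of the mechanism (the payload argument is a stand-in, not the real vhost-user message header):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* send a payload plus one fd over a connected AF_UNIX socket */
static int
send_with_fd(int sock, void *payload, size_t len, int fd)
{
	struct iovec iov = { .iov_base = payload, .iov_len = len };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;  /* guarantees proper alignment */
	} u;
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	memset(u.buf, 0, sizeof(u.buf));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;  /* the kernel duplicates the fd for the peer */
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sock, &msg, 0) < 0 ? -1 : 0;
}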

> 
> 
> >  		if (addr == MAP_FAILED) {
> >  			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
> __func__,
> >  					strerror(errno));
> > @@ -1093,6 +1146,7 @@ rte_eal_hugepage_init(void)
> >  		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
> >  		mcfg->memseg[0].len = internal_config.memory;
> >  		mcfg->memseg[0].socket_id = 0;
> > +		mcfg->memseg[0].fd = fd;
> >  		return 0;
> >  	}
> >
> > diff --git a/lib/librte_mempool/rte_mempool.c
> > b/lib/librte_mempool/rte_mempool.c
> > index e57cbbd..8f8852b 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -453,13 +453,6 @@ rte_mempool_xmem_create(const char *name,
> unsigned n, unsigned elt_size,
> >  		rte_errno = EINVAL;
> >  		return NULL;
> >  	}
> > -
> > -	/* check that we have both VA and PA */
> > -	if (vaddr != NULL && paddr == NULL) {
> > -		rte_errno = EINVAL;
> > -		return NULL;
> > -	}
> > -
> >  	/* Check that pg_num and pg_shift parameters are valid. */
> >  	if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift >
> MEMPOOL_PG_SHIFT_MAX) {
> >  		rte_errno = EINVAL;
> > @@ -596,8 +589,15 @@ rte_mempool_xmem_create(const char *name,
> > unsigned n, unsigned elt_size,
> >
> >  	/* mempool elements in a separate chunk of memory. */
> >  	} else {
> > +		/* when VA is specified, PA should be specified? */
> > +		if (rte_eal_has_hugepages()) {
> > +			if (paddr == NULL) {
> > +				rte_errno = EINVAL;
> > +				return NULL;
> > +			}
> > +			memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0])
> * pg_num);
> > +		}
> >  		mp->elt_va_start = (uintptr_t)vaddr;
> > -		memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) *
> pg_num);
> 
> Could you explain the reason for that change?
> Specifically, why is a mempool over external memory now only allowed for the
> hugepages config?
> Konstantin

Oops, you're right! This change was previously for creating an mbuf mempool at a given vaddr without
giving any paddr[]. Now we don't need to care about either vaddr or paddr[], so I should have reverted the
change in this file.

> 
> >  	}
> >
> >  	mp->elt_va_end = mp->elt_va_start;
> > --
> > 2.1.4
  
Ananyev, Konstantin Nov. 9, 2015, 1:32 p.m. UTC | #3
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Saturday, November 7, 2015 12:22 AM
> > To: Tan, Jianfeng; dev@dpdk.org
> > Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com; mst@redhat.com;
> > gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> > ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com;
> > guohongzhen@huawei.com
> > Subject: RE: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory
> > initialization process
> >
> > Hi,
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jianfeng Tan
> > > Sent: Thursday, November 05, 2015 6:31 PM
> > > To: dev@dpdk.org
> > > Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com;
> > > mst@redhat.com; gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> > > ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com;
> > > guohongzhen@huawei.com
> > > Subject: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory
> > > initialization process
> > >
> > > When using virtio for container, we should specify --no-huge so that
> > > in memory initialization, shm_open() is used to alloc memory from
> > > tmpfs filesystem /dev/shm/.
> > >
> > > Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> > > Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> > > ---
> ......
> > > +int
> > > +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void
> > > +**paddr) {
> > > +	struct rte_mem_config *mcfg;
> > > +	mcfg = rte_eal_get_configuration()->mem_config;
> > > +
> > > +	*pfd = mcfg->memseg[index].fd;
> > > +	*psize = (uint64_t)mcfg->memseg[index].len;
> > > +	*paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
> > > +	return 0;
> > > +}
> >
> > Wonder who will use that function?
> > Can't see any references to that function in that patch or next.
> 
> This function is used in patch 1/5, when the virtio front end needs to send VHOST_USER_SET_MEM_TABLE to the back end.

Ok, but then this function should be defined in the patch *before* it is used, not after.
Another thing: probably better to create a struct for all memseg parameters you want to retrieve,
and pass it to the function, instead of several pointers. 
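
For illustration, the struct-based variant could look like the sketch below (the struct and its field names are assumptions, not a committed API); new fields can then be added later without touching every caller:

#include <stdint.h>
/* rte_eal_get_configuration() and struct rte_mem_config come from EAL headers */

struct rte_memseg_info {
	int      fd;    /* fd backing the segment, or -1 */
	uint64_t size;  /* segment length in bytes */
	void    *addr;  /* segment virtual address */
};

int
rte_memseg_info_get(int index, struct rte_memseg_info *info)
{
	const struct rte_mem_config *mcfg =
		rte_eal_get_configuration()->mem_config;

	info->fd   = mcfg->memseg[index].fd;
	info->size = mcfg->memseg[index].len;
	info->addr = mcfg->memseg[index].addr;
	return 0;
}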

> 
> > > +
> > >  /*
> > >   * Get physical address of any mapped virtual address in the current
> > process.
> > >   */
> > > @@ -1044,6 +1059,42 @@ calc_num_pages_per_socket(uint64_t *
> > memory,
> > >  	return total_num_pages;
> > >  }
> > >
> > > +static void *
> > > +rte_eal_shm_create(int *pfd)
> > > +{
> > > +	int ret, fd;
> > > +	char filepath[256];
> > > +	void *vaddr;
> > > +	uint64_t size = internal_config.memory;
> > > +
> > > +	sprintf(filepath, "/%s_cvio", internal_config.hugefile_prefix);
> > > +
> > > +	fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> > > +	if (fd < 0) {
> > > +		rte_panic("shm_open %s failed: %s\n", filepath,
> > strerror(errno));
> > > +	}
> > > +	ret = flock(fd, LOCK_EX);
> > > +	if (ret < 0) {
> > > +		close(fd);
> > > +		rte_panic("flock %s failed: %s\n", filepath, strerror(errno));
> > > +	}
> > > +
> > > +	ret = ftruncate(fd, size);
> > > +	if (ret < 0) {
> > > +		rte_panic("ftruncate failed: %s\n", strerror(errno));
> > > +	}
> > > +	/* flag: MAP_HUGETLB */
> >
> > Any explanation of what that comment means here?
> > Do you plan to use MAP_HUGETLB in the call below or ...?
> 
> Yes, it's a todo item. shm_open() just uses a tmpfs mounted at /dev/shm, so I wonder whether we can use this flag to make
> sure the OS allocates hugepages here if the user would like to use hugepages.
> 
> >>
> ......
> > > @@ -1081,8 +1134,8 @@ rte_eal_hugepage_init(void)
> > >
> > >  	/* hugetlbfs can be disabled */
> > >  	if (internal_config.no_hugetlbfs) {
> > > -		addr = mmap(NULL, internal_config.memory, PROT_READ |
> > PROT_WRITE,
> > > -				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> > > +		int fd;
> > > +		addr = rte_eal_shm_create(&fd);
> >
> > Why do you remove the ability to mmap(/dev/zero) here?
> > Probably not everyone plans to use --no-hugepages only inside containers.
> 
> From my understanding, the mmap() here is just to allocate some memory, initialized to all zeros; I don't see the relationship
> with /dev/zero.

I used it here as a synonym for mmap(..., MAP_ANONYMOUS, ...).

> rte_eal_shm_create() does everything the original mmap() did, and in addition produces an fd referring to this
> chunk of memory. That fd is indispensable in the vhost protocol when VHOST_USER_SET_MEM_TABLE is sent using sendmsg().


My question was:
Right now for --no-hugepages it allocates a chunk of memory that is not backed by any file and is private to the process:

addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); 

You changed it to shared memory region allocation:

fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

I understand that you need it for your containers stuff - but I suppose you have to add
new functionality without breaking the existing one.
There could be other users of --no-hugepages, and they probably want the existing behaviour.
Konstantin

> 
> >
> >
> > >  		if (addr == MAP_FAILED) {
> > >  			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
> > __func__,
> > >  					strerror(errno));
> > > @@ -1093,6 +1146,7 @@ rte_eal_hugepage_init(void)
> > >  		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
> > >  		mcfg->memseg[0].len = internal_config.memory;
> > >  		mcfg->memseg[0].socket_id = 0;
> > > +		mcfg->memseg[0].fd = fd;
> > >  		return 0;
> > >  	}
> > >
> > > diff --git a/lib/librte_mempool/rte_mempool.c
> > > b/lib/librte_mempool/rte_mempool.c
> > > index e57cbbd..8f8852b 100644
> > > --- a/lib/librte_mempool/rte_mempool.c
> > > +++ b/lib/librte_mempool/rte_mempool.c
> > > @@ -453,13 +453,6 @@ rte_mempool_xmem_create(const char *name,
> > unsigned n, unsigned elt_size,
> > >  		rte_errno = EINVAL;
> > >  		return NULL;
> > >  	}
> > > -
> > > -	/* check that we have both VA and PA */
> > > -	if (vaddr != NULL && paddr == NULL) {
> > > -		rte_errno = EINVAL;
> > > -		return NULL;
> > > -	}
> > > -
> > >  	/* Check that pg_num and pg_shift parameters are valid. */
> > >  	if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift >
> > MEMPOOL_PG_SHIFT_MAX) {
> > >  		rte_errno = EINVAL;
> > > @@ -596,8 +589,15 @@ rte_mempool_xmem_create(const char *name,
> > > unsigned n, unsigned elt_size,
> > >
> > >  	/* mempool elements in a separate chunk of memory. */
> > >  	} else {
> > > +		/* when VA is specified, PA should be specified? */
> > > +		if (rte_eal_has_hugepages()) {
> > > +			if (paddr == NULL) {
> > > +				rte_errno = EINVAL;
> > > +				return NULL;
> > > +			}
> > > +			memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0])
> > * pg_num);
> > > +		}
> > >  		mp->elt_va_start = (uintptr_t)vaddr;
> > > -		memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) *
> > pg_num);
> >
> > Could you explain the reason for that change?
> > Specifically, why is a mempool over external memory now only allowed for the
> > hugepages config?
> > Konstantin
> 
> Oops, you're right! This change was previously for creating an mbuf mempool at a given vaddr without
> giving any paddr[]. Now we don't need to care about either vaddr or paddr[], so I should have reverted the
> change in this file.
> 
> >
> > >  	}
> > >
> > >  	mp->elt_va_end = mp->elt_va_start;
> > > --
> > > 2.1.4
  
Jianfeng Tan Nov. 9, 2015, 2:13 p.m. UTC | #4
> ......
> > > > +int
> > > > +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void
> > > > +**paddr) {
> > > > +	struct rte_mem_config *mcfg;
> > > > +	mcfg = rte_eal_get_configuration()->mem_config;
> > > > +
> > > > +	*pfd = mcfg->memseg[index].fd;
> > > > +	*psize = (uint64_t)mcfg->memseg[index].len;
> > > > +	*paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
> > > > +	return 0;
> > > > +}
> > >
> > > Wonder who will use that function?
> > > Can't see any references to that function in that patch or next.
> >
> > This function is used in patch 1/5, when the virtio front end needs to send
> > VHOST_USER_SET_MEM_TABLE to the back end.
> 
> Ok, but then this function should be defined in the patch *before* it is used,
> not after.
> Another thing: probably better to create a struct for all memseg parameters
> you want to retrieve, and pass it to the function, instead of several pointers.

Very good suggestion! I'll fix it in the next version.

> > > > +		addr = rte_eal_shm_create(&fd);
> > >
> > > Why do you remove the ability to mmap(/dev/zero) here?
> > > Probably not everyone plans to use --no-hugepages only inside containers.
> >
> > From my understanding, the mmap() here is just to allocate some memory,
> > initialized to all zeros; I don't see the relationship
> > with /dev/zero.
> 
> I used it here as a synonym for mmap(..., MAP_ANONYMOUS, ...).
> 
> > rte_eal_shm_create() does everything the original mmap() did, and in addition
> > produces an fd referring to this chunk of memory. That fd is indispensable in
> > the vhost protocol when VHOST_USER_SET_MEM_TABLE is sent using sendmsg().
> 
> 
> My question was:
> Right now for --no-hugepages it allocates a chunk of memory that is not
> backed by any file and is private to the process:
> 
> addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
> MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> 
> You changed it to shared memory region allocation:
> 
> fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> 
> I understand that you need it for your containers stuff - but I suppose you
> have to add new functionality without breaking the existing one. There could be
> other users of --no-hugepages, and they probably want the existing behaviour.
> Konstantin

Thank you for the patient analysis, and I agree with you. I should not have broken
compatibility with existing applications. I'd like to redesign this in the next version.
Maybe a new command-line option is necessary here.
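
A minimal sketch of that redesign inside rte_eal_hugepage_init(), keeping the anonymous mapping as the default --no-huge behaviour (the shm_for_vdev flag is hypothetical, and -1 replaces the original's fd argument of 0 as the portable value for anonymous mappings):

	/* hugetlbfs can be disabled */
	if (internal_config.no_hugetlbfs) {
		int fd = -1;

		if (internal_config.shm_for_vdev)       /* hypothetical new EAL option */
			addr = rte_eal_shm_create(&fd); /* fd-backed, shareable with vhost */
		else
			addr = mmap(NULL, internal_config.memory,
					PROT_READ | PROT_WRITE,
					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (addr == MAP_FAILED) {
			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
					strerror(errno));
			return -1;
		}
		mcfg->memseg[0].fd = fd;  /* stays -1 in the anonymous case */
		/* ... remaining memseg[0] setup as in the original function ... */
		return 0;
	}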

Jianfeng

.....
> > > > --
> > > > 2.1.4
  

Patch

diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 1bed415..9c1effc 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -100,6 +100,7 @@  struct rte_memseg {
 	int32_t socket_id;          /**< NUMA socket ID. */
 	uint32_t nchannel;          /**< Number of channels. */
 	uint32_t nrank;             /**< Number of ranks. */
+	int fd;                     /**< fd used to share this memory */
 #ifdef RTE_LIBRTE_XEN_DOM0
 	 /**< store segment MFNs */
 	uint64_t mfn[DOM0_NUM_MEMBLOCK];
@@ -128,6 +129,10 @@  int rte_mem_lock_page(const void *virt);
  */
 phys_addr_t rte_mem_virt2phy(const void *virt);
 
+
+int
+rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void **paddr);
+
 /**
  * Get the layout of the available physical memory.
  *
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index ac2745e..9abbfc6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,9 @@ 
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <mntent.h>
+#include <sys/mman.h>
+#include <sys/file.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -143,6 +146,18 @@  rte_mem_lock_page(const void *virt)
 	return mlock((void*)aligned, page_size);
 }
 
+int
+rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void **paddr)
+{
+	struct rte_mem_config *mcfg;
+	mcfg = rte_eal_get_configuration()->mem_config;
+
+	*pfd = mcfg->memseg[index].fd;
+	*psize = (uint64_t)mcfg->memseg[index].len;
+	*paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
+	return 0;
+}
+
 /*
  * Get physical address of any mapped virtual address in the current process.
  */
@@ -1044,6 +1059,42 @@  calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static void *
+rte_eal_shm_create(int *pfd)
+{
+	int ret, fd;
+	char filepath[256];
+	void *vaddr;
+	uint64_t size = internal_config.memory;
+
+	sprintf(filepath, "/%s_cvio", internal_config.hugefile_prefix);
+
+	fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+	if (fd < 0) {
+		rte_panic("shm_open %s failed: %s\n", filepath, strerror(errno));
+	}
+	ret = flock(fd, LOCK_EX);
+	if (ret < 0) {
+		close(fd);
+		rte_panic("flock %s failed: %s\n", filepath, strerror(errno));
+	}
+
+	ret = ftruncate(fd, size);
+	if (ret < 0) {
+		rte_panic("ftruncate failed: %s\n", strerror(errno));
+	}
+	/* flag: MAP_HUGETLB */
+	vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (vaddr == MAP_FAILED) {
+		rte_panic("mmap failed: %s\n", strerror(errno));
+	}
+	memset(vaddr, 0, size);
+	*pfd = fd;
+
+	return vaddr;
+}
+
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1072,7 +1123,9 @@  rte_eal_hugepage_init(void)
 	int new_pages_count[MAX_HUGEPAGE_SIZES];
 #endif
 
+#ifndef RTE_VIRTIO_VDEV
 	test_proc_pagemap_readable();
+#endif
 
 	memset(used_hp, 0, sizeof(used_hp));
 
@@ -1081,8 +1134,8 @@  rte_eal_hugepage_init(void)
 
 	/* hugetlbfs can be disabled */
 	if (internal_config.no_hugetlbfs) {
-		addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
-				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+		int fd;
+		addr = rte_eal_shm_create(&fd);
 		if (addr == MAP_FAILED) {
 			RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
 					strerror(errno));
@@ -1093,6 +1146,7 @@  rte_eal_hugepage_init(void)
 		mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
 		mcfg->memseg[0].len = internal_config.memory;
 		mcfg->memseg[0].socket_id = 0;
+		mcfg->memseg[0].fd = fd;
 		return 0;
 	}
 
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index e57cbbd..8f8852b 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -453,13 +453,6 @@  rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
 		rte_errno = EINVAL;
 		return NULL;
 	}
-
-	/* check that we have both VA and PA */
-	if (vaddr != NULL && paddr == NULL) {
-		rte_errno = EINVAL;
-		return NULL;
-	}
-
 	/* Check that pg_num and pg_shift parameters are valid. */
 	if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift > MEMPOOL_PG_SHIFT_MAX) {
 		rte_errno = EINVAL;
@@ -596,8 +589,15 @@  rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
 
 	/* mempool elements in a separate chunk of memory. */
 	} else {
+		/* when VA is specified, PA should be specified? */
+		if (rte_eal_has_hugepages()) {
+			if (paddr == NULL) {
+				rte_errno = EINVAL;
+				return NULL;
+			}
+			memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);
+		}
 		mp->elt_va_start = (uintptr_t)vaddr;
-		memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);
 	}
 
 	mp->elt_va_end = mp->elt_va_start;