[dpdk-dev,RFC,4/5] virtio/container: adjust memory initialization process
Commit Message
When using virtio for containers, we should specify --no-huge so
that, during memory initialization, shm_open() is used to allocate
memory from the tmpfs filesystem mounted at /dev/shm/.
Signed-off-by: Huawei Xie <huawei.xie@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
lib/librte_eal/common/include/rte_memory.h | 5 +++
lib/librte_eal/linuxapp/eal/eal_memory.c | 58 ++++++++++++++++++++++++++++--
lib/librte_mempool/rte_mempool.c | 16 ++++-----
3 files changed, 69 insertions(+), 10 deletions(-)
Comments
Hi,
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jianfeng Tan
> Sent: Thursday, November 05, 2015 6:31 PM
> To: dev@dpdk.org
> Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com; mst@redhat.com; gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com; guohongzhen@huawei.com
> Subject: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory initialization process
>
> When using virtio for container, we should specify --no-huge so
> that in memory initialization, shm_open() is used to alloc memory
> from tmpfs filesystem /dev/shm/.
>
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
> lib/librte_eal/common/include/rte_memory.h | 5 +++
> lib/librte_eal/linuxapp/eal/eal_memory.c | 58 ++++++++++++++++++++++++++++--
> lib/librte_mempool/rte_mempool.c | 16 ++++-----
> 3 files changed, 69 insertions(+), 10 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
> index 1bed415..9c1effc 100644
> --- a/lib/librte_eal/common/include/rte_memory.h
> +++ b/lib/librte_eal/common/include/rte_memory.h
> @@ -100,6 +100,7 @@ struct rte_memseg {
> int32_t socket_id; /**< NUMA socket ID. */
> uint32_t nchannel; /**< Number of channels. */
> uint32_t nrank; /**< Number of ranks. */
> + int fd; /**< fd used to share this memory */
> #ifdef RTE_LIBRTE_XEN_DOM0
> /**< store segment MFNs */
> uint64_t mfn[DOM0_NUM_MEMBLOCK];
> @@ -128,6 +129,10 @@ int rte_mem_lock_page(const void *virt);
> */
> phys_addr_t rte_mem_virt2phy(const void *virt);
>
> +
> +int
> +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void **paddr);
> +
> /**
> * Get the layout of the available physical memory.
> *
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index ac2745e..9abbfc6 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -80,6 +80,9 @@
> #include <errno.h>
> #include <sys/ioctl.h>
> #include <sys/time.h>
> +#include <mntent.h>
> +#include <sys/mman.h>
> +#include <sys/file.h>
>
> #include <rte_log.h>
> #include <rte_memory.h>
> @@ -143,6 +146,18 @@ rte_mem_lock_page(const void *virt)
> return mlock((void*)aligned, page_size);
> }
>
> +int
> +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void **paddr)
> +{
> + struct rte_mem_config *mcfg;
> + mcfg = rte_eal_get_configuration()->mem_config;
> +
> + *pfd = mcfg->memseg[index].fd;
> + *psize = (uint64_t)mcfg->memseg[index].len;
> + *paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
> + return 0;
> +}
I wonder who will use that function?
I can't see any references to it in this patch or the next one.
> +
> /*
> * Get physical address of any mapped virtual address in the current process.
> */
> @@ -1044,6 +1059,42 @@ calc_num_pages_per_socket(uint64_t * memory,
> return total_num_pages;
> }
>
> +static void *
> +rte_eal_shm_create(int *pfd)
> +{
> + int ret, fd;
> + char filepath[256];
> + void *vaddr;
> + uint64_t size = internal_config.memory;
> +
> + sprintf(filepath, "/%s_cvio", internal_config.hugefile_prefix);
> +
> + fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> + if (fd < 0) {
> + rte_panic("shm_open %s failed: %s\n", filepath, strerror(errno));
> + }
> + ret = flock(fd, LOCK_EX);
> + if (ret < 0) {
> + close(fd);
> + rte_panic("flock %s failed: %s\n", filepath, strerror(errno));
> + }
> +
> + ret = ftruncate(fd, size);
> + if (ret < 0) {
> + rte_panic("ftruncate failed: %s\n", strerror(errno));
> + }
> + /* flag: MAP_HUGETLB */
Could you explain what that comment means here?
Do you plan to use MAP_HUGETLB in the call below, or ...?
> + vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> + if (vaddr == MAP_FAILED) {
> + rte_panic("mmap failed: %s\n", strerror(errno));
> + }
> + memset(vaddr, 0, size);
> + *pfd = fd;
> +
> + return vaddr;
> +}
> +
> +
> /*
> * Prepare physical memory mapping: fill configuration structure with
> * these infos, return 0 on success.
> @@ -1072,7 +1123,9 @@ rte_eal_hugepage_init(void)
> int new_pages_count[MAX_HUGEPAGE_SIZES];
> #endif
>
> +#ifndef RTE_VIRTIO_VDEV
> test_proc_pagemap_readable();
> +#endif
>
> memset(used_hp, 0, sizeof(used_hp));
>
> @@ -1081,8 +1134,8 @@ rte_eal_hugepage_init(void)
>
> /* hugetlbfs can be disabled */
> if (internal_config.no_hugetlbfs) {
> - addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
> - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> + int fd;
> + addr = rte_eal_shm_create(&fd);
Why do you remove the ability to mmap(/dev/zero) here?
Probably not everyone plans to use --no-hugepages only inside containers.
> if (addr == MAP_FAILED) {
> RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
> strerror(errno));
> @@ -1093,6 +1146,7 @@ rte_eal_hugepage_init(void)
> mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
> mcfg->memseg[0].len = internal_config.memory;
> mcfg->memseg[0].socket_id = 0;
> + mcfg->memseg[0].fd = fd;
> return 0;
> }
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index e57cbbd..8f8852b 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -453,13 +453,6 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
> rte_errno = EINVAL;
> return NULL;
> }
> -
> - /* check that we have both VA and PA */
> - if (vaddr != NULL && paddr == NULL) {
> - rte_errno = EINVAL;
> - return NULL;
> - }
> -
> /* Check that pg_num and pg_shift parameters are valid. */
> if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift > MEMPOOL_PG_SHIFT_MAX) {
> rte_errno = EINVAL;
> @@ -596,8 +589,15 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
>
> /* mempool elements in a separate chunk of memory. */
> } else {
> + /* when VA is specified, PA should be specified? */
> + if (rte_eal_has_hugepages()) {
> + if (paddr == NULL) {
> + rte_errno = EINVAL;
> + return NULL;
> + }
> + memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);
> + }
> mp->elt_va_start = (uintptr_t)vaddr;
> - memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);
Could you explain the reason for that change?
Especially, why is a mempool over external memory now only allowed with the hugepages config?
Konstantin
> }
>
> mp->elt_va_end = mp->elt_va_start;
> --
> 2.1.4
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Saturday, November 7, 2015 12:22 AM
> To: Tan, Jianfeng; dev@dpdk.org
> Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com; mst@redhat.com;
> gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com;
> guohongzhen@huawei.com
> Subject: RE: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory
> initialization process
>
> Hi,
>
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jianfeng Tan
> > Sent: Thursday, November 05, 2015 6:31 PM
> > To: dev@dpdk.org
> > Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com;
> > mst@redhat.com; gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> > ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com;
> > guohongzhen@huawei.com
> > Subject: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory
> > initialization process
> >
> > When using virtio for container, we should specify --no-huge so that
> > in memory initialization, shm_open() is used to alloc memory from
> > tmpfs filesystem /dev/shm/.
> >
> > Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> > Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> > ---
......
> > +int
> > +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void
> > +**paddr) {
> > + struct rte_mem_config *mcfg;
> > + mcfg = rte_eal_get_configuration()->mem_config;
> > +
> > + *pfd = mcfg->memseg[index].fd;
> > + *psize = (uint64_t)mcfg->memseg[index].len;
> > + *paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
> > + return 0;
> > +}
>
> Wonder who will use that function?
> Can't see any references to that function in that patch or next.
This function is used in 1/5, when the virtio front end needs to send VHOST_USER_SET_MEM_TABLE to the back end.
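For illustration, a minimal sketch of how 1/5 might consume it when building that message (the region struct below is hypothetical, not the actual definition from that patch):

    #include <stdint.h>
    #include <rte_memory.h>   /* rte_memseg_info_get() */

    struct vhost_mem_region {         /* illustrative layout only */
            uint64_t guest_phys_addr;
            uint64_t memory_size;
            uint64_t userspace_addr;
    };

    static void
    fill_region(int index, struct vhost_mem_region *r, int *fd)
    {
            uint64_t size;
            void *addr;

            rte_memseg_info_get(index, fd, &size, &addr);
            /* with --no-huge, the VA doubles as the "guest" address */
            r->guest_phys_addr = (uint64_t)(uintptr_t)addr;
            r->userspace_addr = (uint64_t)(uintptr_t)addr;
            r->memory_size = size;
            /* *fd is then passed along with the message over the socket */
    }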
> > +
> > /*
> > * Get physical address of any mapped virtual address in the current
> process.
> > */
> > @@ -1044,6 +1059,42 @@ calc_num_pages_per_socket(uint64_t *
> memory,
> > return total_num_pages;
> > }
> >
> > +static void *
> > +rte_eal_shm_create(int *pfd)
> > +{
> > + int ret, fd;
> > + char filepath[256];
> > + void *vaddr;
> > + uint64_t size = internal_config.memory;
> > +
> > + sprintf(filepath, "/%s_cvio", internal_config.hugefile_prefix);
> > +
> > + fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> > + if (fd < 0) {
> > + rte_panic("shm_open %s failed: %s\n", filepath,
> strerror(errno));
> > + }
> > + ret = flock(fd, LOCK_EX);
> > + if (ret < 0) {
> > + close(fd);
> > + rte_panic("flock %s failed: %s\n", filepath, strerror(errno));
> > + }
> > +
> > + ret = ftruncate(fd, size);
> > + if (ret < 0) {
> > + rte_panic("ftruncate failed: %s\n", strerror(errno));
> > + }
> > + /* flag: MAP_HUGETLB */
>
> Any explanation what that comment means here?
> Do you plan to use MAP_HUGETLb in the call below or ...?
Yes, it's a TODO item. shm_open() just uses a tmpfs mounted at /dev/shm, so I wonder whether we can use this flag to make
sure the OS allocates hugepages here, if the user would like to use hugepages.
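For reference, a hedged sketch of the constraint (my reading, not something stated in the patch): Linux rejects MAP_HUGETLB for fds that do not come from hugetlbfs, so the flag cannot simply be OR-ed into the fd-backed mmap() above. An anonymous mapping accepts it, but then there is no fd left to hand to vhost:

    /* anonymous shared hugepage mapping: works, but has no backing fd */
    void *va = mmap(NULL, size, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    /* a tmpfs fd from shm_open() combined with MAP_HUGETLB would
     * instead fail with EINVAL */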
......
> > @@ -1081,8 +1134,8 @@ rte_eal_hugepage_init(void)
> >
> > /* hugetlbfs can be disabled */
> > if (internal_config.no_hugetlbfs) {
> > - addr = mmap(NULL, internal_config.memory, PROT_READ |
> PROT_WRITE,
> > - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> > + int fd;
> > + addr = rte_eal_shm_create(&fd);
>
> Why do you remove ability to map(dev/zero) here?
> Probably not everyone plan to use --no-hugepages only inside containers.
From my understanding, the mmap here just allocates some memory, initialized to all zeroes; I don't see the relationship
with /dev/zero. rte_eal_shm_create() does what the original code did, plus it produces an fd referring to this chunk of
memory. This fd is indispensable in the vhost protocol, when VHOST_USER_SET_MEM_TABLE is sent using sendmsg().
>
>
> > if (addr == MAP_FAILED) {
> > RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
> __func__,
> > strerror(errno));
> > @@ -1093,6 +1146,7 @@ rte_eal_hugepage_init(void)
> > mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
> > mcfg->memseg[0].len = internal_config.memory;
> > mcfg->memseg[0].socket_id = 0;
> > + mcfg->memseg[0].fd = fd;
> > return 0;
> > }
> >
> > diff --git a/lib/librte_mempool/rte_mempool.c
> > b/lib/librte_mempool/rte_mempool.c
> > index e57cbbd..8f8852b 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -453,13 +453,6 @@ rte_mempool_xmem_create(const char *name,
> unsigned n, unsigned elt_size,
> > rte_errno = EINVAL;
> > return NULL;
> > }
> > -
> > - /* check that we have both VA and PA */
> > - if (vaddr != NULL && paddr == NULL) {
> > - rte_errno = EINVAL;
> > - return NULL;
> > - }
> > -
> > /* Check that pg_num and pg_shift parameters are valid. */
> > if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift >
> MEMPOOL_PG_SHIFT_MAX) {
> > rte_errno = EINVAL;
> > @@ -596,8 +589,15 @@ rte_mempool_xmem_create(const char *name,
> > unsigned n, unsigned elt_size,
> >
> > /* mempool elements in a separate chunk of memory. */
> > } else {
> > + /* when VA is specified, PA should be specified? */
> > + if (rte_eal_has_hugepages()) {
> > + if (paddr == NULL) {
> > + rte_errno = EINVAL;
> > + return NULL;
> > + }
> > + memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0])
> * pg_num);
> > + }
> > mp->elt_va_start = (uintptr_t)vaddr;
> > - memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) *
> pg_num);
>
> Could you explain the reason for that change?
> Specially why mempool over external memory now only allowed for
> hugepages config?
> Konstantin
Oops, you're right! This change was previously for creating an mbuf mempool at a given vaddr without supplying
any paddr[]. Now we don't need to care about either vaddr or paddr[], so I should have reverted the
change in this file.
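That is, the next version should simply restore the check that this patch removed:

    /* check that we have both VA and PA */
    if (vaddr != NULL && paddr == NULL) {
            rte_errno = EINVAL;
            return NULL;
    }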
>
> > }
> >
> > mp->elt_va_end = mp->elt_va_start;
> > --
> > 2.1.4
>
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Saturday, November 7, 2015 12:22 AM
> > To: Tan, Jianfeng; dev@dpdk.org
> > Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com; mst@redhat.com;
> > gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> > ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com;
> > guohongzhen@huawei.com
> > Subject: RE: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory
> > initialization process
> >
> > Hi,
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jianfeng Tan
> > > Sent: Thursday, November 05, 2015 6:31 PM
> > > To: dev@dpdk.org
> > > Cc: nakajima.yoshihiro@lab.ntt.co.jp; zhbzg@huawei.com;
> > > mst@redhat.com; gaoxiaoqiu@huawei.com; oscar.zhangbo@huawei.com;
> > > ann.zhuangyanying@huawei.com; zhoujingbin@huawei.com;
> > > guohongzhen@huawei.com
> > > Subject: [dpdk-dev] [RFC 4/5] virtio/container: adjust memory
> > > initialization process
> > >
> > > When using virtio for container, we should specify --no-huge so that
> > > in memory initialization, shm_open() is used to alloc memory from
> > > tmpfs filesystem /dev/shm/.
> > >
> > > Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> > > Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> > > ---
> ......
> > > +int
> > > +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void
> > > +**paddr) {
> > > + struct rte_mem_config *mcfg;
> > > + mcfg = rte_eal_get_configuration()->mem_config;
> > > +
> > > + *pfd = mcfg->memseg[index].fd;
> > > + *psize = (uint64_t)mcfg->memseg[index].len;
> > > + *paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
> > > + return 0;
> > > +}
> >
> > Wonder who will use that function?
> > Can't see any references to that function in that patch or next.
>
> This function is used in 1/5, when virtio front end needs to send VHOST_USER_SET_MEM_TABLE to back end.
Ok, but then this function should be defined in the patch *before* it is used, not after.
Another thing: it is probably better to create a struct for all the memseg parameters
you want to retrieve, and pass that to the function, instead of several pointers.
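Something like this, perhaps (names illustrative only):

    struct rte_memseg_info {
            int      fd;    /* fd backing the segment */
            uint64_t size;  /* segment length */
            void    *addr;  /* mapped virtual address */
    };

    int rte_memseg_info_get(int index, struct rte_memseg_info *info);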
>
> > > +
> > > /*
> > > * Get physical address of any mapped virtual address in the current
> > process.
> > > */
> > > @@ -1044,6 +1059,42 @@ calc_num_pages_per_socket(uint64_t *
> > memory,
> > > return total_num_pages;
> > > }
> > >
> > > +static void *
> > > +rte_eal_shm_create(int *pfd)
> > > +{
> > > + int ret, fd;
> > > + char filepath[256];
> > > + void *vaddr;
> > > + uint64_t size = internal_config.memory;
> > > +
> > > + sprintf(filepath, "/%s_cvio", internal_config.hugefile_prefix);
> > > +
> > > + fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
> > > + if (fd < 0) {
> > > + rte_panic("shm_open %s failed: %s\n", filepath,
> > strerror(errno));
> > > + }
> > > + ret = flock(fd, LOCK_EX);
> > > + if (ret < 0) {
> > > + close(fd);
> > > + rte_panic("flock %s failed: %s\n", filepath, strerror(errno));
> > > + }
> > > +
> > > + ret = ftruncate(fd, size);
> > > + if (ret < 0) {
> > > + rte_panic("ftruncate failed: %s\n", strerror(errno));
> > > + }
> > > + /* flag: MAP_HUGETLB */
> >
> > Any explanation what that comment means here?
> > Do you plan to use MAP_HUGETLb in the call below or ...?
>
> Yes, it's a todo item. Shm_open() just uses a tmpfs mounted at /dev/shm. So I wonder maybe we can use this flag to make
> sure os allocates hugepages here if user would like to use hugepages.
>
> >>
> ......
> > > @@ -1081,8 +1134,8 @@ rte_eal_hugepage_init(void)
> > >
> > > /* hugetlbfs can be disabled */
> > > if (internal_config.no_hugetlbfs) {
> > > - addr = mmap(NULL, internal_config.memory, PROT_READ |
> > PROT_WRITE,
> > > - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> > > + int fd;
> > > + addr = rte_eal_shm_create(&fd);
> >
> > Why do you remove ability to map(dev/zero) here?
> > Probably not everyone plan to use --no-hugepages only inside containers.
>
> From my understanding, mmap here is just to allocate some memory, which is initialized to be all zero. I cannot understand what's
> the relationship with /dev/zero.
I used it here as a synonym for mmap(..., MAP_ANONYMOUS, ...).
> rte_eal_shm_create() can do the original function, plus it generates a fd to point to this chunk of
> memory. This fd is indispensable in vhost protocol when VHOST_USER_SET_MEM_TABLE using sendmsg().
My question was:
Right now, for --no-hugepages, it allocates a chunk of memory that is not backed by any file and is private to the process:

addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);

You changed it to a shared memory region allocation:

fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

I understand that you need it for your containers stuff, but I suppose you have to add
the new functionality without breaking the existing one.
There could be other users of --no-hugepages, and they probably want the existing behaviour.
Konstantin
>
> >
> >
> > > if (addr == MAP_FAILED) {
> > > RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n",
> > __func__,
> > > strerror(errno));
> > > @@ -1093,6 +1146,7 @@ rte_eal_hugepage_init(void)
> > > mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
> > > mcfg->memseg[0].len = internal_config.memory;
> > > mcfg->memseg[0].socket_id = 0;
> > > + mcfg->memseg[0].fd = fd;
> > > return 0;
> > > }
> > >
> > > diff --git a/lib/librte_mempool/rte_mempool.c
> > > b/lib/librte_mempool/rte_mempool.c
> > > index e57cbbd..8f8852b 100644
> > > --- a/lib/librte_mempool/rte_mempool.c
> > > +++ b/lib/librte_mempool/rte_mempool.c
> > > @@ -453,13 +453,6 @@ rte_mempool_xmem_create(const char *name,
> > unsigned n, unsigned elt_size,
> > > rte_errno = EINVAL;
> > > return NULL;
> > > }
> > > -
> > > - /* check that we have both VA and PA */
> > > - if (vaddr != NULL && paddr == NULL) {
> > > - rte_errno = EINVAL;
> > > - return NULL;
> > > - }
> > > -
> > > /* Check that pg_num and pg_shift parameters are valid. */
> > > if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift >
> > MEMPOOL_PG_SHIFT_MAX) {
> > > rte_errno = EINVAL;
> > > @@ -596,8 +589,15 @@ rte_mempool_xmem_create(const char *name,
> > > unsigned n, unsigned elt_size,
> > >
> > > /* mempool elements in a separate chunk of memory. */
> > > } else {
> > > + /* when VA is specified, PA should be specified? */
> > > + if (rte_eal_has_hugepages()) {
> > > + if (paddr == NULL) {
> > > + rte_errno = EINVAL;
> > > + return NULL;
> > > + }
> > > + memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0])
> > * pg_num);
> > > + }
> > > mp->elt_va_start = (uintptr_t)vaddr;
> > > - memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) *
> > pg_num);
> >
> > Could you explain the reason for that change?
> > Specially why mempool over external memory now only allowed for
> > hugepages config?
> > Konstantin
>
> Oops, you're right! This change was previously for creating a mbuf mempool at a given vaddr and without
> giving any paddr[]. And now we don't need to care about neither vaddr nor paddr[] so I should have reverted
> change in this file.
>
> >
> > > }
> > >
> > > mp->elt_va_end = mp->elt_va_start;
> > > --
> > > 2.1.4
> ......
> > > > +int
> > > > +rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void
> > > > +**paddr) {
> > > > + struct rte_mem_config *mcfg;
> > > > + mcfg = rte_eal_get_configuration()->mem_config;
> > > > +
> > > > + *pfd = mcfg->memseg[index].fd;
> > > > + *psize = (uint64_t)mcfg->memseg[index].len;
> > > > + *paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
> > > > + return 0;
> > > > +}
> > >
> > > Wonder who will use that function?
> > > Can't see any references to that function in that patch or next.
> >
> > This function is used in 1/5, when virtio front end needs to send
> VHOST_USER_SET_MEM_TABLE to back end.
>
> Ok, but hen this function should be defined in the patch *before* it is used,
> not after.
> Another thing: probably better to create a struct for all memseg parameters
> you want to retrieve, and pass it to the function, instead of several pointers.
Very good suggestion! I'll fix it in the next version.
> > > > + addr = rte_eal_shm_create(&fd);
> > >
> > > Why do you remove ability to map(dev/zero) here?
> > > Probably not everyone plan to use --no-hugepages only inside containers.
> >
> > From my understanding, mmap here is just to allocate some memory,
> > which is initialized to be all zero. I cannot understand what's the
> relationship with /dev/zero.
>
> I used it here as a synonym for mmap(, ..., MAP_ANONYMOUS,...).
>
> rte_eal_shm_create() can do the original function, plus it generates a fd to
> point to this chunk of
> > memory. This fd is indispensable in vhost protocol when
> VHOST_USER_SET_MEM_TABLE using sendmsg().
>
>
> My question was:
> Right now for --no-hugepages it allocates a chunk of memory that is not
> backed-up by any file and is private to the process:
>
> addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
> MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
>
> You changed it to shared memory region allocation:
>
> fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); addr =
> mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>
> I understand that you need it for your containers stuff - but I suppose you
> have to add new functionality without breaking existing one> There could be
> other users of --no-hugepages and they probably want existing behaviour.
> Konstantin
Thank you for the patient analysis; I agree with you. I should not have broken
compatibility with existing applications. I'd like to redesign this in the next version.
Maybe a new command-line option is necessary here.
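For example, something along these lines (the option name here is purely hypothetical):

    int fd = -1;

    if (internal_config.shared_mem)  /* hypothetical new EAL option */
            addr = rte_eal_shm_create(&fd);
    else
            addr = mmap(NULL, internal_config.memory,
                        PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
    mcfg->memseg[0].fd = fd;         /* -1 when not shm-backed */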
Jianfeng
.....
> > > > --
> > > > 2.1.4
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -100,6 +100,7 @@ struct rte_memseg {
int32_t socket_id; /**< NUMA socket ID. */
uint32_t nchannel; /**< Number of channels. */
uint32_t nrank; /**< Number of ranks. */
+ int fd; /**< fd used to share this memory */
#ifdef RTE_LIBRTE_XEN_DOM0
/**< store segment MFNs */
uint64_t mfn[DOM0_NUM_MEMBLOCK];
@@ -128,6 +129,10 @@ int rte_mem_lock_page(const void *virt);
*/
phys_addr_t rte_mem_virt2phy(const void *virt);
+
+int
+rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void **paddr);
+
/**
* Get the layout of the available physical memory.
*
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,9 @@
#include <errno.h>
#include <sys/ioctl.h>
#include <sys/time.h>
+#include <mntent.h>
+#include <sys/mman.h>
+#include <sys/file.h>
#include <rte_log.h>
#include <rte_memory.h>
@@ -143,6 +146,18 @@ rte_mem_lock_page(const void *virt)
return mlock((void*)aligned, page_size);
}
+int
+rte_memseg_info_get(int index, int *pfd, uint64_t *psize, void **paddr)
+{
+ struct rte_mem_config *mcfg;
+ mcfg = rte_eal_get_configuration()->mem_config;
+
+ *pfd = mcfg->memseg[index].fd;
+ *psize = (uint64_t)mcfg->memseg[index].len;
+ *paddr = (void *)(uint64_t)mcfg->memseg[index].addr;
+ return 0;
+}
+
/*
* Get physical address of any mapped virtual address in the current process.
*/
@@ -1044,6 +1059,42 @@ calc_num_pages_per_socket(uint64_t * memory,
return total_num_pages;
}
+static void *
+rte_eal_shm_create(int *pfd)
+{
+ int ret, fd;
+ char filepath[256];
+ void *vaddr;
+ uint64_t size = internal_config.memory;
+
+ sprintf(filepath, "/%s_cvio", internal_config.hugefile_prefix);
+
+ fd = shm_open(filepath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+ if (fd < 0) {
+ rte_panic("shm_open %s failed: %s\n", filepath, strerror(errno));
+ }
+ ret = flock(fd, LOCK_EX);
+ if (ret < 0) {
+ close(fd);
+ rte_panic("flock %s failed: %s\n", filepath, strerror(errno));
+ }
+
+ ret = ftruncate(fd, size);
+ if (ret < 0) {
+ rte_panic("ftruncate failed: %s\n", strerror(errno));
+ }
+ /* flag: MAP_HUGETLB */
+ vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (vaddr == MAP_FAILED) {
+ rte_panic("mmap failed: %s\n", strerror(errno));
+ }
+ memset(vaddr, 0, size);
+ *pfd = fd;
+
+ return vaddr;
+}
+
+
/*
* Prepare physical memory mapping: fill configuration structure with
* these infos, return 0 on success.
@@ -1072,7 +1123,9 @@ rte_eal_hugepage_init(void)
int new_pages_count[MAX_HUGEPAGE_SIZES];
#endif
+#ifndef RTE_VIRTIO_VDEV
test_proc_pagemap_readable();
+#endif
memset(used_hp, 0, sizeof(used_hp));
@@ -1081,8 +1134,8 @@ rte_eal_hugepage_init(void)
/* hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
- addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+ int fd;
+ addr = rte_eal_shm_create(&fd);
if (addr == MAP_FAILED) {
RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
strerror(errno));
@@ -1093,6 +1146,7 @@ rte_eal_hugepage_init(void)
mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
mcfg->memseg[0].len = internal_config.memory;
mcfg->memseg[0].socket_id = 0;
+ mcfg->memseg[0].fd = fd;
return 0;
}
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -453,13 +453,6 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
rte_errno = EINVAL;
return NULL;
}
-
- /* check that we have both VA and PA */
- if (vaddr != NULL && paddr == NULL) {
- rte_errno = EINVAL;
- return NULL;
- }
-
/* Check that pg_num and pg_shift parameters are valid. */
if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift > MEMPOOL_PG_SHIFT_MAX) {
rte_errno = EINVAL;
@@ -596,8 +589,15 @@ rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
/* mempool elements in a separate chunk of memory. */
} else {
+ /* when VA is specified, PA should be specified? */
+ if (rte_eal_has_hugepages()) {
+ if (paddr == NULL) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+ memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);
+ }
mp->elt_va_start = (uintptr_t)vaddr;
- memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);
}
mp->elt_va_end = mp->elt_va_start;