[v5,03/11] eal: introduce memory management wrappers
Checks
Commit Message
Introduce OS-independent wrappers for memory management operations used
across DPDK and specifically in common code of EAL:
* rte_mem_map()
* rte_mem_unmap()
* rte_get_page_size()
* rte_mem_lock()
Windows uses different APIs for memory mapping and reservation, while
Unices reserve memory by mapping it. Introduce EAL private functions to
support memory reservation in common code:
* eal_mem_reserve()
* eal_mem_free()
* eal_mem_set_dump()
Wrappers follow POSIX semantics limited to DPDK tasks, but their
signatures deliberately differ from POSIX ones to be more safe and
expressive.
Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
---
lib/librte_eal/common/eal_common_fbarray.c | 37 +++--
lib/librte_eal/common/eal_common_memory.c | 60 +++-----
lib/librte_eal/common/eal_private.h | 78 ++++++++++-
lib/librte_eal/freebsd/Makefile | 1 +
lib/librte_eal/include/rte_memory.h | 88 ++++++++++++
lib/librte_eal/linux/Makefile | 1 +
lib/librte_eal/linux/eal_memalloc.c | 5 +-
lib/librte_eal/rte_eal_version.map | 6 +
lib/librte_eal/unix/eal_unix_memory.c | 152 +++++++++++++++++++++
lib/librte_eal/unix/meson.build | 1 +
10 files changed, 365 insertions(+), 64 deletions(-)
create mode 100644 lib/librte_eal/unix/eal_unix_memory.c
Comments
Are wrappers 100% required?
Would it be simpler (and less invasive) to have a windows_compat.h that plugged these holes?
I am not sure on the standard approach here - so I will leave this to others.
Outside of that - do these symbols really require experimental status.
Are they really likely to change?
Ray K
On 25/05/2020 01:37, Dmitry Kozlyuk wrote:
> Introduce OS-independent wrappers for memory management operations used
> across DPDK and specifically in common code of EAL:
>
> * rte_mem_map()
> * rte_mem_unmap()
> * rte_get_page_size()
> * rte_mem_lock()
>
> Windows uses different APIs for memory mapping and reservation, while
> Unices reserve memory by mapping it. Introduce EAL private functions to
> support memory reservation in common code:
>
> * eal_mem_reserve()
> * eal_mem_free()
> * eal_mem_set_dump()
>
> Wrappers follow POSIX semantics limited to DPDK tasks, but their
> signatures deliberately differ from POSIX ones to be more safe and
> expressive.
>
> Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> ---
> lib/librte_eal/common/eal_common_fbarray.c | 37 +++--
> lib/librte_eal/common/eal_common_memory.c | 60 +++-----
> lib/librte_eal/common/eal_private.h | 78 ++++++++++-
> lib/librte_eal/freebsd/Makefile | 1 +
> lib/librte_eal/include/rte_memory.h | 88 ++++++++++++
> lib/librte_eal/linux/Makefile | 1 +
> lib/librte_eal/linux/eal_memalloc.c | 5 +-
> lib/librte_eal/rte_eal_version.map | 6 +
> lib/librte_eal/unix/eal_unix_memory.c | 152 +++++++++++++++++++++
> lib/librte_eal/unix/meson.build | 1 +
> 10 files changed, 365 insertions(+), 64 deletions(-)
> create mode 100644 lib/librte_eal/unix/eal_unix_memory.c
>
> diff --git a/lib/librte_eal/common/eal_common_fbarray.c b/lib/librte_eal/common/eal_common_fbarray.c
> index cfcab63e9..a41e8ce5f 100644
> --- a/lib/librte_eal/common/eal_common_fbarray.c
> +++ b/lib/librte_eal/common/eal_common_fbarray.c
> @@ -5,15 +5,15 @@
> #include <fcntl.h>
> #include <inttypes.h>
> #include <limits.h>
> -#include <sys/mman.h>
> #include <stdint.h>
> #include <errno.h>
> #include <string.h>
> #include <unistd.h>
>
> #include <rte_common.h>
> -#include <rte_log.h>
> #include <rte_errno.h>
> +#include <rte_log.h>
> +#include <rte_memory.h>
> #include <rte_spinlock.h>
> #include <rte_tailq.h>
>
> @@ -90,12 +90,9 @@ resize_and_map(int fd, void *addr, size_t len)
> return -1;
> }
>
> - map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
> - MAP_SHARED | MAP_FIXED, fd, 0);
> + map_addr = rte_mem_map(addr, len, RTE_PROT_READ | RTE_PROT_WRITE,
> + RTE_MAP_SHARED | RTE_MAP_FORCE_ADDRESS, fd, 0);
> if (map_addr != addr) {
> - RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
> - /* pass errno up the chain */
> - rte_errno = errno;
> return -1;
> }
> return 0;
> @@ -733,7 +730,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
> return -1;
> }
>
> - page_sz = sysconf(_SC_PAGESIZE);
> + page_sz = rte_get_page_size();
> if (page_sz == (size_t)-1) {
> free(ma);
> return -1;
> @@ -754,9 +751,11 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
>
> if (internal_config.no_shconf) {
> /* remap virtual area as writable */
> - void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
> - MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
> - if (new_data == MAP_FAILED) {
> + static const int flags = RTE_MAP_FORCE_ADDRESS |
> + RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS;
> + void *new_data = rte_mem_map(data, mmap_len,
> + RTE_PROT_READ | RTE_PROT_WRITE, flags, fd, 0);
> + if (new_data == NULL) {
> RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
> __func__, strerror(errno));
> goto fail;
> @@ -821,7 +820,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
> return 0;
> fail:
> if (data)
> - munmap(data, mmap_len);
> + rte_mem_unmap(data, mmap_len);
> if (fd >= 0)
> close(fd);
> free(ma);
> @@ -859,7 +858,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
> return -1;
> }
>
> - page_sz = sysconf(_SC_PAGESIZE);
> + page_sz = rte_get_page_size();
> if (page_sz == (size_t)-1) {
> free(ma);
> return -1;
> @@ -911,7 +910,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
> return 0;
> fail:
> if (data)
> - munmap(data, mmap_len);
> + rte_mem_unmap(data, mmap_len);
> if (fd >= 0)
> close(fd);
> free(ma);
> @@ -939,8 +938,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
> * really do anything about it, things will blow up either way.
> */
>
> - size_t page_sz = sysconf(_SC_PAGESIZE);
> -
> + size_t page_sz = rte_get_page_size();
> if (page_sz == (size_t)-1)
> return -1;
>
> @@ -959,7 +957,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
> goto out;
> }
>
> - munmap(arr->data, mmap_len);
> + rte_mem_unmap(arr->data, mmap_len);
>
> /* area is unmapped, close fd and remove the tailq entry */
> if (tmp->fd >= 0)
> @@ -994,8 +992,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
> * really do anything about it, things will blow up either way.
> */
>
> - size_t page_sz = sysconf(_SC_PAGESIZE);
> -
> + size_t page_sz = rte_get_page_size();
> if (page_sz == (size_t)-1)
> return -1;
>
> @@ -1044,7 +1041,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
> }
> close(fd);
> }
> - munmap(arr->data, mmap_len);
> + rte_mem_unmap(arr->data, mmap_len);
>
> /* area is unmapped, remove the tailq entry */
> TAILQ_REMOVE(&mem_area_tailq, tmp, next);
> diff --git a/lib/librte_eal/common/eal_common_memory.c b/lib/librte_eal/common/eal_common_memory.c
> index 4c897a13f..c6243aca1 100644
> --- a/lib/librte_eal/common/eal_common_memory.c
> +++ b/lib/librte_eal/common/eal_common_memory.c
> @@ -11,7 +11,6 @@
> #include <string.h>
> #include <unistd.h>
> #include <inttypes.h>
> -#include <sys/mman.h>
> #include <sys/queue.h>
>
> #include <rte_fbarray.h>
> @@ -40,18 +39,10 @@
> static void *next_baseaddr;
> static uint64_t system_page_sz;
>
> -#ifdef RTE_EXEC_ENV_LINUX
> -#define RTE_DONTDUMP MADV_DONTDUMP
> -#elif defined RTE_EXEC_ENV_FREEBSD
> -#define RTE_DONTDUMP MADV_NOCORE
> -#else
> -#error "madvise doesn't support this OS"
> -#endif
> -
> #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
> void *
> eal_get_virtual_area(void *requested_addr, size_t *size,
> - size_t page_sz, int flags, int mmap_flags)
> + size_t page_sz, int flags, int reserve_flags)
> {
> bool addr_is_hint, allow_shrink, unmap, no_align;
> uint64_t map_sz;
> @@ -59,9 +50,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
> uint8_t try = 0;
>
> if (system_page_sz == 0)
> - system_page_sz = sysconf(_SC_PAGESIZE);
> -
> - mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
> + system_page_sz = rte_get_page_size();
>
> RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
>
> @@ -105,24 +94,24 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
> return NULL;
> }
>
> - mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE,
> - mmap_flags, -1, 0);
> - if (mapped_addr == MAP_FAILED && allow_shrink)
> + mapped_addr = eal_mem_reserve(
> + requested_addr, (size_t)map_sz, reserve_flags);
> + if ((mapped_addr == NULL) && allow_shrink)
> *size -= page_sz;
>
> - if (mapped_addr != MAP_FAILED && addr_is_hint &&
> - mapped_addr != requested_addr) {
> + if ((mapped_addr != NULL) && addr_is_hint &&
> + (mapped_addr != requested_addr)) {
> try++;
> next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
> if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
> /* hint was not used. Try with another offset */
> - munmap(mapped_addr, map_sz);
> - mapped_addr = MAP_FAILED;
> + eal_mem_free(mapped_addr, map_sz);
> + mapped_addr = NULL;
> requested_addr = next_baseaddr;
> }
> }
> } while ((allow_shrink || addr_is_hint) &&
> - mapped_addr == MAP_FAILED && *size > 0);
> + (mapped_addr == NULL) && (*size > 0));
>
> /* align resulting address - if map failed, we will ignore the value
> * anyway, so no need to add additional checks.
> @@ -132,20 +121,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
>
> if (*size == 0) {
> RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
> - strerror(errno));
> - rte_errno = errno;
> + strerror(rte_errno));
> return NULL;
> - } else if (mapped_addr == MAP_FAILED) {
> + } else if (mapped_addr == NULL) {
> RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
> - strerror(errno));
> - /* pass errno up the call chain */
> - rte_errno = errno;
> + strerror(rte_errno));
> return NULL;
> } else if (requested_addr != NULL && !addr_is_hint &&
> aligned_addr != requested_addr) {
> RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
> requested_addr, aligned_addr);
> - munmap(mapped_addr, map_sz);
> + eal_mem_free(mapped_addr, map_sz);
> rte_errno = EADDRNOTAVAIL;
> return NULL;
> } else if (requested_addr != NULL && addr_is_hint &&
> @@ -161,7 +147,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
> aligned_addr, *size);
>
> if (unmap) {
> - munmap(mapped_addr, map_sz);
> + eal_mem_free(mapped_addr, map_sz);
> } else if (!no_align) {
> void *map_end, *aligned_end;
> size_t before_len, after_len;
> @@ -179,19 +165,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
> /* unmap space before aligned mmap address */
> before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
> if (before_len > 0)
> - munmap(mapped_addr, before_len);
> + eal_mem_free(mapped_addr, before_len);
>
> /* unmap space after aligned end mmap address */
> after_len = RTE_PTR_DIFF(map_end, aligned_end);
> if (after_len > 0)
> - munmap(aligned_end, after_len);
> + eal_mem_free(aligned_end, after_len);
> }
>
> if (!unmap) {
> /* Exclude these pages from a core dump. */
> - if (madvise(aligned_addr, *size, RTE_DONTDUMP) != 0)
> - RTE_LOG(DEBUG, EAL, "madvise failed: %s\n",
> - strerror(errno));
> + eal_mem_set_dump(aligned_addr, *size, false);
> }
>
> return aligned_addr;
> @@ -547,10 +531,10 @@ rte_eal_memdevice_init(void)
> int
> rte_mem_lock_page(const void *virt)
> {
> - unsigned long virtual = (unsigned long)virt;
> - int page_size = getpagesize();
> - unsigned long aligned = (virtual & ~(page_size - 1));
> - return mlock((void *)aligned, page_size);
> + uintptr_t virtual = (uintptr_t)virt;
> + size_t page_size = rte_get_page_size();
> + uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
> + return rte_mem_lock((void *)aligned, page_size);
> }
>
> int
> diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
> index cef73d6fe..a93850c09 100644
> --- a/lib/librte_eal/common/eal_private.h
> +++ b/lib/librte_eal/common/eal_private.h
> @@ -11,6 +11,7 @@
>
> #include <rte_dev.h>
> #include <rte_lcore.h>
> +#include <rte_memory.h>
>
> /**
> * Structure storing internal configuration (per-lcore)
> @@ -202,6 +203,24 @@ int rte_eal_alarm_init(void);
> */
> int rte_eal_check_module(const char *module_name);
>
> +/**
> + * Memory reservation flags.
> + */
> +enum eal_mem_reserve_flags {
> + /**
> + * Reserve hugepages. May be unsupported by some platforms.
> + */
> + EAL_RESERVE_HUGEPAGES = 1 << 0,
> + /**
> + * Force reserving memory at the requested address.
> + * This can be a destructive action depending on the implementation.
> + *
> + * @see RTE_MAP_FORCE_ADDRESS for description of possible consequences
> + * (although implementations are not required to use it).
> + */
> + EAL_RESERVE_FORCE_ADDRESS = 1 << 1
> +};
> +
> /**
> * Get virtual area of specified size from the OS.
> *
> @@ -215,8 +234,8 @@ int rte_eal_check_module(const char *module_name);
> * Page size on which to align requested virtual area.
> * @param flags
> * EAL_VIRTUAL_AREA_* flags.
> - * @param mmap_flags
> - * Extra flags passed directly to mmap().
> + * @param reserve_flags
> + * Extra flags passed directly to rte_mem_reserve().
> *
> * @return
> * Virtual area address if successful.
> @@ -233,7 +252,7 @@ int rte_eal_check_module(const char *module_name);
> /**< immediately unmap reserved virtual area. */
> void *
> eal_get_virtual_area(void *requested_addr, size_t *size,
> - size_t page_sz, int flags, int mmap_flags);
> + size_t page_sz, int flags, int reserve_flags);
>
> /**
> * Get cpu core_id.
> @@ -467,4 +486,57 @@ eal_file_lock(int fd, enum eal_flock_op op, enum eal_flock_mode mode);
> int
> eal_file_truncate(int fd, ssize_t size);
>
> +/**
> + * Reserve a region of virtual memory.
> + *
> + * Use eal_mem_free() to free reserved memory.
> + *
> + * @param requested_addr
> + * A desired reservation address, which must be page-aligned.
> + * The system might not respect it.
> + * NULL means the address will be chosen by the system.
> + * @param size
> + * Reservation size. Must be a multiple of system page size.
> + * @param flags
> + * Reservation options, a combination of eal_mem_reserve_flags.
> + * @returns
> + * Starting address of the reserved area on success, NULL on failure.
> + * Callers must not access this memory until remapping it.
> + */
> +void *
> +eal_mem_reserve(void *requested_addr, size_t size, int flags);
> +
> +/**
> + * Free memory obtained by eal_mem_reserve() or eal_mem_alloc().
> + *
> + * If *virt* and *size* describe a part of the reserved region,
> + * only this part of the region is freed (accurately up to the system
> + * page size). If *virt* points to allocated memory, *size* must match
> + * the one specified on allocation. The behavior is undefined
> + * if the memory pointed by *virt* is obtained from another source
> + * than listed above.
> + *
> + * @param virt
> + * A virtual address in a region previously reserved.
> + * @param size
> + * Number of bytes to unreserve.
> + */
> +void
> +eal_mem_free(void *virt, size_t size);
> +
> +/**
> + * Configure memory region inclusion into core dumps.
> + *
> + * @param virt
> + * Starting address of the region.
> + * @param size
> + * Size of the region.
> + * @param dump
> + * True to include memory into core dumps, false to exclude.
> + * @return
> + * 0 on success, (-1) on failure and rte_errno is set.
> + */
> +int
> +eal_mem_set_dump(void *virt, size_t size, bool dump);
> +
> #endif /* _EAL_PRIVATE_H_ */
> diff --git a/lib/librte_eal/freebsd/Makefile b/lib/librte_eal/freebsd/Makefile
> index 4654ca2b3..f64a3994c 100644
> --- a/lib/librte_eal/freebsd/Makefile
> +++ b/lib/librte_eal/freebsd/Makefile
> @@ -77,6 +77,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c
>
> # from unix dir
> SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix.c
> +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix_memory.c
>
> # from arch dir
> SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c
> diff --git a/lib/librte_eal/include/rte_memory.h b/lib/librte_eal/include/rte_memory.h
> index 65374d53a..63ff0773d 100644
> --- a/lib/librte_eal/include/rte_memory.h
> +++ b/lib/librte_eal/include/rte_memory.h
> @@ -82,6 +82,94 @@ struct rte_memseg_list {
> struct rte_fbarray memseg_arr;
> };
>
> +/**
> + * Memory protection flags.
> + */
> +enum rte_mem_prot {
> + RTE_PROT_READ = 1 << 0, /**< Read access. */
> + RTE_PROT_WRITE = 1 << 1, /**< Write access. */
> + RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
> +};
> +
> +/**
> + * Additional flags for memory mapping.
> + */
> +enum rte_map_flags {
> + /** Changes to the mapped memory are visible to other processes. */
> + RTE_MAP_SHARED = 1 << 0,
> + /** Mapping is not backed by a regular file. */
> + RTE_MAP_ANONYMOUS = 1 << 1,
> + /** Copy-on-write mapping, changes are invisible to other processes. */
> + RTE_MAP_PRIVATE = 1 << 2,
> + /**
> + * Force mapping to the requested address. This flag should be used
> + * with caution, because to fulfill the request implementation
> + * may remove all other mappings in the requested region. However,
> + * it is not required to do so, thus mapping with this flag may fail.
> + */
> + RTE_MAP_FORCE_ADDRESS = 1 << 3
> +};
> +
> +/**
> + * Map a portion of an opened file or the page file into memory.
> + *
> + * This function is similar to POSIX mmap(3) with common MAP_ANONYMOUS
> + * extension, except for the return value.
> + *
> + * @param requested_addr
> + * Desired virtual address for mapping. Can be NULL to let OS choose.
> + * @param size
> + * Size of the mapping in bytes.
> + * @param prot
> + * Protection flags, a combination of rte_mem_prot values.
> + * @param flags
> + * Additional mapping flags, a combination of rte_map_flags.
> + * @param fd
> + * Mapped file descriptor. Can be negative for anonymous mapping.
> + * @param offset
> + * Offset of the mapped region in fd. Must be 0 for anonymous mappings.
> + * @return
> + * Mapped address or NULL on failure and rte_errno is set to OS error.
> + */
> +__rte_experimental
> +void *
> +rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
> + int fd, size_t offset);
> +
> +/**
> + * OS-independent implementation of POSIX munmap(3).
> + */
> +__rte_experimental
> +int
> +rte_mem_unmap(void *virt, size_t size);
> +
> +/**
> + * Get system page size. This function never fails.
> + *
> + * @return
> + * Page size in bytes.
> + */
> +__rte_experimental
> +size_t
> +rte_get_page_size(void);
> +
> +/**
> + * Lock in physical memory all pages crossed by the address region.
> + *
> + * @param virt
> + * Base virtual address of the region.
> + * @param size
> + * Size of the region.
> + * @return
> + * 0 on success, negative on error.
> + *
> + * @see rte_get_page_size() to retrieve the page size.
> + * @see rte_mem_lock_page() to lock an entire single page.
> + */
> +__rte_experimental
> +int
> +rte_mem_lock(const void *virt, size_t size);
> +
> /**
> * Lock page in physical memory and prevent from swapping.
> *
> diff --git a/lib/librte_eal/linux/Makefile b/lib/librte_eal/linux/Makefile
> index 4f39d462c..d314648cb 100644
> --- a/lib/librte_eal/linux/Makefile
> +++ b/lib/librte_eal/linux/Makefile
> @@ -84,6 +84,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c
>
> # from unix dir
> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix.c
> +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix_memory.c
>
> # from arch dir
> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c
> diff --git a/lib/librte_eal/linux/eal_memalloc.c b/lib/librte_eal/linux/eal_memalloc.c
> index 2c717f8bd..bf29b83c6 100644
> --- a/lib/librte_eal/linux/eal_memalloc.c
> +++ b/lib/librte_eal/linux/eal_memalloc.c
> @@ -630,7 +630,7 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
> mapped:
> munmap(addr, alloc_sz);
> unmapped:
> - flags = MAP_FIXED;
> + flags = EAL_RESERVE_FORCE_ADDRESS;
> new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
> if (new_addr != addr) {
> if (new_addr != NULL)
> @@ -687,8 +687,7 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
> return -1;
> }
>
> - if (madvise(ms->addr, ms->len, MADV_DONTDUMP) != 0)
> - RTE_LOG(DEBUG, EAL, "madvise failed: %s\n", strerror(errno));
> + eal_mem_set_dump(ms->addr, ms->len, false);
>
> exit_early = false;
>
> diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
> index d8038749a..dff51b13d 100644
> --- a/lib/librte_eal/rte_eal_version.map
> +++ b/lib/librte_eal/rte_eal_version.map
> @@ -386,4 +386,10 @@ EXPERIMENTAL {
> rte_trace_point_lookup;
> rte_trace_regexp;
> rte_trace_save;
> +
> + # added in 20.08
> + rte_get_page_size;
> + rte_mem_lock;
> + rte_mem_map;
> + rte_mem_unmap;
> };
> diff --git a/lib/librte_eal/unix/eal_unix_memory.c b/lib/librte_eal/unix/eal_unix_memory.c
> new file mode 100644
> index 000000000..658595b6e
> --- /dev/null
> +++ b/lib/librte_eal/unix/eal_unix_memory.c
> @@ -0,0 +1,152 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Dmitry Kozlyuk
> + */
> +
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <unistd.h>
> +
> +#include <rte_errno.h>
> +#include <rte_log.h>
> +#include <rte_memory.h>
> +
> +#include "eal_private.h"
> +
> +#ifdef RTE_EXEC_ENV_LINUX
> +#define EAL_DONTDUMP MADV_DONTDUMP
> +#define EAL_DODUMP MADV_DODUMP
> +#elif defined RTE_EXEC_ENV_FREEBSD
> +#define EAL_DONTDUMP MADV_NOCORE
> +#define EAL_DODUMP MADV_CORE
> +#else
> +#error "madvise doesn't support this OS"
> +#endif
> +
> +static void *
> +mem_map(void *requested_addr, size_t size, int prot, int flags,
> + int fd, size_t offset)
> +{
> + void *virt = mmap(requested_addr, size, prot, flags, fd, offset);
> + if (virt == MAP_FAILED) {
> + RTE_LOG(DEBUG, EAL,
> + "Cannot mmap(%p, 0x%zx, 0x%x, 0x%x, %d, 0x%zx): %s\n",
> + requested_addr, size, prot, flags, fd, offset,
> + strerror(errno));
> + rte_errno = errno;
> + return NULL;
> + }
> + return virt;
> +}
> +
> +static int
> +mem_unmap(void *virt, size_t size)
> +{
> + int ret = munmap(virt, size);
> + if (ret < 0) {
> + RTE_LOG(DEBUG, EAL, "Cannot munmap(%p, 0x%zx): %s\n",
> + virt, size, strerror(errno));
> + rte_errno = errno;
> + }
> + return ret;
> +}
> +
> +void *
> +eal_mem_reserve(void *requested_addr, size_t size, int flags)
> +{
> + int sys_flags = MAP_PRIVATE | MAP_ANONYMOUS;
> +
> + if (flags & EAL_RESERVE_HUGEPAGES) {
> +#ifdef MAP_HUGETLB
> + sys_flags |= MAP_HUGETLB;
> +#else
> + rte_errno = ENOTSUP;
> + return NULL;
> +#endif
> + }
> +
> + if (flags & EAL_RESERVE_FORCE_ADDRESS)
> + sys_flags |= MAP_FIXED;
> +
> + return mem_map(requested_addr, size, PROT_NONE, sys_flags, -1, 0);
> +}
> +
> +void
> +eal_mem_free(void *virt, size_t size)
> +{
> + mem_unmap(virt, size);
> +}
> +
> +int
> +eal_mem_set_dump(void *virt, size_t size, bool dump)
> +{
> + int flags = dump ? EAL_DODUMP : EAL_DONTDUMP;
> + int ret = madvise(virt, size, flags);
> + if (ret) {
> + RTE_LOG(DEBUG, EAL, "madvise(%p, %#zx, %d) failed: %s\n",
> + virt, size, flags, strerror(rte_errno));
> + rte_errno = errno;
> + }
> + return ret;
> +}
> +
> +static int
> +mem_rte_to_sys_prot(int prot)
> +{
> + int sys_prot = PROT_NONE;
> +
> + if (prot & RTE_PROT_READ)
> + sys_prot |= PROT_READ;
> + if (prot & RTE_PROT_WRITE)
> + sys_prot |= PROT_WRITE;
> + if (prot & RTE_PROT_EXECUTE)
> + sys_prot |= PROT_EXEC;
> +
> + return sys_prot;
> +}
> +
> +void *
> +rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
> + int fd, size_t offset)
> +{
> + int sys_flags = 0;
> + int sys_prot;
> +
> + sys_prot = mem_rte_to_sys_prot(prot);
> +
> + if (flags & RTE_MAP_SHARED)
> + sys_flags |= MAP_SHARED;
> + if (flags & RTE_MAP_ANONYMOUS)
> + sys_flags |= MAP_ANONYMOUS;
> + if (flags & RTE_MAP_PRIVATE)
> + sys_flags |= MAP_PRIVATE;
> + if (flags & RTE_MAP_FORCE_ADDRESS)
> + sys_flags |= MAP_FIXED;
> +
> + return mem_map(requested_addr, size, sys_prot, sys_flags, fd, offset);
> +}
> +
> +int
> +rte_mem_unmap(void *virt, size_t size)
> +{
> + return mem_unmap(virt, size);
> +}
> +
> +size_t
> +rte_get_page_size(void)
> +{
> + static size_t page_size;
> +
> + if (!page_size)
> + page_size = sysconf(_SC_PAGESIZE);
> +
> + return page_size;
> +}
> +
> +int
> +rte_mem_lock(const void *virt, size_t size)
> +{
> + int ret = mlock(virt, size);
> + if (ret)
> + rte_errno = errno;
> + return ret;
> +}
> diff --git a/lib/librte_eal/unix/meson.build b/lib/librte_eal/unix/meson.build
> index cfa1b4ef9..5734f26ad 100644
> --- a/lib/librte_eal/unix/meson.build
> +++ b/lib/librte_eal/unix/meson.build
> @@ -3,4 +3,5 @@
>
> sources += files(
> 'eal_unix.c',
> + 'eal_unix_memory.c',
> )
>
Answers: below is a summary of the discussion with Thomas, Ranjit, Tal, et al.
On Wed, 27 May 2020 07:33:32 +0100 Ray Kinsella <mdr@ashroe.eu> wrote:
> Are wrappers 100% required?
> Would it be simpler (and less invasive) to have a windows_compat.h that plugged these holes?
> I am not sure on the standard approach here - so I will leave this to others.
With wrappers, we control API and semantics, which is limited compared to the
underlying syscalls. It is also cleaner not to export non-RTE symbols from
DPDK libraries. Regarding invasiveness: the change required is small, and it
factors out some common error logging in the process.
> Outside of that - do these symbols really require experimental status.
> Are they really likely to change?
Indeed, the wrappers should be internal, not experimental. Will fix in v6.
On 25-May-20 1:37 AM, Dmitry Kozlyuk wrote:
> Introduce OS-independent wrappers for memory management operations used
> across DPDK and specifically in common code of EAL:
>
> * rte_mem_map()
> * rte_mem_unmap()
> * rte_get_page_size()
> * rte_mem_lock()
>
> Windows uses different APIs for memory mapping and reservation, while
> Unices reserve memory by mapping it. Introduce EAL private functions to
> support memory reservation in common code:
>
> * eal_mem_reserve()
> * eal_mem_free()
> * eal_mem_set_dump()
>
> Wrappers follow POSIX semantics limited to DPDK tasks, but their
> signatures deliberately differ from POSIX ones to be more safe and
> expressive.
>
> Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> ---
<snip>
> - page_sz = sysconf(_SC_PAGESIZE);
> + page_sz = rte_get_page_size();
> if (page_sz == (size_t)-1) {
> free(ma);
> return -1;
> @@ -754,9 +751,11 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
>
> if (internal_config.no_shconf) {
> /* remap virtual area as writable */
> - void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
> - MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
> - if (new_data == MAP_FAILED) {
> + static const int flags = RTE_MAP_FORCE_ADDRESS |
> + RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS;
> + void *new_data = rte_mem_map(data, mmap_len,
> + RTE_PROT_READ | RTE_PROT_WRITE, flags, fd, 0);
> + if (new_data == NULL) {
> RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
> __func__, strerror(errno));
I believe this should be rte_strerror(rte_errno) instead of strerror(errno).
> goto fail;
> @@ -821,7 +820,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
> return 0;
> fail:
> if (data)
> - munmap(data, mmap_len);
> + rte_mem_unmap(data, mmap_len);
> if (fd >= 0)
> close(fd);
> free(ma);
> @@ -859,7 +858,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
> return -1;
<snip>
>
> +/**
> + * Memory protection flags.
> + */
> +enum rte_mem_prot {
> + RTE_PROT_READ = 1 << 0, /**< Read access. */
> + RTE_PROT_WRITE = 1 << 1, /**< Write access. */
> + RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
> +};
> +
> +/**
> + * Additional flags for memory mapping.
> + */
> +enum rte_map_flags {
> + /** Changes to the mapped memory are visible to other processes. */
> + RTE_MAP_SHARED = 1 << 0,
> + /** Mapping is not backed by a regular file. */
> + RTE_MAP_ANONYMOUS = 1 << 1,
> + /** Copy-on-write mapping, changes are invisible to other processes. */
> + RTE_MAP_PRIVATE = 1 << 2,
> + /**
> + * Force mapping to the requested address. This flag should be used
> + * with caution, because to fulfill the request implementation
> + * may remove all other mappings in the requested region. However,
> + * it is not required to do so, thus mapping with this flag may fail.
> + */
> + RTE_MAP_FORCE_ADDRESS = 1 << 3
> +};
I have no strong opinion on this, but it feels like the fact that these
are enums is a relic from the times where you used enum everywhere :) i
have a feeling that DPDK codebase prefers #define's for this usage,
while what you have here is more of a C++ thing.
On 25-May-20 1:37 AM, Dmitry Kozlyuk wrote:
> Introduce OS-independent wrappers for memory management operations used
> across DPDK and specifically in common code of EAL:
>
> * rte_mem_map()
> * rte_mem_unmap()
> * rte_get_page_size()
> * rte_mem_lock()
>
> Windows uses different APIs for memory mapping and reservation, while
> Unices reserve memory by mapping it. Introduce EAL private functions to
> support memory reservation in common code:
>
> * eal_mem_reserve()
> * eal_mem_free()
> * eal_mem_set_dump()
>
> Wrappers follow POSIX semantics limited to DPDK tasks, but their
> signatures deliberately differ from POSIX ones to be more safe and
> expressive.
>
> Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
> ---
<snip>
> + } else if (mapped_addr == NULL) {
> RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
> - strerror(errno));
> - /* pass errno up the call chain */
> - rte_errno = errno;
> + strerror(rte_errno));
Also, please check that you're using rte_strerror with rte_errno :)
28/05/2020 13:26, Burakov, Anatoly:
> On 25-May-20 1:37 AM, Dmitry Kozlyuk wrote:
> > +/**
> > + * Memory protection flags.
> > + */
> > +enum rte_mem_prot {
> > + RTE_PROT_READ = 1 << 0, /**< Read access. */
> > + RTE_PROT_WRITE = 1 << 1, /**< Write access. */
> > + RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
> > +};
> > +
> > +/**
> > + * Additional flags for memory mapping.
> > + */
> > +enum rte_map_flags {
> > + /** Changes to the mapped memory are visible to other processes. */
> > + RTE_MAP_SHARED = 1 << 0,
> > + /** Mapping is not backed by a regular file. */
> > + RTE_MAP_ANONYMOUS = 1 << 1,
> > + /** Copy-on-write mapping, changes are invisible to other processes. */
> > + RTE_MAP_PRIVATE = 1 << 2,
> > + /**
> > + * Force mapping to the requested address. This flag should be used
> > + * with caution, because to fulfill the request implementation
> > + * may remove all other mappings in the requested region. However,
> > + * it is not required to do so, thus mapping with this flag may fail.
> > + */
> > + RTE_MAP_FORCE_ADDRESS = 1 << 3
> > +};
>
> I have no strong opinion on this, but it feels like the fact that these
> are enums is a relic from the times where you used enum everywhere :) i
> have a feeling that DPDK codebase prefers #define's for this usage,
> while what you have here is more of a C++ thing.
The benefit of using an enum is to explicitly name the type
of the variables, serving documentation purpose.
+1 for the enums
@@ -5,15 +5,15 @@
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
-#include <sys/mman.h>
#include <stdint.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <rte_common.h>
-#include <rte_log.h>
#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
#include <rte_spinlock.h>
#include <rte_tailq.h>
@@ -90,12 +90,9 @@ resize_and_map(int fd, void *addr, size_t len)
return -1;
}
- map_addr = mmap(addr, len, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_FIXED, fd, 0);
+ map_addr = rte_mem_map(addr, len, RTE_PROT_READ | RTE_PROT_WRITE,
+ RTE_MAP_SHARED | RTE_MAP_FORCE_ADDRESS, fd, 0);
if (map_addr != addr) {
- RTE_LOG(ERR, EAL, "mmap() failed: %s\n", strerror(errno));
- /* pass errno up the chain */
- rte_errno = errno;
return -1;
}
return 0;
@@ -733,7 +730,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
return -1;
}
- page_sz = sysconf(_SC_PAGESIZE);
+ page_sz = rte_get_page_size();
if (page_sz == (size_t)-1) {
free(ma);
return -1;
@@ -754,9 +751,11 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
if (internal_config.no_shconf) {
/* remap virtual area as writable */
- void *new_data = mmap(data, mmap_len, PROT_READ | PROT_WRITE,
- MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, fd, 0);
- if (new_data == MAP_FAILED) {
+ static const int flags = RTE_MAP_FORCE_ADDRESS |
+ RTE_MAP_PRIVATE | RTE_MAP_ANONYMOUS;
+ void *new_data = rte_mem_map(data, mmap_len,
+ RTE_PROT_READ | RTE_PROT_WRITE, flags, fd, 0);
+ if (new_data == NULL) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't remap anonymous memory: %s\n",
__func__, strerror(errno));
goto fail;
@@ -821,7 +820,7 @@ rte_fbarray_init(struct rte_fbarray *arr, const char *name, unsigned int len,
return 0;
fail:
if (data)
- munmap(data, mmap_len);
+ rte_mem_unmap(data, mmap_len);
if (fd >= 0)
close(fd);
free(ma);
@@ -859,7 +858,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
return -1;
}
- page_sz = sysconf(_SC_PAGESIZE);
+ page_sz = rte_get_page_size();
if (page_sz == (size_t)-1) {
free(ma);
return -1;
@@ -911,7 +910,7 @@ rte_fbarray_attach(struct rte_fbarray *arr)
return 0;
fail:
if (data)
- munmap(data, mmap_len);
+ rte_mem_unmap(data, mmap_len);
if (fd >= 0)
close(fd);
free(ma);
@@ -939,8 +938,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
* really do anything about it, things will blow up either way.
*/
- size_t page_sz = sysconf(_SC_PAGESIZE);
-
+ size_t page_sz = rte_get_page_size();
if (page_sz == (size_t)-1)
return -1;
@@ -959,7 +957,7 @@ rte_fbarray_detach(struct rte_fbarray *arr)
goto out;
}
- munmap(arr->data, mmap_len);
+ rte_mem_unmap(arr->data, mmap_len);
/* area is unmapped, close fd and remove the tailq entry */
if (tmp->fd >= 0)
@@ -994,8 +992,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
* really do anything about it, things will blow up either way.
*/
- size_t page_sz = sysconf(_SC_PAGESIZE);
-
+ size_t page_sz = rte_get_page_size();
if (page_sz == (size_t)-1)
return -1;
@@ -1044,7 +1041,7 @@ rte_fbarray_destroy(struct rte_fbarray *arr)
}
close(fd);
}
- munmap(arr->data, mmap_len);
+ rte_mem_unmap(arr->data, mmap_len);
/* area is unmapped, remove the tailq entry */
TAILQ_REMOVE(&mem_area_tailq, tmp, next);
@@ -11,7 +11,6 @@
#include <string.h>
#include <unistd.h>
#include <inttypes.h>
-#include <sys/mman.h>
#include <sys/queue.h>
#include <rte_fbarray.h>
@@ -40,18 +39,10 @@
static void *next_baseaddr;
static uint64_t system_page_sz;
-#ifdef RTE_EXEC_ENV_LINUX
-#define RTE_DONTDUMP MADV_DONTDUMP
-#elif defined RTE_EXEC_ENV_FREEBSD
-#define RTE_DONTDUMP MADV_NOCORE
-#else
-#error "madvise doesn't support this OS"
-#endif
-
#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
- size_t page_sz, int flags, int mmap_flags)
+ size_t page_sz, int flags, int reserve_flags)
{
bool addr_is_hint, allow_shrink, unmap, no_align;
uint64_t map_sz;
@@ -59,9 +50,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
uint8_t try = 0;
if (system_page_sz == 0)
- system_page_sz = sysconf(_SC_PAGESIZE);
-
- mmap_flags |= MAP_PRIVATE | MAP_ANONYMOUS;
+ system_page_sz = rte_get_page_size();
RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
@@ -105,24 +94,24 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
return NULL;
}
- mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE,
- mmap_flags, -1, 0);
- if (mapped_addr == MAP_FAILED && allow_shrink)
+ mapped_addr = eal_mem_reserve(
+ requested_addr, (size_t)map_sz, reserve_flags);
+ if ((mapped_addr == NULL) && allow_shrink)
*size -= page_sz;
- if (mapped_addr != MAP_FAILED && addr_is_hint &&
- mapped_addr != requested_addr) {
+ if ((mapped_addr != NULL) && addr_is_hint &&
+ (mapped_addr != requested_addr)) {
try++;
next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
/* hint was not used. Try with another offset */
- munmap(mapped_addr, map_sz);
- mapped_addr = MAP_FAILED;
+ eal_mem_free(mapped_addr, map_sz);
+ mapped_addr = NULL;
requested_addr = next_baseaddr;
}
}
} while ((allow_shrink || addr_is_hint) &&
- mapped_addr == MAP_FAILED && *size > 0);
+ (mapped_addr == NULL) && (*size > 0));
/* align resulting address - if map failed, we will ignore the value
* anyway, so no need to add additional checks.
@@ -132,20 +121,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
if (*size == 0) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
- strerror(errno));
- rte_errno = errno;
+ strerror(rte_errno));
return NULL;
- } else if (mapped_addr == MAP_FAILED) {
+ } else if (mapped_addr == NULL) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
- strerror(errno));
- /* pass errno up the call chain */
- rte_errno = errno;
+ strerror(rte_errno));
return NULL;
} else if (requested_addr != NULL && !addr_is_hint &&
aligned_addr != requested_addr) {
RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
requested_addr, aligned_addr);
- munmap(mapped_addr, map_sz);
+ eal_mem_free(mapped_addr, map_sz);
rte_errno = EADDRNOTAVAIL;
return NULL;
} else if (requested_addr != NULL && addr_is_hint &&
@@ -161,7 +147,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
aligned_addr, *size);
if (unmap) {
- munmap(mapped_addr, map_sz);
+ eal_mem_free(mapped_addr, map_sz);
} else if (!no_align) {
void *map_end, *aligned_end;
size_t before_len, after_len;
@@ -179,19 +165,17 @@ eal_get_virtual_area(void *requested_addr, size_t *size,
/* unmap space before aligned mmap address */
before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
if (before_len > 0)
- munmap(mapped_addr, before_len);
+ eal_mem_free(mapped_addr, before_len);
/* unmap space after aligned end mmap address */
after_len = RTE_PTR_DIFF(map_end, aligned_end);
if (after_len > 0)
- munmap(aligned_end, after_len);
+ eal_mem_free(aligned_end, after_len);
}
if (!unmap) {
/* Exclude these pages from a core dump. */
- if (madvise(aligned_addr, *size, RTE_DONTDUMP) != 0)
- RTE_LOG(DEBUG, EAL, "madvise failed: %s\n",
- strerror(errno));
+ eal_mem_set_dump(aligned_addr, *size, false);
}
return aligned_addr;
@@ -547,10 +531,10 @@ rte_eal_memdevice_init(void)
int
rte_mem_lock_page(const void *virt)
{
- unsigned long virtual = (unsigned long)virt;
- int page_size = getpagesize();
- unsigned long aligned = (virtual & ~(page_size - 1));
- return mlock((void *)aligned, page_size);
+ uintptr_t virtual = (uintptr_t)virt;
+ size_t page_size = rte_get_page_size();
+ uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
+ return rte_mem_lock((void *)aligned, page_size);
}
int
@@ -11,6 +11,7 @@
#include <rte_dev.h>
#include <rte_lcore.h>
+#include <rte_memory.h>
/**
* Structure storing internal configuration (per-lcore)
@@ -202,6 +203,24 @@ int rte_eal_alarm_init(void);
*/
int rte_eal_check_module(const char *module_name);
+/**
+ * Memory reservation flags.
+ */
+enum eal_mem_reserve_flags {
+ /**
+ * Reserve hugepages. May be unsupported by some platforms.
+ */
+ EAL_RESERVE_HUGEPAGES = 1 << 0,
+ /**
+ * Force reserving memory at the requested address.
+ * This can be a destructive action depending on the implementation.
+ *
+ * @see RTE_MAP_FORCE_ADDRESS for description of possible consequences
+ * (although implementations are not required to use it).
+ */
+ EAL_RESERVE_FORCE_ADDRESS = 1 << 1
+};
+
/**
* Get virtual area of specified size from the OS.
*
@@ -215,8 +234,8 @@ int rte_eal_check_module(const char *module_name);
* Page size on which to align requested virtual area.
* @param flags
* EAL_VIRTUAL_AREA_* flags.
- * @param mmap_flags
- * Extra flags passed directly to mmap().
+ * @param reserve_flags
+ * Extra flags passed directly to rte_mem_reserve().
*
* @return
* Virtual area address if successful.
@@ -233,7 +252,7 @@ int rte_eal_check_module(const char *module_name);
/**< immediately unmap reserved virtual area. */
void *
eal_get_virtual_area(void *requested_addr, size_t *size,
- size_t page_sz, int flags, int mmap_flags);
+ size_t page_sz, int flags, int reserve_flags);
/**
* Get cpu core_id.
@@ -467,4 +486,57 @@ eal_file_lock(int fd, enum eal_flock_op op, enum eal_flock_mode mode);
int
eal_file_truncate(int fd, ssize_t size);
+/**
+ * Reserve a region of virtual memory.
+ *
+ * Use eal_mem_free() to free reserved memory.
+ *
+ * @param requested_addr
+ * A desired reservation address, which must be page-aligned.
+ * The system might not respect it.
+ * NULL means the address will be chosen by the system.
+ * @param size
+ * Reservation size. Must be a multiple of system page size.
+ * @param flags
+ * Reservation options, a combination of eal_mem_reserve_flags.
+ * @returns
+ * Starting address of the reserved area on success, NULL on failure.
+ * Callers must not access this memory until remapping it.
+ */
+void *
+eal_mem_reserve(void *requested_addr, size_t size, int flags);
+
+/**
+ * Free memory obtained by eal_mem_reserve() or eal_mem_alloc().
+ *
+ * If *virt* and *size* describe a part of the reserved region,
+ * only this part of the region is freed (accurately up to the system
+ * page size). If *virt* points to allocated memory, *size* must match
+ * the one specified on allocation. The behavior is undefined
+ * if the memory pointed by *virt* is obtained from another source
+ * than listed above.
+ *
+ * @param virt
+ * A virtual address in a region previously reserved.
+ * @param size
+ * Number of bytes to unreserve.
+ */
+void
+eal_mem_free(void *virt, size_t size);
+
+/**
+ * Configure memory region inclusion into core dumps.
+ *
+ * @param virt
+ * Starting address of the region.
+ * @param size
+ * Size of the region.
+ * @param dump
+ * True to include memory into core dumps, false to exclude.
+ * @return
+ * 0 on success, (-1) on failure and rte_errno is set.
+ */
+int
+eal_mem_set_dump(void *virt, size_t size, bool dump);
+
#endif /* _EAL_PRIVATE_H_ */
@@ -77,6 +77,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c
# from unix dir
SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_unix_memory.c
# from arch dir
SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c
@@ -82,6 +82,94 @@ struct rte_memseg_list {
struct rte_fbarray memseg_arr;
};
+/**
+ * Memory protection flags.
+ */
+enum rte_mem_prot {
+ RTE_PROT_READ = 1 << 0, /**< Read access. */
+ RTE_PROT_WRITE = 1 << 1, /**< Write access. */
+ RTE_PROT_EXECUTE = 1 << 2 /**< Code execution. */
+};
+
+/**
+ * Additional flags for memory mapping.
+ */
+enum rte_map_flags {
+ /** Changes to the mapped memory are visible to other processes. */
+ RTE_MAP_SHARED = 1 << 0,
+ /** Mapping is not backed by a regular file. */
+ RTE_MAP_ANONYMOUS = 1 << 1,
+ /** Copy-on-write mapping, changes are invisible to other processes. */
+ RTE_MAP_PRIVATE = 1 << 2,
+ /**
+ * Force mapping to the requested address. This flag should be used
+ * with caution, because to fulfill the request implementation
+ * may remove all other mappings in the requested region. However,
+ * it is not required to do so, thus mapping with this flag may fail.
+ */
+ RTE_MAP_FORCE_ADDRESS = 1 << 3
+};
+
+/**
+ * Map a portion of an opened file or the page file into memory.
+ *
+ * This function is similar to POSIX mmap(3) with common MAP_ANONYMOUS
+ * extension, except for the return value.
+ *
+ * @param requested_addr
+ * Desired virtual address for mapping. Can be NULL to let OS choose.
+ * @param size
+ * Size of the mapping in bytes.
+ * @param prot
+ * Protection flags, a combination of rte_mem_prot values.
+ * @param flags
+ * Additional mapping flags, a combination of rte_map_flags.
+ * @param fd
+ * Mapped file descriptor. Can be negative for anonymous mapping.
+ * @param offset
+ * Offset of the mapped region in fd. Must be 0 for anonymous mappings.
+ * @return
+ * Mapped address or NULL on failure and rte_errno is set to OS error.
+ */
+__rte_experimental
+void *
+rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
+ int fd, size_t offset);
+
+/**
+ * OS-independent implementation of POSIX munmap(3).
+ */
+__rte_experimental
+int
+rte_mem_unmap(void *virt, size_t size);
+
+/**
+ * Get system page size. This function never fails.
+ *
+ * @return
+ * Page size in bytes.
+ */
+__rte_experimental
+size_t
+rte_get_page_size(void);
+
+/**
+ * Lock in physical memory all pages crossed by the address region.
+ *
+ * @param virt
+ * Base virtual address of the region.
+ * @param size
+ * Size of the region.
+ * @return
+ * 0 on success, negative on error.
+ *
+ * @see rte_get_page_size() to retrieve the page size.
+ * @see rte_mem_lock_page() to lock an entire single page.
+ */
+__rte_experimental
+int
+rte_mem_lock(const void *virt, size_t size);
+
/**
* Lock page in physical memory and prevent from swapping.
*
@@ -84,6 +84,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c
# from unix dir
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_unix_memory.c
# from arch dir
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c
@@ -630,7 +630,7 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
mapped:
munmap(addr, alloc_sz);
unmapped:
- flags = MAP_FIXED;
+ flags = EAL_RESERVE_FORCE_ADDRESS;
new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
if (new_addr != addr) {
if (new_addr != NULL)
@@ -687,8 +687,7 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
return -1;
}
- if (madvise(ms->addr, ms->len, MADV_DONTDUMP) != 0)
- RTE_LOG(DEBUG, EAL, "madvise failed: %s\n", strerror(errno));
+ eal_mem_set_dump(ms->addr, ms->len, false);
exit_early = false;
@@ -386,4 +386,10 @@ EXPERIMENTAL {
rte_trace_point_lookup;
rte_trace_regexp;
rte_trace_save;
+
+ # added in 20.08
+ rte_get_page_size;
+ rte_mem_lock;
+ rte_mem_map;
+ rte_mem_unmap;
};
new file mode 100644
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Dmitry Kozlyuk
+ */
+
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+
+#include "eal_private.h"
+
+#ifdef RTE_EXEC_ENV_LINUX
+#define EAL_DONTDUMP MADV_DONTDUMP
+#define EAL_DODUMP MADV_DODUMP
+#elif defined RTE_EXEC_ENV_FREEBSD
+#define EAL_DONTDUMP MADV_NOCORE
+#define EAL_DODUMP MADV_CORE
+#else
+#error "madvise doesn't support this OS"
+#endif
+
+/*
+ * Wrap mmap() for DPDK semantics: on failure, log at DEBUG level,
+ * store the OS error in rte_errno, and return NULL instead of
+ * MAP_FAILED, matching the rte_mem_map() contract.
+ *
+ * NOTE(review): offset is size_t but mmap() takes off_t; values above
+ * the signed range would wrap on 32-bit builds — confirm callers only
+ * pass in-range offsets.
+ */
+static void *
+mem_map(void *requested_addr, size_t size, int prot, int flags,
+ int fd, size_t offset)
+{
+ void *virt = mmap(requested_addr, size, prot, flags, fd, offset);
+ if (virt == MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL,
+ "Cannot mmap(%p, 0x%zx, 0x%x, 0x%x, %d, 0x%zx): %s\n",
+ requested_addr, size, prot, flags, fd, offset,
+ strerror(errno));
+ rte_errno = errno;
+ return NULL;
+ }
+ return virt;
+}
+
+/*
+ * Wrap munmap(): on failure, log at DEBUG level and mirror the OS
+ * error into rte_errno. The munmap() return value (0 or -1) is
+ * passed through unchanged.
+ */
+static int
+mem_unmap(void *virt, size_t size)
+{
+ int ret = munmap(virt, size);
+ if (ret < 0) {
+ RTE_LOG(DEBUG, EAL, "Cannot munmap(%p, 0x%zx): %s\n",
+ virt, size, strerror(errno));
+ rte_errno = errno;
+ }
+ return ret;
+}
+
+void *
+eal_mem_reserve(void *requested_addr, size_t size, int flags)
+{
+ /* Unices reserve memory by mapping it inaccessible (PROT_NONE). */
+ int sys_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+ if (flags & EAL_RESERVE_HUGEPAGES) {
+#ifdef MAP_HUGETLB
+ sys_flags |= MAP_HUGETLB;
+#else
+ /* No hugepage mmap flag on this OS (e.g. FreeBSD). */
+ rte_errno = ENOTSUP;
+ return NULL;
+#endif
+ }
+
+ if (flags & EAL_RESERVE_FORCE_ADDRESS)
+ sys_flags |= MAP_FIXED; /* may replace existing mappings */
+
+ return mem_map(requested_addr, size, PROT_NONE, sys_flags, -1, 0);
+}
+
+/*
+ * Free memory reserved by eal_mem_reserve(). Failures are only
+ * logged (and rte_errno set) inside mem_unmap(); no status returned.
+ */
+void
+eal_mem_free(void *virt, size_t size)
+{
+ mem_unmap(virt, size);
+}
+
+/*
+ * Include (dump=true) or exclude (dump=false) a memory region from
+ * core dumps via madvise(). Returns 0 on success, -1 on failure with
+ * rte_errno set to the OS error.
+ */
+int
+eal_mem_set_dump(void *virt, size_t size, bool dump)
+{
+	int flags = dump ? EAL_DODUMP : EAL_DONTDUMP;
+	int ret = madvise(virt, size, flags);
+	if (ret) {
+		/* Save errno before logging: the previous code printed
+		 * strerror(rte_errno), i.e. a stale error value, because
+		 * rte_errno was assigned only after the log call.
+		 */
+		rte_errno = errno;
+		RTE_LOG(DEBUG, EAL, "madvise(%p, %#zx, %d) failed: %s\n",
+			virt, size, flags, strerror(rte_errno));
+	}
+	return ret;
+}
+
+/* Translate RTE_PROT_* flag bits into the equivalent POSIX PROT_* bits. */
+static int
+mem_rte_to_sys_prot(int prot)
+{
+	int sys_prot = PROT_NONE;
+
+	sys_prot |= (prot & RTE_PROT_READ) ? PROT_READ : 0;
+	sys_prot |= (prot & RTE_PROT_WRITE) ? PROT_WRITE : 0;
+	sys_prot |= (prot & RTE_PROT_EXECUTE) ? PROT_EXEC : 0;
+
+	return sys_prot;
+}
+
+/*
+ * OS-independent mmap() wrapper: translate RTE_PROT_* and RTE_MAP_*
+ * flags into their POSIX counterparts and delegate to mem_map(),
+ * which returns NULL (not MAP_FAILED) on error with rte_errno set.
+ */
+void *
+rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
+ int fd, size_t offset)
+{
+ int sys_flags = 0;
+ int sys_prot;
+
+ sys_prot = mem_rte_to_sys_prot(prot);
+
+ if (flags & RTE_MAP_SHARED)
+ sys_flags |= MAP_SHARED;
+ if (flags & RTE_MAP_ANONYMOUS)
+ sys_flags |= MAP_ANONYMOUS;
+ if (flags & RTE_MAP_PRIVATE)
+ sys_flags |= MAP_PRIVATE;
+ if (flags & RTE_MAP_FORCE_ADDRESS)
+ sys_flags |= MAP_FIXED;
+
+ return mem_map(requested_addr, size, sys_prot, sys_flags, fd, offset);
+}
+
+/* OS-independent munmap() wrapper; logs and sets rte_errno on failure. */
+int
+rte_mem_unmap(void *virt, size_t size)
+{
+ return mem_unmap(virt, size);
+}
+
+/*
+ * Return the system page size, caching the sysconf() result after the
+ * first successful query.
+ */
+size_t
+rte_get_page_size(void)
+{
+	static size_t page_size;
+
+	if (!page_size) {
+		long sz = sysconf(_SC_PAGESIZE);
+		/* Do not cache a failure: the previous code stored
+		 * (size_t)-1 in the static forever if sysconf() ever
+		 * returned -1, and relied on an implicit signed-to-
+		 * unsigned conversion.
+		 */
+		if (sz < 0)
+			return (size_t)-1;
+		page_size = (size_t)sz;
+	}
+
+	return page_size;
+}
+
+/*
+ * Lock in physical memory all pages crossed by [virt, virt + size)
+ * using mlock(). Returns 0 on success; on failure returns mlock()'s
+ * -1 and stores errno in rte_errno.
+ *
+ * NOTE(review): some systems require virt to be page-aligned for
+ * mlock(); callers such as rte_mem_lock_page() align it first.
+ */
+int
+rte_mem_lock(const void *virt, size_t size)
+{
+ int ret = mlock(virt, size);
+ if (ret)
+ rte_errno = errno;
+ return ret;
+}
@@ -3,4 +3,5 @@
sources += files(
'eal_unix.c',
+ 'eal_unix_memory.c',
)