[RFC,v2] eal/linux: add support for fast virt/iova translation

Message ID 20220914211201.32940-1-donw@xsightlabs.com (mailing list archive)
State Changes Requested
Delegated to: David Marchand
Headers
Series [RFC,v2] eal/linux: add support for fast virt/iova translation |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail Compilation issues
ci/intel-Testing success Testing PASS

Commit Message

Don Wallwork Sept. 14, 2022, 9:12 p.m. UTC
This patch maps hugepage memory such that address translation from
virtual to iova or vice versa can be done by simple addition/
subtraction of a constant value without any page table walks.

A new '--const-translate' EAL option is added to enable this mode.

The following example describes how this works:

Say you have a system with 4 huge pages that are 1G each and the
physical addresses are 10, 11, 17 and 22G. If we map 13G of virtual
address space, that will be enough to cover all of the huge page
physical addresses.

If the VA region starts at 1G, all of the hugepage PAs can
be mapped into that region as shown below under the Proposed
heading.  For comparison, the existing mapping that would be
done in legacy mode is shown under the Current heading.

Proposed      Current (Legacy)

 VA | PA         VA | PA
----+----       ----+----
 1G | 10G        1G | 10G
 2G | 11G        2G | 11G
 3G |  -         3G |  -
 4G |  -         4G | 17G
 5G |  -         5G |  -
 6G |  -         6G | 22G
 7G |  -
 8G | 17G
 9G |  -
10G |  -
11G |  -
12G |  -
13G | 22G

So in this example, we have a fixed offset of 9G to translate
between VA and PA in either direction. This works whether the huge
pages happen to be allocated statically (legacy mode) or
dynamically.

The unused VA address space from 3G-7G and 9G-12G can be
unmapped in just two unmap calls.

This patch applies to legacy-mem mode only.

Signed-off-by: Don Wallwork <donw@xsightlabs.com>
---
 doc/guides/linux_gsg/eal_args.include.rst     |   5 +
 .../prog_guide/env_abstraction_layer.rst      |  10 ++
 lib/eal/common/eal_common_options.c           |   6 +
 lib/eal/common/eal_internal_cfg.h             |   2 +
 lib/eal/common/eal_options.h                  |   2 +
 lib/eal/include/rte_memory.h                  |  28 +++
 lib/eal/linux/eal.c                           |   6 +
 lib/eal/linux/eal_memory.c                    | 165 +++++++++++++++++-
 8 files changed, 218 insertions(+), 6 deletions(-)
  

Patch

diff --git a/doc/guides/linux_gsg/eal_args.include.rst b/doc/guides/linux_gsg/eal_args.include.rst
index 9cfbf7de84..cb3d554f5a 100644
--- a/doc/guides/linux_gsg/eal_args.include.rst
+++ b/doc/guides/linux_gsg/eal_args.include.rst
@@ -122,6 +122,11 @@  Memory-related options
     to system pthread stack size unless the optional size (in kbytes) is
     specified.
 
+*   ``--const-translate``
+
+    Prepare hugepage memory such that every hugepage virtual address is at a
+    constant offset from its physical address, and vice versa.
+
 Debugging options
 ~~~~~~~~~~~~~~~~~
 
diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index 67842ae272..c7ffada7ef 100644
--- a/doc/guides/prog_guide/env_abstraction_layer.rst
+++ b/doc/guides/prog_guide/env_abstraction_layer.rst
@@ -350,6 +350,16 @@  if the optional size parameter is not specified.
     hugepage worker thread stacks given the same thread stack size and
     loading conditions.
 
+Constant Address Translation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When the ``--const-translate`` EAL option is specified, hugepage
+memory is initialized to provide a constant offset between hugepage
+virtual and physical addresses.
+
+This allows device drivers to quickly translate from both virtual to
+physical and physical to virtual addresses for any hugepage address.
+
 Support for Externally Allocated Memory
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lib/eal/common/eal_common_options.c b/lib/eal/common/eal_common_options.c
index 4c2def0155..ba4151a65c 100644
--- a/lib/eal/common/eal_common_options.c
+++ b/lib/eal/common/eal_common_options.c
@@ -104,6 +104,7 @@  eal_long_options[] = {
 	{OPT_NO_TELEMETRY,      0, NULL, OPT_NO_TELEMETRY_NUM     },
 	{OPT_FORCE_MAX_SIMD_BITWIDTH, 1, NULL, OPT_FORCE_MAX_SIMD_BITWIDTH_NUM},
 	{OPT_HUGE_WORKER_STACK, 2, NULL, OPT_HUGE_WORKER_STACK_NUM     },
+	{OPT_CONST_TRANSLATE,   0, NULL, OPT_CONST_TRANSLATE_NUM },
 
 	{0,                     0, NULL, 0                        }
 };
@@ -2086,6 +2087,11 @@  eal_check_common_options(struct internal_config *internal_cfg)
 			"be specified together with --"OPT_NO_HUGE"\n");
 		return -1;
 	}
+	if (internal_cfg->no_hugetlbfs && internal_cfg->const_translate) {
+		RTE_LOG(ERR, EAL, "Option --"OPT_CONST_TRANSLATE" cannot "
+			"be specified together with --"OPT_NO_HUGE"\n");
+		return -1;
+	}
 	if (internal_conf->force_socket_limits && internal_conf->legacy_mem) {
 		RTE_LOG(ERR, EAL, "Option --"OPT_SOCKET_LIMIT
 			" is only supported in non-legacy memory mode\n");
diff --git a/lib/eal/common/eal_internal_cfg.h b/lib/eal/common/eal_internal_cfg.h
index 167ec501fa..d4a034d823 100644
--- a/lib/eal/common/eal_internal_cfg.h
+++ b/lib/eal/common/eal_internal_cfg.h
@@ -103,6 +103,8 @@  struct internal_config {
 	struct simd_bitwidth max_simd_bitwidth;
 	/**< max simd bitwidth path to use */
 	size_t huge_worker_stack_size; /**< worker thread stack size */
+	volatile unsigned const_translate;
+	/**< true to enable constant VA->PA, PA->VA address translation */
 };
 
 void eal_reset_internal_config(struct internal_config *internal_cfg);
diff --git a/lib/eal/common/eal_options.h b/lib/eal/common/eal_options.h
index 3cc9cb6412..98cd22fd32 100644
--- a/lib/eal/common/eal_options.h
+++ b/lib/eal/common/eal_options.h
@@ -89,6 +89,8 @@  enum {
 	OPT_FORCE_MAX_SIMD_BITWIDTH_NUM,
 #define OPT_HUGE_WORKER_STACK  "huge-worker-stack"
 	OPT_HUGE_WORKER_STACK_NUM,
+#define OPT_CONST_TRANSLATE    "const-translate"
+	OPT_CONST_TRANSLATE_NUM,
 
 	OPT_LONG_MAX_NUM
 };
diff --git a/lib/eal/include/rte_memory.h b/lib/eal/include/rte_memory.h
index 68b069fd04..c87777ca01 100644
--- a/lib/eal/include/rte_memory.h
+++ b/lib/eal/include/rte_memory.h
@@ -134,6 +134,34 @@  rte_iova_t rte_mem_virt2iova(const void *virt);
 void *
 rte_mem_iova2virt(rte_iova_t iova);
 
+/**
+ * Get IO virtual address of any mapped virtual address in the current process.
+ *
+ * @note This function provides a fast implementation of virtual to physical
+ *       address translation that does not walk any page tables.  Suitable for
+ *       use in data plane threads.
+ *
+ * @param virt
+ *   The virtual address.
+ * @return
+ *   The IO address or RTE_BAD_IOVA on error.
+ */
+rte_iova_t rte_mem_fast_virt2iova(const void *virt);
+
+/**
+ * Get virtual memory address corresponding to iova address.
+ *
+ * @note This function provides a fast implementation of physical to virtual
+ *       address translation.  Suitable for use in data plane threads.
+ *
+ * @param iova
+ *   The iova address.
+ * @return
+ *   Virtual address corresponding to iova address (or NULL if address does not
+ *   exist within DPDK memory map).
+ */
+void *rte_mem_fast_iova2virt(rte_iova_t iova);
+
 /**
  * Get memseg to which a particular virtual address belongs.
  *
diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index 37d29643a5..6e4e60d8c1 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -455,6 +455,8 @@  eal_usage(const char *prgname)
 	       "                      Allocate worker thread stacks from hugepage memory.\n"
 	       "                      Size is in units of kbytes and defaults to system\n"
 	       "                      thread stack size if not specified.\n"
+	       "  --"OPT_CONST_TRANSLATE"  Constant delta between hugepage "
+	       "physical and virtual addresses\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if (hook) {
@@ -767,6 +769,10 @@  eal_parse_args(int argc, char **argv)
 			}
 			break;
 
+		case OPT_CONST_TRANSLATE_NUM:
+			internal_conf->const_translate = 1;
+			break;
+
 		default:
 			if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
 				RTE_LOG(ERR, EAL, "Option %c is not supported "
diff --git a/lib/eal/linux/eal_memory.c b/lib/eal/linux/eal_memory.c
index c890c42106..3fefb3dc9d 100644
--- a/lib/eal/linux/eal_memory.c
+++ b/lib/eal/linux/eal_memory.c
@@ -148,6 +148,47 @@  rte_mem_virt2iova(const void *virtaddr)
 	return rte_mem_virt2phy(virtaddr);
 }
 
+static void *const_va_pa_delta;
+
+#ifdef RTE_MEM_SANITY_CHECK
+#define __rte_mem_validate(v) rte_mem_validate(v)
+/* Return 0 if virtaddr belongs to a memseg; otherwise log and return -1. */
+static int rte_mem_validate(const void *virtaddr)
+{
+	if (!rte_mem_virt2memseg(virtaddr, NULL)) {
+		RTE_LOG(ERR, EAL, "Invalid virtual address %p\n", virtaddr);
+		return -1;
+	}
+	return 0;
+}
+#else
+#define __rte_mem_validate(v) 0 /* sanity check compiled out: always passes */
+#endif
+
+rte_iova_t rte_mem_fast_virt2iova(const void *virtaddr)
+{
+	if (rte_eal_iova_mode() == RTE_IOVA_VA)
+		return (uintptr_t)virtaddr;
+
+	if (__rte_mem_validate(virtaddr) != 0)
+		return RTE_BAD_IOVA;
+
+	return (rte_iova_t)((uintptr_t)virtaddr - (uintptr_t)const_va_pa_delta);
+}
+
+void *rte_mem_fast_iova2virt(rte_iova_t iova)
+{
+	if (rte_eal_iova_mode() == RTE_IOVA_VA)
+		return (void *)(uintptr_t)iova;
+
+	void *virtaddr = (void *)((uintptr_t)const_va_pa_delta + iova);
+
+	if (__rte_mem_validate(virtaddr) != 0)
+		return NULL;
+
+	return virtaddr;
+}
+
 /*
  * For each hugepage in hugepg_tbl, fill the physaddr value. We find
  * it by browsing the /proc/self/pagemap special file.
@@ -664,10 +705,9 @@  remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
 	uint64_t page_sz;
 	size_t memseg_len;
 	int socket_id;
-#ifndef RTE_ARCH_64
 	const struct internal_config *internal_conf =
 		eal_get_internal_configuration();
-#endif
+
 	page_sz = hugepages[seg_start].size;
 	socket_id = hugepages[seg_start].socket_id;
 	seg_len = seg_end - seg_start;
@@ -691,6 +731,12 @@  remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
 		ms_idx = rte_fbarray_find_next_n_free(arr, 0,
 				seg_len + (empty ? 0 : 1));
 
+		if (internal_conf->const_translate &&
+		    internal_conf->legacy_mem &&
+		    rte_eal_iova_mode() == RTE_IOVA_PA &&
+		    ms_idx != 0)
+			continue;
+
 		/* memseg list is full? */
 		if (ms_idx < 0)
 			continue;
@@ -735,7 +781,12 @@  remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
 			return -1;
 		}
 		memseg_len = (size_t)page_sz;
-		addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
+		if (internal_conf->const_translate &&
+		    internal_conf->legacy_mem &&
+		    rte_eal_iova_mode() == RTE_IOVA_PA)
+			addr = RTE_PTR_ADD(const_va_pa_delta, hfile->physaddr);
+		else
+			addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
 
 		/* we know this address is already mmapped by memseg list, so
 		 * using MAP_FIXED here is safe
@@ -1085,6 +1136,98 @@  huge_recover_sigbus(void)
 	}
 }
 
+static int
+remap_hugepages_const_xlate(struct hugepage_file *hugepage, int n_pages,
+			    int nr_hugepages)
+{
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	int i, remap_failed = 0;
+	void *addr;
+
+	/* Adjust VA bases in memory segment lists to enable constant
+	 * va->pa and pa->va address translation
+	 */
+	if (rte_eal_iova_mode() == RTE_IOVA_PA) {
+		RTE_LOG(INFO, EAL,
+			"Enabling constant address translation support...\n");
+
+		/* Allocate virtual address space to cover the full
+		 * range of huge page physical addresses
+		 */
+		size_t va_mem_sz =
+			hugepage[nr_hugepages - 1].physaddr +
+			hugepage[nr_hugepages - 1].size -
+			hugepage[0].physaddr;
+		size_t page_sz = 0;
+
+		for (i = 0; i < nr_hugepages; i++)
+			if (hugepage[i].size > page_sz)
+				page_sz = hugepage[i].size;
+
+		void *va_base =
+			eal_get_virtual_area(NULL, &va_mem_sz, page_sz, 0, 0);
+
+		if (va_base == NULL) {
+			RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+			return -ENOMEM;
+		}
+		const_va_pa_delta = RTE_PTR_ADD(va_base, -hugepage[0].physaddr);
+
+		/* Unmap gaps in virtual address space when there are gaps
+		 * between huge page physical addresses
+		 */
+		for (i = 1; i < nr_hugepages; i++) {
+			size_t gap_sz = hugepage[i].physaddr -
+				(hugepage[i-1].physaddr + hugepage[i-1].size);
+
+			if (gap_sz) {
+				addr = RTE_PTR_ADD(const_va_pa_delta,
+						   hugepage[i-1].physaddr +
+						   hugepage[i-1].size);
+
+				if (munmap(addr, gap_sz) != 0)
+					RTE_LOG(ERR, EAL, "Gap unmap failed\n");
+			}
+		}
+	}
+
+	/* remap all pages we do need into memseg list VA space, so that those
+	 * pages become first-class citizens in DPDK memory subsystem
+	 */
+	if (remap_needed_hugepages(hugepage, n_pages)) {
+		RTE_LOG(ERR, EAL,
+			"Couldn't remap hugepage files into memseg lists\n");
+		remap_failed = 1;
+	}
+
+	/* Unmap the existing virtual address space in each MSL with
+	 * allocated pages. Modify MSL base_va to be the VA of the
+	 * first page of the segment list. Adjust the msl->len to the
+	 * length of the address space consumed by the msl.
+	 */
+	for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+		struct rte_memseg_list *msl = &mcfg->memsegs[i];
+		struct rte_memseg *ms;
+
+		/* skip inactive lists */
+		if (msl->base_va == NULL)
+			continue;
+
+		/* skip lists where there are no pages allocated */
+		if (!msl->memseg_arr.count)
+			continue;
+
+		/* release current VA space */
+		munmap(msl->base_va, msl->len);
+
+		/* assign new VA base and len */
+		ms = rte_fbarray_get(&msl->memseg_arr, 0);
+		msl->base_va = ms->addr;
+		msl->len = (msl->page_sz * msl->memseg_arr.count);
+	}
+	return remap_failed;
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1413,9 +1556,19 @@  eal_legacy_hugepage_init(void)
 	/* remap all pages we do need into memseg list VA space, so that those
 	 * pages become first-class citizens in DPDK memory subsystem
 	 */
-	if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
-		RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
-		goto fail;
+	if (internal_conf->const_translate) {
+		if (remap_hugepages_const_xlate(hugepage, nr_hugefiles,
+						nr_hugepages)) {
+			RTE_LOG(ERR, EAL,
+				"Couldn't remap hugepage files into memseg lists\n");
+			goto fail;
+		}
+	} else {
+		if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
+			RTE_LOG(ERR, EAL,
+				"Couldn't remap hugepage files into memseg lists\n");
+			goto fail;
+		}
 	}
 
 	/* free the hugepage backing files */