diff mbox series

[v2,3/6] mem: add dirty malloc element support

Message ID 20220119210917.765505-4-dkozlyuk@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers show
Series Fast restart with many hugepages | expand

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Dmitry Kozlyuk Jan. 19, 2022, 9:09 p.m. UTC
EAL malloc layer assumed all free elements content
is filled with zeros ("clean"), as opposed to uninitialized ("dirty").
This assumption was ensured in two ways:
1. EAL memalloc layer always returned clean memory.
2. Freed memory was cleared before returning into the heap.

Clearing the memory can be as slow as around 14 GiB/s.
To save doing so, memalloc layer is allowed to return dirty memory.
Such segments being marked with RTE_MEMSEG_FLAG_DIRTY.
The allocator tracks elements that contain dirty memory
using the new flag in the element header.
When clean memory is requested via rte_zmalloc*()
and the suitable element is dirty, it is cleared on allocation.
When memory is deallocated, the freed element is joined
with adjacent free elements, and the dirty flag is updated:

a) If the joint element contains dirty parts, it is dirty:

    dirty + freed + dirty = dirty  =>  no need to clean
            freed + dirty = dirty      the freed memory

   Dirty parts may be large (e.g. initial allocation),
   so clearing them could create unpredictable slowdown.

b) If the only dirty part of the joint element
   is the freed memory, the joint element can be made clean:

    clean + freed + clean = clean  =>  freed memory
    clean + freed         = clean      must be cleared
            freed + clean = clean
            freed         = clean

   This logic naturally reproduces the old behavior
   and always applies in modes when EAL memalloc layer
   returns only clean segments.

As a result, memory is either cleared on free, as before,
or it will be cleared on allocation if need be, but never twice.

Signed-off-by: Dmitry Kozlyuk <dkozlyuk@nvidia.com>
---
 lib/eal/common/malloc_elem.c | 22 +++++++++++++++++++---
 lib/eal/common/malloc_elem.h | 11 +++++++++--
 lib/eal/common/malloc_heap.c | 18 ++++++++++++------
 lib/eal/common/rte_malloc.c  | 21 ++++++++++++++-------
 lib/eal/include/rte_memory.h |  8 ++++++--
 5 files changed, 60 insertions(+), 20 deletions(-)
diff mbox series

Patch

diff --git a/lib/eal/common/malloc_elem.c b/lib/eal/common/malloc_elem.c
index bdd20a162e..e04e0890fb 100644
--- a/lib/eal/common/malloc_elem.c
+++ b/lib/eal/common/malloc_elem.c
@@ -129,7 +129,7 @@  malloc_elem_find_max_iova_contig(struct malloc_elem *elem, size_t align)
 void
 malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,
 		struct rte_memseg_list *msl, size_t size,
-		struct malloc_elem *orig_elem, size_t orig_size)
+		struct malloc_elem *orig_elem, size_t orig_size, bool dirty)
 {
 	elem->heap = heap;
 	elem->msl = msl;
@@ -137,6 +137,7 @@  malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,
 	elem->next = NULL;
 	memset(&elem->free_list, 0, sizeof(elem->free_list));
 	elem->state = ELEM_FREE;
+	elem->dirty = dirty;
 	elem->size = size;
 	elem->pad = 0;
 	elem->orig_elem = orig_elem;
@@ -300,7 +301,7 @@  split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
 	const size_t new_elem_size = elem->size - old_elem_size;
 
 	malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size,
-			 elem->orig_elem, elem->orig_size);
+			elem->orig_elem, elem->orig_size, elem->dirty);
 	split_pt->prev = elem;
 	split_pt->next = next_elem;
 	if (next_elem)
@@ -506,6 +507,7 @@  join_elem(struct malloc_elem *elem1, struct malloc_elem *elem2)
 	else
 		elem1->heap->last = elem1;
 	elem1->next = next;
+	elem1->dirty |= elem2->dirty;
 	if (elem1->pad) {
 		struct malloc_elem *inner = RTE_PTR_ADD(elem1, elem1->pad);
 		inner->size = elem1->size - elem1->pad;
@@ -579,6 +581,14 @@  malloc_elem_free(struct malloc_elem *elem)
 	ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
 	data_len = elem->size - MALLOC_ELEM_OVERHEAD;
 
+	/*
+	 * Consider the element clean for the purposes of joining.
+	 * If both neighbors are clean or non-existent,
+	 * the joint element will be clean,
+	 * which means the memory should be cleared.
+	 * There is no need to clear the memory if the joint element is dirty.
+	 */
+	elem->dirty = false;
 	elem = malloc_elem_join_adjacent_free(elem);
 
 	malloc_elem_free_list_insert(elem);
@@ -588,8 +598,14 @@  malloc_elem_free(struct malloc_elem *elem)
 	/* decrease heap's count of allocated elements */
 	elem->heap->alloc_count--;
 
-	/* poison memory */
+#ifndef RTE_MALLOC_DEBUG
+	/* Normally clear the memory when needed. */
+	if (!elem->dirty)
+		memset(ptr, 0, data_len);
+#else
+	/* Always poison the memory in debug mode. */
 	memset(ptr, MALLOC_POISON, data_len);
+#endif
 
 	return elem;
 }
diff --git a/lib/eal/common/malloc_elem.h b/lib/eal/common/malloc_elem.h
index 15d8ba7af2..f2aa98821b 100644
--- a/lib/eal/common/malloc_elem.h
+++ b/lib/eal/common/malloc_elem.h
@@ -27,7 +27,13 @@  struct malloc_elem {
 	LIST_ENTRY(malloc_elem) free_list;
 	/**< list of free elements in heap */
 	struct rte_memseg_list *msl;
-	volatile enum elem_state state;
+	/** Element state, @c dirty and @c pad validity depends on it. */
+	/* An extra bit is needed to represent enum elem_state as signed int. */
+	enum elem_state state : 3;
+	/** If state == ELEM_FREE: the memory is not filled with zeroes. */
+	uint32_t dirty : 1;
+	/** Reserved for future use. */
+	uint32_t reserved : 28;
 	uint32_t pad;
 	size_t size;
 	struct malloc_elem *orig_elem;
@@ -320,7 +326,8 @@  malloc_elem_init(struct malloc_elem *elem,
 		struct rte_memseg_list *msl,
 		size_t size,
 		struct malloc_elem *orig_elem,
-		size_t orig_size);
+		size_t orig_size,
+		bool dirty);
 
 void
 malloc_elem_insert(struct malloc_elem *elem);
diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
index 55aad2711b..24080fc473 100644
--- a/lib/eal/common/malloc_heap.c
+++ b/lib/eal/common/malloc_heap.c
@@ -93,11 +93,11 @@  malloc_socket_to_heap_id(unsigned int socket_id)
  */
 static struct malloc_elem *
 malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
-		void *start, size_t len)
+		void *start, size_t len, bool dirty)
 {
 	struct malloc_elem *elem = start;
 
-	malloc_elem_init(elem, heap, msl, len, elem, len);
+	malloc_elem_init(elem, heap, msl, len, elem, len, dirty);
 
 	malloc_elem_insert(elem);
 
@@ -135,7 +135,8 @@  malloc_add_seg(const struct rte_memseg_list *msl,
 
 	found_msl = &mcfg->memsegs[msl_idx];
 
-	malloc_heap_add_memory(heap, found_msl, ms->addr, len);
+	malloc_heap_add_memory(heap, found_msl, ms->addr, len,
+			ms->flags & RTE_MEMSEG_FLAG_DIRTY);
 
 	heap->total_size += len;
 
@@ -303,7 +304,8 @@  alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
 	struct rte_memseg_list *msl;
 	struct malloc_elem *elem = NULL;
 	size_t alloc_sz;
-	int allocd_pages;
+	int allocd_pages, i;
+	bool dirty = false;
 	void *ret, *map_addr;
 
 	alloc_sz = (size_t)pg_sz * n_segs;
@@ -372,8 +374,12 @@  alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
 		goto fail;
 	}
 
+	/* Element is dirty if it contains at least one dirty page. */
+	for (i = 0; i < allocd_pages; i++)
+		dirty |= ms[i]->flags & RTE_MEMSEG_FLAG_DIRTY;
+
 	/* add newly minted memsegs to malloc heap */
-	elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz);
+	elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz, dirty);
 
 	/* try once more, as now we have allocated new memory */
 	ret = find_suitable_element(heap, elt_size, flags, align, bound,
@@ -1260,7 +1266,7 @@  malloc_heap_add_external_memory(struct malloc_heap *heap,
 	memset(msl->base_va, 0, msl->len);
 
 	/* now, add newly minted memory to the malloc heap */
-	malloc_heap_add_memory(heap, msl, msl->base_va, msl->len);
+	malloc_heap_add_memory(heap, msl, msl->base_va, msl->len, false);
 
 	heap->total_size += msl->len;
 
diff --git a/lib/eal/common/rte_malloc.c b/lib/eal/common/rte_malloc.c
index d0bec26920..71a3f7ecb4 100644
--- a/lib/eal/common/rte_malloc.c
+++ b/lib/eal/common/rte_malloc.c
@@ -115,15 +115,22 @@  rte_zmalloc_socket(const char *type, size_t size, unsigned align, int socket)
 {
 	void *ptr = rte_malloc_socket(type, size, align, socket);
 
+	if (ptr != NULL) {
+		struct malloc_elem *elem = malloc_elem_from_data(ptr);
+
+		if (elem->dirty) {
+			memset(ptr, 0, size);
+		} else {
 #ifdef RTE_MALLOC_DEBUG
-	/*
-	 * If DEBUG is enabled, then freed memory is marked with poison
-	 * value and set to zero on allocation.
-	 * If DEBUG is not enabled then  memory is already zeroed.
-	 */
-	if (ptr != NULL)
-		memset(ptr, 0, size);
+			/*
+			 * If DEBUG is enabled, then freed memory is marked
+			 * with a poison value and set to zero on allocation.
+			 * If DEBUG is disabled then memory is already zeroed.
+			 */
+			memset(ptr, 0, size);
 #endif
+		}
+	}
 
 	rte_eal_trace_mem_zmalloc(type, size, align, socket, ptr);
 	return ptr;
diff --git a/lib/eal/include/rte_memory.h b/lib/eal/include/rte_memory.h
index 6d018629ae..68b069fd04 100644
--- a/lib/eal/include/rte_memory.h
+++ b/lib/eal/include/rte_memory.h
@@ -19,6 +19,7 @@ 
 extern "C" {
 #endif
 
+#include <rte_bitops.h>
 #include <rte_common.h>
 #include <rte_compat.h>
 #include <rte_config.h>
@@ -37,11 +38,14 @@  extern "C" {
 
 #define SOCKET_ID_ANY -1                    /**< Any NUMA socket. */
 
+/** Prevent this segment from being freed back to the OS. */
+#define RTE_MEMSEG_FLAG_DO_NOT_FREE RTE_BIT32(0)
+/** This segment is not filled with zeros. */
+#define RTE_MEMSEG_FLAG_DIRTY RTE_BIT32(1)
+
 /**
  * Physical memory segment descriptor.
  */
-#define RTE_MEMSEG_FLAG_DO_NOT_FREE (1 << 0)
-/**< Prevent this segment from being freed back to the OS. */
 struct rte_memseg {
 	rte_iova_t iova;            /**< Start IO address. */
 	RTE_STD_C11