Patch Detail
get:
Show a patch.
patch:
Update a patch.
put:
Update a patch.
GET /api/patches/133872/?format=api
http://patchwork.dpdk.org/api/patches/133872/?format=api", "web_url": "http://patchwork.dpdk.org/project/dpdk/patch/98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk/", "project": { "id": 1, "url": "http://patchwork.dpdk.org/api/projects/1/?format=api", "name": "DPDK", "link_name": "dpdk", "list_id": "dev.dpdk.org", "list_email": "dev@dpdk.org", "web_url": "http://core.dpdk.org", "scm_url": "git://dpdk.org/dpdk", "webscm_url": "http://git.dpdk.org/dpdk", "list_archive_url": "https://inbox.dpdk.org/dev", "list_archive_url_format": "https://inbox.dpdk.org/dev/{}", "commit_url_format": "" }, "msgid": "<98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk>", "list_archive_url": "https://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk", "date": "2023-11-04T17:29:40", "name": "[RFC] mempool: CPU cache aligning mempool driver accesses", "commit_ref": null, "pull_url": null, "state": "new", "archived": false, "hash": "0e79f41a913f94418ba8335498f4a87ff494cfd9", "submitter": { "id": 591, "url": "http://patchwork.dpdk.org/api/people/591/?format=api", "name": "Morten Brørup", "email": "mb@smartsharesystems.com" }, "delegate": { "id": 1, "url": "http://patchwork.dpdk.org/api/users/1/?format=api", "username": "tmonjalo", "first_name": "Thomas", "last_name": "Monjalon", "email": "thomas@monjalon.net" }, "mbox": "http://patchwork.dpdk.org/project/dpdk/patch/98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk/mbox/", "series": [ { "id": 30159, "url": "http://patchwork.dpdk.org/api/series/30159/?format=api", "web_url": "http://patchwork.dpdk.org/project/dpdk/list/?series=30159", "date": "2023-11-04T17:29:40", "name": "[RFC] mempool: CPU cache aligning mempool driver accesses", "version": 1, "mbox": "http://patchwork.dpdk.org/series/30159/mbox/" } ], "comments": "http://patchwork.dpdk.org/api/patches/133872/comments/", "check": "warning", "checks": "http://patchwork.dpdk.org/api/patches/133872/checks/", 
"tags": {}, "related": [], "headers": { "Return-Path": "<dev-bounces@dpdk.org>", "X-Original-To": "patchwork@inbox.dpdk.org", "Delivered-To": "patchwork@inbox.dpdk.org", "Received": [ "from mails.dpdk.org (mails.dpdk.org [217.70.189.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 3345143287;\n\tSat, 4 Nov 2023 18:29:44 +0100 (CET)", "from mails.dpdk.org (localhost [127.0.0.1])\n\tby mails.dpdk.org (Postfix) with ESMTP id B494F4029B;\n\tSat, 4 Nov 2023 18:29:43 +0100 (CET)", "from dkmailrelay1.smartsharesystems.com\n (smartserver.smartsharesystems.com [77.243.40.215])\n by mails.dpdk.org (Postfix) with ESMTP id D84C640282\n for <dev@dpdk.org>; Sat, 4 Nov 2023 18:29:42 +0100 (CET)", "from smartserver.smartsharesystems.com\n (smartserver.smartsharesys.local [192.168.4.10])\n by dkmailrelay1.smartsharesystems.com (Postfix) with ESMTP id AB539206C3\n for <dev@dpdk.org>; Sat, 4 Nov 2023 18:29:42 +0100 (CET)" ], "Content-class": "urn:content-classes:message", "MIME-Version": "1.0", "Content-Type": "text/plain;\n\tcharset=\"iso-8859-1\"", "Content-Transfer-Encoding": "quoted-printable", "Subject": "[RFC] mempool: CPU cache aligning mempool driver accesses", "X-MimeOLE": "Produced By Microsoft Exchange V6.5", "Date": "Sat, 4 Nov 2023 18:29:40 +0100", "Message-ID": "<98CBD80474FA8B44BF855DF32C47DC35E9EFD4@smartserver.smartshare.dk>", "X-MS-Has-Attach": "", "X-MS-TNEF-Correlator": "", "Thread-Topic": "[RFC] mempool: CPU cache aligning mempool driver accesses", "Thread-Index": "AdoPRH2dqaLCk8BNQ/aikLEUxLBH7w==", "From": "=?iso-8859-1?q?Morten_Br=F8rup?= <mb@smartsharesystems.com>", "To": "<dev@dpdk.org>", "X-BeenThere": "dev@dpdk.org", "X-Mailman-Version": "2.1.29", "Precedence": "list", "List-Id": "DPDK patches and discussions <dev.dpdk.org>", "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>", "List-Archive": "<http://mails.dpdk.org/archives/dev/>", "List-Post": "<mailto:dev@dpdk.org>", "List-Help": 
"<mailto:dev-request@dpdk.org?subject=help>", "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>", "Errors-To": "dev-bounces@dpdk.org" }, "content": "I tried a little experiment, which gave a 25 % improvement in mempool\nperf tests for long bursts (n_get_bulk=32 n_put_bulk=32 n_keep=512\nconstant_n=0) on a Xeon E5-2620 v4 based system.\n\nThis is the concept:\n\nIf all accesses to the mempool driver go through the mempool cache,\nwe can ensure that these bulk load/stores are always CPU cache aligned,\nby using cache->size when loading/storing to the mempool driver.\n\nFurthermore, it is rumored that most applications use the default\nmempool cache size, so if the driver tests for that specific value,\nit can use rte_memcpy(dst,src,N) with N known at build time, allowing\noptimal performance for copying the array of objects.\n\nUnfortunately, I need to change the flush threshold from 1.5 to 2 to\nbe able to always use cache->size when loading/storing to the mempool\ndriver.\n\nWhat do you think?\n\nPS: If we can't get rid of the mempool cache size threshold factor,\nwe really need to expose it through public APIs. 
A job for another day.\n\nSigned-off-by: Morten Brørup <mb@smartsharesystems.com>\n---", "diff": "diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c\nindex 7a7a9bf6db..b21033209b 100644\n--- a/lib/mempool/rte_mempool.c\n+++ b/lib/mempool/rte_mempool.c\n@@ -48,7 +48,7 @@ static void\n mempool_event_callback_invoke(enum rte_mempool_event event,\n struct rte_mempool *mp);\n\n-#define CACHE_FLUSHTHRESH_MULTIPLIER 1.5\n+#define CACHE_FLUSHTHRESH_MULTIPLIER 2\n #define CALC_CACHE_FLUSHTHRESH(c) \\\n ((typeof(c))((c) * CACHE_FLUSHTHRESH_MULTIPLIER))\n\ndiff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h\nindex df87cd231e..76efeff59e 100644\n--- a/lib/mempool/rte_mempool.h\n+++ b/lib/mempool/rte_mempool.h\n@@ -1014,7 +1014,7 @@ typedef void (rte_mempool_ctor_t)(struct rte_mempool *, void *);\n * If cache_size is non-zero, the rte_mempool library will try to\n * limit the accesses to the common lockless pool, by maintaining a\n * per-lcore object cache. This argument must be lower or equal to\n- * RTE_MEMPOOL_CACHE_MAX_SIZE and n / 1.5. It is advised to choose\n+ * RTE_MEMPOOL_CACHE_MAX_SIZE and n / 2. It is advised to choose\n * cache_size to have \"n modulo cache_size == 0\": if this is\n * not the case, some elements will always stay in the pool and will\n * never be used. The access to the per-lcore table is of course\n@@ -1373,24 +1373,24 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,\n RTE_MEMPOOL_CACHE_STAT_ADD(cache, put_objs, n);\n\n /* The request itself is too big for the cache */\n- if (unlikely(n > cache->flushthresh))\n+ if (unlikely(n > cache->size))\n goto driver_enqueue_stats_incremented;\n\n /*\n * The cache follows the following algorithm:\n * 1. If the objects cannot be added to the cache without crossing\n- * the flush threshold, flush the cache to the backend.\n+ * the flush threshold, flush a fixed amount of the cache to the backend.\n * 2. 
Add the objects to the cache.\n */\n\n if (cache->len + n <= cache->flushthresh) {\n cache_objs = &cache->objs[cache->len];\n- cache->len += n;\n } else {\n- cache_objs = &cache->objs[0];\n- rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->len);\n- cache->len = n;\n+ cache->len -= cache->size;\n+ cache_objs = &cache->objs[cache->len];\n+ rte_mempool_ops_enqueue_bulk(mp, cache_objs, cache->size);\n }\n+ cache->len += n;\n\n /* Add the objects to the cache. */\n rte_memcpy(cache_objs, obj_table, sizeof(void *) * n);\n@@ -1547,13 +1547,13 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,\n return 0;\n }\n\n- /* if dequeue below would overflow mem allocated for cache */\n- if (unlikely(remaining > RTE_MEMPOOL_CACHE_MAX_SIZE))\n+ /* More remaining than the cache size */\n+ if (unlikely(remaining > cache->size))\n goto driver_dequeue;\n\n- /* Fill the cache from the backend; fetch size + remaining objects. */\n+ /* Fill the cache from the backend; fetch size objects. */\n ret = rte_mempool_ops_dequeue_bulk(mp, cache->objs,\n- cache->size + remaining);\n+ cache->size);\n if (unlikely(ret < 0)) {\n /*\n * We are buffer constrained, and not able to allocate\n@@ -1565,11 +1565,11 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,\n }\n\n /* Satisfy the remaining part of the request from the filled cache. 
*/\n- cache_objs = &cache->objs[cache->size + remaining];\n+ cache_objs = &cache->objs[cache->size];\n for (index = 0; index < remaining; index++)\n *obj_table++ = *--cache_objs;\n\n- cache->len = cache->size;\n+ cache->len = cache->size - remaining;\n\n RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_bulk, 1);\n RTE_MEMPOOL_CACHE_STAT_ADD(cache, get_success_objs, n);\ndiff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h\nindex 4b80f58980..2b10b76fc1 100644\n--- a/lib/ring/rte_ring_elem_pvt.h\n+++ b/lib/ring/rte_ring_elem_pvt.h\n@@ -10,6 +10,9 @@\n #ifndef _RTE_RING_ELEM_PVT_H_\n #define _RTE_RING_ELEM_PVT_H_\n\n+#include <rte_config.h>\n+#include <rte_memcpy.h>\n+\n #if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 120000)\n #pragma GCC diagnostic push\n #pragma GCC diagnostic ignored \"-Wstringop-overflow\"\n@@ -24,6 +27,12 @@ __rte_ring_enqueue_elems_32(struct rte_ring *r, const uint32_t size,\n uint32_t *ring = (uint32_t *)&r[1];\n const uint32_t *obj = (const uint32_t *)obj_table;\n if (likely(idx + n <= size)) {\n+#ifdef RTE_ARCH_32\n+ if (n == RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+ rte_memcpy(&ring[idx], obj_table, RTE_MEMPOOL_CACHE_MAX_SIZE * sizeof(uint32_t));\n+ return;\n+ }\n+#endif\n for (i = 0; i < (n & ~0x7); i += 8, idx += 8) {\n ring[idx] = obj[i];\n ring[idx + 1] = obj[i + 1];\n@@ -69,6 +78,12 @@ __rte_ring_enqueue_elems_64(struct rte_ring *r, uint32_t prod_head,\n uint64_t *ring = (uint64_t *)&r[1];\n const unaligned_uint64_t *obj = (const unaligned_uint64_t *)obj_table;\n if (likely(idx + n <= size)) {\n+#ifdef RTE_ARCH_64\n+ if (n == RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+ rte_memcpy(&ring[idx], obj_table, RTE_MEMPOOL_CACHE_MAX_SIZE * sizeof(uint64_t));\n+ return;\n+ }\n+#endif\n for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {\n ring[idx] = obj[i];\n ring[idx + 1] = obj[i + 1];\n@@ -158,6 +173,12 @@ __rte_ring_dequeue_elems_32(struct rte_ring *r, const uint32_t size,\n uint32_t *ring = (uint32_t *)&r[1];\n uint32_t *obj = (uint32_t 
*)obj_table;\n if (likely(idx + n <= size)) {\n+#ifdef RTE_ARCH_32\n+ if (n == RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+ rte_memcpy(obj_table, &ring[idx], RTE_MEMPOOL_CACHE_MAX_SIZE * sizeof(uint32_t));\n+ return;\n+ }\n+#endif\n for (i = 0; i < (n & ~0x7); i += 8, idx += 8) {\n obj[i] = ring[idx];\n obj[i + 1] = ring[idx + 1];\n@@ -203,6 +224,12 @@ __rte_ring_dequeue_elems_64(struct rte_ring *r, uint32_t cons_head,\n uint64_t *ring = (uint64_t *)&r[1];\n unaligned_uint64_t *obj = (unaligned_uint64_t *)obj_table;\n if (likely(idx + n <= size)) {\n+#ifdef RTE_ARCH_64\n+ if (n == RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+ rte_memcpy(obj_table, &ring[idx], RTE_MEMPOOL_CACHE_MAX_SIZE * sizeof(uint64_t));\n+ return;\n+ }\n+#endif\n for (i = 0; i < (n & ~0x3); i += 4, idx += 4) {\n obj[i] = ring[idx];\n obj[i + 1] = ring[idx + 1];\n", "prefixes": [ "RFC" ] }{ "id": 133872, "url": "