From patchwork Fri Dec 24 22:59:23 2021
X-Patchwork-Submitter: Dharmik Thakkar
X-Patchwork-Id: 105406
X-Patchwork-Delegate: thomas@monjalon.net
From: Dharmik Thakkar
To: Olivier Matz, Andrew Rybchenko
Cc: dev@dpdk.org, nd@arm.com, honnappa.nagarahalli@arm.com,
 ruifeng.wang@arm.com, Dharmik Thakkar
Subject: [PATCH 1/1] mempool: implement index-based per core cache
Date: Fri, 24 Dec 2021 16:59:23 -0600
Message-Id: <20211224225923.806498-2-dharmik.thakkar@arm.com>
In-Reply-To: <20211224225923.806498-1-dharmik.thakkar@arm.com>
References: <20210930172735.2675627-1-dharmik.thakkar@arm.com>
 <20211224225923.806498-1-dharmik.thakkar@arm.com>

The current mempool per-core cache implementation stores pointers to
mbufs. On 64-bit architectures, each pointer consumes 8 bytes. This
patch replaces it with an index-based implementation, wherein each
buffer is addressed by (pool base address + index). This reduces the
amount of memory/cache required for the per-core cache.

L3Fwd performance testing reveals minor improvements in cache
performance (L1 and L2 misses reduced by 0.60%) with no change in
throughput.

Suggested-by: Honnappa Nagarahalli
Signed-off-by: Dharmik Thakkar
Reviewed-by: Ruifeng Wang
---
 lib/mempool/rte_mempool.h             | 114 +++++++++++++++++++++++++-
 lib/mempool/rte_mempool_ops_default.c |   7 ++
 2 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 1e7a3c15273c..4fabd3b1920b 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -50,6 +50,10 @@
 #include 
 #include 
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#include 
+#endif
+
 #include "rte_mempool_trace_fp.h"
 
 #ifdef __cplusplus
@@ -239,6 +243,9 @@ struct rte_mempool {
 	int32_t ops_index;
 
 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	void *pool_base_value; /**< Base value to calculate indices */
+#endif
 
 	uint32_t populated_size;         /**< Number of populated objects. */
 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	if (cache == NULL || cache->len == 0)
 		return;
 	rte_mempool_trace_cache_flush(cache, mp);
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	unsigned int i;
+	unsigned int cache_len = cache->len;
+	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+	void *base_value = mp->pool_base_value;
+	uint32_t *cache_objs = (uint32_t *) cache->objs;
+	for (i = 0; i < cache_len; i++)
+		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
+#else
 	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+#endif
 	cache->len = 0;
 }
 
@@ -1334,8 +1353,13 @@ static __rte_always_inline void
 rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 			   unsigned int n, struct rte_mempool_cache *cache)
 {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	uint32_t *cache_objs;
+	void *base_value;
+	uint32_t i;
+#else
 	void **cache_objs;
-
+#endif
 	/* increment stat now, adding in mempool always success */
 	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
 	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
@@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
 		goto ring_enqueue;
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	cache_objs = (uint32_t *) cache->objs;
+	cache_objs = &cache_objs[cache->len];
+	base_value = mp->pool_base_value;
+#else
 	cache_objs = &cache->objs[cache->len];
+#endif
 
 	/*
 	 * The cache follows the following algorithm
@@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	 */
 
 	/* Add elements back into the cache */
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+	uint32x2_t v_cache_objs;
+
+	for (i = 0; i < (n & ~0x1); i += 2) {
+		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+		vst1_u32(cache_objs + i, v_cache_objs);
+	}
+	if (n & 0x1) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#else
+	for (i = 0; i < n; i++) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#endif
+#else
 	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+#endif
 
 	cache->len += n;
 
 	if (cache->len >= cache->flushthresh) {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
+				cache->len - cache->size);
+#else
 		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
 				cache->len - cache->size);
+#endif
 		cache->len = cache->size;
 	}
 
@@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 {
 	int ret;
 	uint32_t index, len;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	uint32_t i;
+	uint32_t *cache_objs;
+#else
 	void **cache_objs;
-
+#endif
 	/* No cache provided or cannot be satisfied from cache */
 	if (unlikely(cache == NULL || n >= cache->size))
 		goto ring_dequeue;
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+	void *base_value = mp->pool_base_value;
+	cache_objs = (uint32_t *) cache->objs;
+#else
 	cache_objs = cache->objs;
+#endif
 
 	/* Can this be satisfied from the cache? */
 	if (cache->len < n) {
@@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 		uint32_t req = n + (cache->size - cache->len);
 
 		/* How many do we require i.e. number to fill the cache + the request */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+		ret = rte_mempool_ops_dequeue_bulk(mp,
+			temp_objs, req);
+#else
 		ret = rte_mempool_ops_dequeue_bulk(mp,
 			&cache->objs[cache->len], req);
+#endif
 		if (unlikely(ret < 0)) {
 			/*
 			 * In the off chance that we are buffer constrained,
@@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
 			goto ring_dequeue;
 		}
 
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		len = cache->len;
+		for (i = 0; i < req; ++i, ++len) {
+			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
+								base_value);
+		}
+#endif
 		cache->len += req;
 	}
 
 	/* Now fill in the response ... */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+	uint64x2_t v_obj_table;
+	uint64x2_t v_cache_objs;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
+	for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
+						len -= 4, obj_table += 4) {
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)obj_table, v_obj_table);
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
+	}
+	switch (n & 0x3) {
+	case 3:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+		/* fallthrough */
+	case 2:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+		/* fallthrough */
+	case 1:
+		*(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+	}
+#else
+	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
+#else
 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
 		*obj_table = cache_objs[len];
+#endif
 
 	cache->len -= n;
 
diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
index 22fccf9d7619..3543cad9d4ce 100644
--- a/lib/mempool/rte_mempool_ops_default.c
+++ b/lib/mempool/rte_mempool_ops_default.c
@@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
 		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+		/* Store pool base value to calculate indices for index-based
+		 * lcore cache implementation
+		 */
+		if (i == 0)
+			mp->pool_base_value = obj;
+#endif
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
 		off += mp->elt_size + mp->trailer_size;
 	}
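
The core idea of the patch, converting 64-bit object pointers into
32-bit offsets from a pool base address on put and converting them
back on get, can be illustrated outside of DPDK. Below is a minimal
standalone sketch of that conversion, not the patch's code: the names
(demo_cache, demo_obj_to_index, demo_index_to_obj) and the 256-byte
element size are hypothetical, and it assumes every object lies within
4 GB of the base address, which is the same constraint implied by
storing uint32_t indices in the per-core cache.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_CACHE_SIZE 8

struct demo_cache {
	uint32_t len;
	uint32_t objs[DEMO_CACHE_SIZE]; /* 32-bit offsets from the pool base */
};

/* Convert an object pointer to a 32-bit offset from the pool base. */
static inline uint32_t
demo_obj_to_index(const void *base, const void *obj)
{
	return (uint32_t)((uintptr_t)obj - (uintptr_t)base);
}

/* Convert a cached 32-bit offset back to an object pointer. */
static inline void *
demo_index_to_obj(void *base, uint32_t idx)
{
	return (char *)base + idx;
}

int main(void)
{
	/* Pretend this buffer is the mempool's contiguous object area. */
	char pool[4096];
	void *base = pool;
	struct demo_cache cache = { .len = 0 };

	/* "Put": store objects in the cache as offsets, not pointers. */
	for (uint32_t i = 0; i < 4; i++) {
		void *obj = pool + i * 256; /* 256-byte elements in this demo */
		cache.objs[cache.len++] = demo_obj_to_index(base, obj);
	}

	/* "Get": rebuild pointers as base + offset, in LIFO order. */
	while (cache.len > 0) {
		void *obj = demo_index_to_obj(base, cache.objs[--cache.len]);
		printf("got object at offset %" PRIu32 " (%p)\n",
		       demo_obj_to_index(base, obj), obj);
	}
	return 0;
}

The trade-off is an extra add or subtract on every put/get in exchange
for halving the per-slot footprint of the per-core cache, which is why
the patch keeps the pointer-based path as the default and gates the
index-based path behind RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE.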