[1/1] mempool: implement index-based per core cache
Checks
Commit Message
Current mempool per core cache implementation stores pointers to mbufs
On 64b architectures, each pointer consumes 8B
This patch replaces it with index-based implementation,
wherein each buffer is addressed by (pool base address + index)
It reduces the amount of memory/cache required for per core cache
L3Fwd performance testing reveals minor improvements in the cache
performance (L1 and L2 misses reduced by 0.60%)
with no change in throughput
Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
lib/mempool/rte_mempool.h | 114 +++++++++++++++++++++++++-
lib/mempool/rte_mempool_ops_default.c | 7 ++
2 files changed, 119 insertions(+), 2 deletions(-)
Comments
> Current mempool per core cache implementation stores pointers to mbufs
> On 64b architectures, each pointer consumes 8B
> This patch replaces it with index-based implementation,
> where in each buffer is addressed by (pool base address + index)
> It reduces the amount of memory/cache required for per core cache
>
> L3Fwd performance testing reveals minor improvements in the cache
> performance (L1 and L2 misses reduced by 0.60%)
> with no change in throughput
I feel really sceptical about that patch and the whole idea in general:
- From what I read above there is no real performance improvement observed.
(In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
see below for more details).
- Space utilization difference looks negligible too.
- The change introduces a new build time config option with a major limitation:
All memzones in a pool have to be within the same 4GB boundary.
To address it properly, extra changes will be required in init(/populate) part of the code.
All that will complicate mempool code, will make it more error prone
and harder to maintain.
But, as there is no real gain in return, there is no point in adding such extra complexity at all.
Konstantin
CSX 2.1 GHz
==========
echo 'mempool_perf_autotest' | ./dpdk-test -n 4 --lcores='6-13' --no-pci
params : rate_persec
(normal/index-based/diff %)
(with cache)
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 740989337.00/504116019.00/-31.97
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756495155.00/615002931.00/-18.70
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1483499110.00/1007248997.00/-32.10
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1512439807.00/1229927218.00/-18.68
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5933668757.00/4029048421.00/-32.10
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6049234942.00/4921111344.00/-18.65
(with user-owned cache)
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 630600499.00/504312627.00/-20.03
cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756259225.00/615042252.00/-18.67
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1262052966.00/1007039283.00/-20.21
cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1517853081.00/1230818508.00/-18.91
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5054529533.00/4028052273.00/-20.31
cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6059340592.00/4912893129.00/-18.92
>
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
> lib/mempool/rte_mempool.h | 114 +++++++++++++++++++++++++-
> lib/mempool/rte_mempool_ops_default.c | 7 ++
> 2 files changed, 119 insertions(+), 2 deletions(-)
>
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 1e7a3c15273c..4fabd3b1920b 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -50,6 +50,10 @@
> #include <rte_memcpy.h>
> #include <rte_common.h>
>
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#include <rte_vect.h>
> +#endif
> +
> #include "rte_mempool_trace_fp.h"
>
> #ifdef __cplusplus
> @@ -239,6 +243,9 @@ struct rte_mempool {
> int32_t ops_index;
>
> struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + void *pool_base_value; /**< Base value to calculate indices */
> +#endif
>
> uint32_t populated_size; /**< Number of populated objects. */
> struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
> @@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
> if (cache == NULL || cache->len == 0)
> return;
> rte_mempool_trace_cache_flush(cache, mp);
> +
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + unsigned int i;
> + unsigned int cache_len = cache->len;
> + void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> + void *base_value = mp->pool_base_value;
> + uint32_t *cache_objs = (uint32_t *) cache->objs;
> + for (i = 0; i < cache_len; i++)
> + obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> + rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> +#else
> rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> +#endif
> cache->len = 0;
> }
>
> @@ -1334,8 +1353,13 @@ static __rte_always_inline void
> rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> unsigned int n, struct rte_mempool_cache *cache)
> {
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + uint32_t *cache_objs;
> + void *base_value;
> + uint32_t i;
> +#else
> void **cache_objs;
> -
> +#endif
> /* increment stat now, adding in mempool always success */
> RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
> @@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
> goto ring_enqueue;
>
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + cache_objs = (uint32_t *) cache->objs;
> + cache_objs = &cache_objs[cache->len];
> + base_value = mp->pool_base_value;
> +#else
> cache_objs = &cache->objs[cache->len];
> +#endif
>
> /*
> * The cache follows the following algorithm
> @@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
> */
>
> /* Add elements back into the cache */
> +
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#if defined __ARM_NEON
> + uint64x2_t v_obj_table;
> + uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> + uint32x2_t v_cache_objs;
> +
> + for (i = 0; i < (n & ~0x1); i += 2) {
> + v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> + v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
> + vst1_u32(cache_objs + i, v_cache_objs);
> + }
> + if (n & 0x1) {
> + cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> + }
> +#else
> + for (i = 0; i < n; i++) {
> + cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> + }
> +#endif
> +#else
> rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> +#endif
>
> cache->len += n;
>
> if (cache->len >= cache->flushthresh) {
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
> + cache->len - cache->size);
> +#else
> rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> cache->len - cache->size);
> +#endif
> cache->len = cache->size;
> }
>
> @@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> {
> int ret;
> uint32_t index, len;
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + uint32_t i;
> + uint32_t *cache_objs;
> +#else
> void **cache_objs;
> -
> +#endif
> /* No cache provided or cannot be satisfied from cache */
> if (unlikely(cache == NULL || n >= cache->size))
> goto ring_dequeue;
>
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + void *base_value = mp->pool_base_value;
> + cache_objs = (uint32_t *) cache->objs;
> +#else
> cache_objs = cache->objs;
> +#endif
>
> /* Can this be satisfied from the cache? */
> if (cache->len < n) {
> @@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> uint32_t req = n + (cache->size - cache->len);
>
> /* How many do we require i.e. number to fill the cache + the request */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> + ret = rte_mempool_ops_dequeue_bulk(mp,
> + temp_objs, req);
> +#else
> ret = rte_mempool_ops_dequeue_bulk(mp,
> &cache->objs[cache->len], req);
> +#endif
> if (unlikely(ret < 0)) {
> /*
> * In the off chance that we are buffer constrained,
> @@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
> goto ring_dequeue;
> }
>
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + len = cache->len;
> + for (i = 0; i < req; ++i, ++len) {
> + cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
> + base_value);
> + }
> +#endif
> cache->len += req;
> }
>
> /* Now fill in the response ... */
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> +#if defined __ARM_NEON
> + uint64x2_t v_obj_table;
> + uint64x2_t v_cache_objs;
> + uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +
> + for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
> + len -= 4, obj_table += 4) {
> + v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> + v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> + vst1q_u64((uint64_t *)obj_table, v_obj_table);
> + v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
> + v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> + vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
> + }
> + switch (n & 0x3) {
> + case 3:
> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> + /* fallthrough */
> + case 2:
> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> + /* fallthrough */
> + case 1:
> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
> + }
> +#else
> + for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> + *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#endif
> +#else
> for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> *obj_table = cache_objs[len];
> +#endif
>
> cache->len -= n;
>
> diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
> index 22fccf9d7619..3543cad9d4ce 100644
> --- a/lib/mempool/rte_mempool_ops_default.c
> +++ b/lib/mempool/rte_mempool_ops_default.c
> @@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
> obj = va + off;
> obj_cb(mp, obj_cb_arg, obj,
> (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
> + /* Store pool base value to calculate indices for index-based
> + * lcore cache implementation
> + */
> + if (i == 0)
> + mp->pool_base_value = obj;
> +#endif
> rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> off += mp->elt_size + mp->trailer_size;
> }
> --
> 2.25.1
Hi Konstantin,
Thank you for your comments and the test report!
> On Jan 10, 2022, at 8:26 PM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
>
>
>
>
>> Current mempool per core cache implementation stores pointers to mbufs
>> On 64b architectures, each pointer consumes 8B
>> This patch replaces it with index-based implementation,
>> where in each buffer is addressed by (pool base address + index)
>> It reduces the amount of memory/cache required for per core cache
>>
>> L3Fwd performance testing reveals minor improvements in the cache
>> performance (L1 and L2 misses reduced by 0.60%)
>> with no change in throughput
>
> I feel really sceptical about that patch and the whole idea in general:
> - From what I read above there is no real performance improvement observed.
> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> see below for more details).
Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
Similar optimizations can be implemented for x86 platforms which should close the performance gap
and in my understanding should give better performance for a bulk size of 32.
> - Space utilization difference looks neglectable too.
Sorry, I did not understand this point.
> - The change introduces a new build time config option with a major limitation:
> All memzones in a pool have to be within the same 4GB boundary.
> To address it properly, extra changes will be required in init(/populate) part of the code.
I agree with the above-mentioned challenges and I am currently working on resolving these issues.
> All that will complicate mempool code, will make it more error prone
> and harder to maintain.
> But, as there is no real gain in return - no point to add such extra complexity at all.
>
> Konstantin
>
> CSX 2.1 GHz
> ==========
>
> echo 'mempool_perf_autotest' | ./dpdk-test -n 4 --lcores='6-13' --no-pci
>
> params : rate_persec
> (normal/index-based/diff %)
> (with cache)
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 740989337.00/504116019.00/-31.97
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756495155.00/615002931.00/-18.70
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1483499110.00/1007248997.00/-32.10
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1512439807.00/1229927218.00/-18.68
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 5933668757.00/4029048421.00/-32.10
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6049234942.00/4921111344.00/-18.65
>
> (with user-owned cache)
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 630600499.00/504312627.00/-20.03
> cache=512 cores=1 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 756259225.00/615042252.00/-18.67
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=32 : 1262052966.00/1007039283.00/-20.21
> cache=512 cores=2 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 1517853081.00/1230818508.00/-18.91
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=32 :5054529533.00/4028052273.00/-20.31
> cache=512 cores=8 n_get_bulk=32 n_put_bulk=32 n_keep=128 : 6059340592.00/4912893129.00/-18.92
>
>>
>> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
>> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> ---
>> lib/mempool/rte_mempool.h | 114 +++++++++++++++++++++++++-
>> lib/mempool/rte_mempool_ops_default.c | 7 ++
>> 2 files changed, 119 insertions(+), 2 deletions(-)
>>
>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
>> index 1e7a3c15273c..4fabd3b1920b 100644
>> --- a/lib/mempool/rte_mempool.h
>> +++ b/lib/mempool/rte_mempool.h
>> @@ -50,6 +50,10 @@
>> #include <rte_memcpy.h>
>> #include <rte_common.h>
>>
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#include <rte_vect.h>
>> +#endif
>> +
>> #include "rte_mempool_trace_fp.h"
>>
>> #ifdef __cplusplus
>> @@ -239,6 +243,9 @@ struct rte_mempool {
>> int32_t ops_index;
>>
>> struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + void *pool_base_value; /**< Base value to calculate indices */
>> +#endif
>>
>> uint32_t populated_size; /**< Number of populated objects. */
>> struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
>> @@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>> if (cache == NULL || cache->len == 0)
>> return;
>> rte_mempool_trace_cache_flush(cache, mp);
>> +
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + unsigned int i;
>> + unsigned int cache_len = cache->len;
>> + void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
>> + void *base_value = mp->pool_base_value;
>> + uint32_t *cache_objs = (uint32_t *) cache->objs;
>> + for (i = 0; i < cache_len; i++)
>> + obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
>> + rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
>> +#else
>> rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
>> +#endif
>> cache->len = 0;
>> }
>>
>> @@ -1334,8 +1353,13 @@ static __rte_always_inline void
>> rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> unsigned int n, struct rte_mempool_cache *cache)
>> {
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + uint32_t *cache_objs;
>> + void *base_value;
>> + uint32_t i;
>> +#else
>> void **cache_objs;
>> -
>> +#endif
>> /* increment stat now, adding in mempool always success */
>> RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
>> RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
>> @@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
>> goto ring_enqueue;
>>
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + cache_objs = (uint32_t *) cache->objs;
>> + cache_objs = &cache_objs[cache->len];
>> + base_value = mp->pool_base_value;
>> +#else
>> cache_objs = &cache->objs[cache->len];
>> +#endif
>>
>> /*
>> * The cache follows the following algorithm
>> @@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
>> */
>>
>> /* Add elements back into the cache */
>> +
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#if defined __ARM_NEON
>> + uint64x2_t v_obj_table;
>> + uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
>> + uint32x2_t v_cache_objs;
>> +
>> + for (i = 0; i < (n & ~0x1); i += 2) {
>> + v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
>> + v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
>> + vst1_u32(cache_objs + i, v_cache_objs);
>> + }
>> + if (n & 0x1) {
>> + cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
>> + }
>> +#else
>> + for (i = 0; i < n; i++) {
>> + cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
>> + }
>> +#endif
>> +#else
>> rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
>> +#endif
>>
>> cache->len += n;
>>
>> if (cache->len >= cache->flushthresh) {
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
>> + cache->len - cache->size);
>> +#else
>> rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
>> cache->len - cache->size);
>> +#endif
>> cache->len = cache->size;
>> }
>>
>> @@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> {
>> int ret;
>> uint32_t index, len;
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + uint32_t i;
>> + uint32_t *cache_objs;
>> +#else
>> void **cache_objs;
>> -
>> +#endif
>> /* No cache provided or cannot be satisfied from cache */
>> if (unlikely(cache == NULL || n >= cache->size))
>> goto ring_dequeue;
>>
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + void *base_value = mp->pool_base_value;
>> + cache_objs = (uint32_t *) cache->objs;
>> +#else
>> cache_objs = cache->objs;
>> +#endif
>>
>> /* Can this be satisfied from the cache? */
>> if (cache->len < n) {
>> @@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> uint32_t req = n + (cache->size - cache->len);
>>
>> /* How many do we require i.e. number to fill the cache + the request */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>> + ret = rte_mempool_ops_dequeue_bulk(mp,
>> + temp_objs, req);
>> +#else
>> ret = rte_mempool_ops_dequeue_bulk(mp,
>> &cache->objs[cache->len], req);
>> +#endif
>> if (unlikely(ret < 0)) {
>> /*
>> * In the off chance that we are buffer constrained,
>> @@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
>> goto ring_dequeue;
>> }
>>
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + len = cache->len;
>> + for (i = 0; i < req; ++i, ++len) {
>> + cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
>> + base_value);
>> + }
>> +#endif
>> cache->len += req;
>> }
>>
>> /* Now fill in the response ... */
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> +#if defined __ARM_NEON
>> + uint64x2_t v_obj_table;
>> + uint64x2_t v_cache_objs;
>> + uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
>> +
>> + for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
>> + len -= 4, obj_table += 4) {
>> + v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
>> + v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
>> + vst1q_u64((uint64_t *)obj_table, v_obj_table);
>> + v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
>> + v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
>> + vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
>> + }
>> + switch (n & 0x3) {
>> + case 3:
>> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> + /* fallthrough */
>> + case 2:
>> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> + /* fallthrough */
>> + case 1:
>> + *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
>> + }
>> +#else
>> + for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
>> + *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
>> +#endif
>> +#else
>> for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
>> *obj_table = cache_objs[len];
>> +#endif
>>
>> cache->len -= n;
>>
>> diff --git a/lib/mempool/rte_mempool_ops_default.c b/lib/mempool/rte_mempool_ops_default.c
>> index 22fccf9d7619..3543cad9d4ce 100644
>> --- a/lib/mempool/rte_mempool_ops_default.c
>> +++ b/lib/mempool/rte_mempool_ops_default.c
>> @@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
>> obj = va + off;
>> obj_cb(mp, obj_cb_arg, obj,
>> (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>> +#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
>> + /* Store pool base value to calculate indices for index-based
>> + * lcore cache implementation
>> + */
>> + if (i == 0)
>> + mp->pool_base_value = obj;
>> +#endif
>> rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
>> off += mp->elt_size + mp->trailer_size;
>> }
>> --
>> 2.25.1
>
Hi Dharmik,
> >
> >> Current mempool per core cache implementation stores pointers to mbufs
> >> On 64b architectures, each pointer consumes 8B
> >> This patch replaces it with index-based implementation,
> >> where in each buffer is addressed by (pool base address + index)
> >> It reduces the amount of memory/cache required for per core cache
> >>
> >> L3Fwd performance testing reveals minor improvements in the cache
> >> performance (L1 and L2 misses reduced by 0.60%)
> >> with no change in throughput
> >
> > I feel really sceptical about that patch and the whole idea in general:
> > - From what I read above there is no real performance improvement observed.
> > (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> > see below for more details).
>
> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> and in my understanding should give better performance for a bulk size of 32.
Might be, but I still don't see the reason for such effort.
As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
on ARM64 even with vectorized version of the code.
> > - Space utilization difference looks neglectable too.
>
> Sorry, I did not understand this point.
As I understand one of the expectations from that patch was:
reduce memory/cache required, which should improve cache utilization
(less misses, etc.).
Though I think such improvements would be negligible and wouldn't
cause any real performance gain.
> > - The change introduces a new build time config option with a major limitation:
> > All memzones in a pool have to be within the same 4GB boundary.
> > To address it properly, extra changes will be required in init(/populate) part of the code.
>
> I agree to the above mentioned challenges and I am currently working on resolving these issues.
I still think that to justify such changes some really noticeable performance
improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
Otherwise it is just not worth the hassle.
> > All that will complicate mempool code, will make it more error prone
> > and harder to maintain.
> > But, as there is no real gain in return - no point to add such extra complexity at all.
> >
> > Konstantin
> >
Hi Konstantin,
> On Jan 13, 2022, at 4:37 AM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
>
>
> Hi Dharmik,
>
>>>
>>>> Current mempool per core cache implementation stores pointers to mbufs
>>>> On 64b architectures, each pointer consumes 8B
>>>> This patch replaces it with index-based implementation,
>>>> where in each buffer is addressed by (pool base address + index)
>>>> It reduces the amount of memory/cache required for per core cache
>>>>
>>>> L3Fwd performance testing reveals minor improvements in the cache
>>>> performance (L1 and L2 misses reduced by 0.60%)
>>>> with no change in throughput
>>>
>>> I feel really sceptical about that patch and the whole idea in general:
>>> - From what I read above there is no real performance improvement observed.
>>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>>> see below for more details).
>>
>> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
>> Similar optimizations can be implemented for x86 platforms which should close the performance gap
>> and in my understanding should give better performance for a bulk size of 32.
>
> Might be, but I still don't see the reason for such effort.
> As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> on ARM64 even with vectorized version of the code.
>
IMO, even without performance improvement, it is advantageous because the same performance is being achieved
with less memory and cache utilization using the patch.
>>> - Space utilization difference looks neglectable too.
>>
>> Sorry, I did not understand this point.
>
> As I understand one of the expectations from that patch was:
> reduce memory/cache required, which should improve cache utilization
> (less misses, etc.).
> Though I think such improvements would be neglectable and wouldn't
> cause any real performance gain.
The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
>
>>> - The change introduces a new build time config option with a major limitation:
>>> All memzones in a pool have to be within the same 4GB boundary.
>>> To address it properly, extra changes will be required in init(/populate) part of the code.
>>
>> I agree to the above mentioned challenges and I am currently working on resolving these issues.
>
> I still think that to justify such changes some really noticeable performance
> improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
> Otherwise it just not worth the hassle.
>
Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.
>>> All that will complicate mempool code, will make it more error prone
>>> and harder to maintain.
>>> But, as there is no real gain in return - no point to add such extra complexity at all.
>>>
>>> Konstantin
>>>
Hi Dharmik,
> >
> >>>
> >>>> Current mempool per core cache implementation stores pointers to mbufs
> >>>> On 64b architectures, each pointer consumes 8B
> >>>> This patch replaces it with index-based implementation,
> >>>> where in each buffer is addressed by (pool base address + index)
> >>>> It reduces the amount of memory/cache required for per core cache
> >>>>
> >>>> L3Fwd performance testing reveals minor improvements in the cache
> >>>> performance (L1 and L2 misses reduced by 0.60%)
> >>>> with no change in throughput
> >>>
> >>> I feel really sceptical about that patch and the whole idea in general:
> >>> - From what I read above there is no real performance improvement observed.
> >>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> >>> see below for more details).
> >>
> >> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> >> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> >> and in my understanding should give better performance for a bulk size of 32.
> >
> > Might be, but I still don't see the reason for such effort.
> > As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> > on ARM64 even with vectorized version of the code.
> >
>
> IMO, even without performance improvement, it is advantageous because the same performance is being achieved
> with less memory and cache utilization using the patch.
>
> >>> - Space utilization difference looks neglectable too.
> >>
> >> Sorry, I did not understand this point.
> >
> > As I understand one of the expectations from that patch was:
> > reduce memory/cache required, which should improve cache utilization
> > (less misses, etc.).
> > Though I think such improvements would be neglectable and wouldn't
> > cause any real performance gain.
>
> The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
> Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
It saves you just 4 bytes per mbuf.
Even for simple l2fwd-like workload we access ~100 bytes per mbuf.
Let's do a simplistic estimation of the number of affected cache-lines for l2fwd.
For bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:
number of cache-lines accessed
cache with pointers / cache with indexes
mempool_get: (32*8)/64=4 / (32*4)/64=2
RX (read HW desc): (32*16)/64=8 / (32*16)/64=8
RX (write mbuf fields, 1st cache line): (32*64)/64=32 / (32*64)/64=32
update mac addrs: (32*64)/64=32 / (32*64)/64=32
TX (write HW desc): (32*16)/64=8 / (32*16)/64=8
free mbufs (read 2nd mbuf cache line): (32*64)/64=32 / (32*64)/64=32
mempool_put: (32*8)/64=4 / (32*4)/64=2
total: 120 116
So, if my calculations are correct, max estimated gain for cache utilization would be:
(120-116)*100/120=3.33%
Note that numbers are for over-simplistic usage scenario.
In more realistic ones, when we have to touch more cache-lines per packet,
that difference would be even less noticeable.
So I really doubt we will see some noticeable improvements in terms of cache utilization
with that patch.
> >
> >>> - The change introduces a new build time config option with a major limitation:
> >>> All memzones in a pool have to be within the same 4GB boundary.
> >>> To address it properly, extra changes will be required in init(/populate) part of the code.
> >>
> >> I agree to the above mentioned challenges and I am currently working on resolving these issues.
> >
> > I still think that to justify such changes some really noticeable performance
> > improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
> > Otherwise it just not worth the hassle.
> >
>
> Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
> That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
As I said above, I don’t think we'll see any real advantage here.
But feel free to pick-up different app and prove me wrong.
After all we have plenty of sample apps that do provide enough
pressure on the cache: l3fwd-acl, ipsec-secgw.
Or you can even apply these patches from Sean:
https://patches.dpdk.org/project/dpdk/list/?series=20999
to run l3fwd with configurable routes.
That should help you to make it cache-bound.
> I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.
And for others we are seeing double-digit degradation.
So far the whole idea doesn't look promising at all, at least to me.
Konstantin
>
>
> Hi Dharmik,
> > >
> > >>>
> > >>>> Current mempool per core cache implementation stores pointers to mbufs
> > >>>> On 64b architectures, each pointer consumes 8B
> > >>>> This patch replaces it with index-based implementation,
> > >>>> where in each buffer is addressed by (pool base address + index)
> > >>>> It reduces the amount of memory/cache required for per core cache
> > >>>>
> > >>>> L3Fwd performance testing reveals minor improvements in the cache
> > >>>> performance (L1 and L2 misses reduced by 0.60%)
> > >>>> with no change in throughput
> > >>>
> > >>> I feel really sceptical about that patch and the whole idea in general:
> > >>> - From what I read above there is no real performance improvement observed.
> > >>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
> > >>> see below for more details).
> > >>
> > >> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
> > >> Similar optimizations can be implemented for x86 platforms which should close the performance gap
> > >> and in my understanding should give better performance for a bulk size of 32.
> > >
> > > Might be, but I still don't see the reason for such effort.
> > > As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
> > > on ARM64 even with vectorized version of the code.
> > >
> >
> > IMO, even without performance improvement, it is advantageous because the same performance is being achieved
> > with less memory and cache utilization using the patch.
> >
> > >>> - Space utilization difference looks neglectable too.
> > >>
> > >> Sorry, I did not understand this point.
> > >
> > > As I understand one of the expectations from that patch was:
> > > reduce memory/cache required, which should improve cache utilization
> > > (less misses, etc.).
> > > Though I think such improvements would be neglectable and wouldn't
> > > cause any real performance gain.
> >
> > The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
> > Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
>
> It saves you just 4 bytes per mbuf.
> Even for simple l2fwd-like workload we access ~100 bytes per mbuf.
> Let's do a simplistic estimation of number of affected cache-lines l for l2fwd.
> For bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:
>
> number of cache-lines accessed
> cache with pointers / cache with indexes
> mempool_get: (32*8)/64=4 / (32*4)/64=2
> RX (read HW desc): (32*16)/64=8 / (32*16)/64=8
> RX (write mbuf fields, 1st cache line): (32*64)/64=3 / (32*64)/64=32
Should be:
RX (write mbuf fields, 1st cache line): (32*64)/64=32 / (32*64)/64=32
of course
> update mac addrs: (32*64)/64=32 / (32*64)/64=32
> TX (write HW desc): (32*16)/64=8 / (32*16)/64=8
> free mbufs (read 2nd mbuf cache line): (32*64)/64=32 / (32*64)/64=32
> mempool_put: (32*8)/64=4 / (32*4)/64=2
> total: 120 116
>
> So, if my calculations are correct, max estimated gain for cache utilization would be:
> (120-116)*100/120=3.33%
> Note that numbers are for over-simplistic usage scenario.
> In more realistic ones, when we have to touch more cache-lines per packet,
> that difference would be even less noticeable.
> So I really doubt we will see some noticeable improvements in terms of cache utilization
> with that patch.
>
> > >
> > >>> - The change introduces a new build time config option with a major limitation:
> > >>> All memzones in a pool have to be within the same 4GB boundary.
> > >>> To address it properly, extra changes will be required in init(/populate) part of the code.
> > >>
> > >> I agree to the above mentioned challenges and I am currently working on resolving these issues.
> > >
> > > I still think that to justify such changes some really noticeable performance
> > > improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
> > > Otherwise it just not worth the hassle.
> > >
> >
> > Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
> > That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
>
> As I said above, I don’t think we'll see any real advantage here.
> But feel free to pick-up different app and prove me wrong.
> After all we have plenty of sample apps that do provide enough
> pressure on the cache: l3fwd-acl, ipsec-secgw.
> Or you can even apply these patches from Sean:
> https://patches.dpdk.org/project/dpdk/list/?series=20999
> to run l3fwd with configurable routes.
> That should help you to make it cache-bound.
>
> > I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.
>
> And for other we are seeing double digit degradation.
> So far the whole idea doesn't look promising at all, at least to me.
> Konstantin
Hi,
Thank you for the comments!
Based on the suggestions, I tested the patch for single core L3Fwd performance with increased number of routes/flows (maximum 8K) to increase cache footprint.
However, I don’t see much improvement with the patch.
> On Jan 21, 2022, at 5:25 AM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
>
>
>
> Hi Dharmik,
>>>
>>>>>
>>>>>> Current mempool per core cache implementation stores pointers to mbufs
>>>>>> On 64b architectures, each pointer consumes 8B
>>>>>> This patch replaces it with index-based implementation,
>>>>>> where in each buffer is addressed by (pool base address + index)
>>>>>> It reduces the amount of memory/cache required for per core cache
>>>>>>
>>>>>> L3Fwd performance testing reveals minor improvements in the cache
>>>>>> performance (L1 and L2 misses reduced by 0.60%)
>>>>>> with no change in throughput
>>>>>
>>>>> I feel really sceptical about that patch and the whole idea in general:
>>>>> - From what I read above there is no real performance improvement observed.
>>>>> (In fact on my IA boxes mempool_perf_autotest reports ~20% slowdown,
>>>>> see below for more details).
>>>>
>>>> Currently, the optimizations (loop unroll and vectorization) are only implemented for ARM64.
>>>> Similar optimizations can be implemented for x86 platforms which should close the performance gap
>>>> and in my understanding should give better performance for a bulk size of 32.
>>>
>>> Might be, but I still don't see the reason for such effort.
>>> As you mentioned there is no performance improvement in 'real' apps: l3fwd, etc.
>>> on ARM64 even with vectorized version of the code.
>>>
>>
>> IMO, even without performance improvement, it is advantageous because the same performance is being achieved
>> with less memory and cache utilization using the patch.
>>
>>>>> - Space utilization difference looks neglectable too.
>>>>
>>>> Sorry, I did not understand this point.
>>>
>>> As I understand one of the expectations from that patch was:
>>> reduce memory/cache required, which should improve cache utilization
>>> (less misses, etc.).
>>> Though I think such improvements would be neglectable and wouldn't
>>> cause any real performance gain.
>>
>> The cache utilization performance numbers are for the l3fwd app, which might not be bottlenecked at the mempool per core cache.
>> Theoretically, this patch enables storing twice the number of objects in the cache as compared to the original implementation.
>
> It saves you just 4 bytes per mbuf.
> Even for simple l2fwd-like workload we access ~100 bytes per mbuf.
> Let's do a simplistic estimation of number of affected cache-lines l for l2fwd.
> For bulk of 32 packets, assuming 64B per cache-line and 16B per HW desc:
>
> number of cache-lines accessed
> cache with pointers / cache with indexes
> mempool_get: (32*8)/64=4 / (32*4)/64=2
> RX (read HW desc): (32*16)/64=8 / (32*16)/64=8
> RX (write mbuf fields, 1st cache line): (32*64)/64=32 / (32*64)/64=32
> update mac addrs: (32*64)/64=32 / (32*64)/64=32
> TX (write HW desc): (32*16)/64=8 / (32*16)/64=8
> free mbufs (read 2nd mbuf cache line): (32*64)/64=32 / (32*64)/64=32
> mempool_put: (32*8)/64=4 / (32*4)/64=2
> total: 120 116
>
> So, if my calculations are correct, max estimated gain for cache utilization would be:
> (120-116)*100/120=3.33%
> Note that numbers are for over-simplistic usage scenario.
> In more realistic ones, when we have to touch more cache-lines per packet,
> that difference would be even less noticeable.
> So I really doubt we will see some noticeable improvements in terms of cache utilization
> with that patch.
>
>>>
>>>>> - The change introduces a new build time config option with a major limitation:
>>>>> All memzones in a pool have to be within the same 4GB boundary.
>>>>> To address it properly, extra changes will be required in init(/populate) part of the code.
>>>>
>>>> I agree to the above mentioned challenges and I am currently working on resolving these issues.
>>>
>>> I still think that to justify such changes some really noticeable performance
>>> improvement needs to be demonstrated: double-digit speedup for l3fwd/ipsec-secgw/...
>>> Otherwise it just not worth the hassle.
>>>
>>
>> Like I mentioned earlier, the app might not be bottlenecked at the mempool per core cache.
>> That could be the reason the numbers with l3fwd don’t fully show the advantage of the patch.
>
> As I said above, I don’t think we'll see any real advantage here.
> But feel free to pick-up different app and prove me wrong.
> After all we have plenty of sample apps that do provide enough
> pressure on the cache: l3fwd-acl, ipsec-secgw.
> Or you can even apply these patches from Sean:
> https://patches.dpdk.org/project/dpdk/list/?series=20999
> to run l3fwd with configurable routes.
> That should help you to make it cache-bound.
>
Thank you, Konstantin! This patch was helpful.
>> I’m seeing double-digit improvement with mempool_perf_autotest which should not be ignored.
>
> And for other we are seeing double digit degradation.
> So far the whole idea doesn't look promising at all, at least to me.
> Konstantin
>
@@ -50,6 +50,10 @@
#include <rte_memcpy.h>
#include <rte_common.h>
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#include <rte_vect.h>
+#endif
+
#include "rte_mempool_trace_fp.h"
#ifdef __cplusplus
@@ -239,6 +243,9 @@ struct rte_mempool {
int32_t ops_index;
struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ void *pool_base_value; /**< Base value to calculate indices */
+#endif
uint32_t populated_size; /**< Number of populated objects. */
struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1314,7 +1321,19 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
if (cache == NULL || cache->len == 0)
return;
rte_mempool_trace_cache_flush(cache, mp);
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ unsigned int i;
+ unsigned int cache_len = cache->len;
+ void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+ void *base_value = mp->pool_base_value;
+ uint32_t *cache_objs = (uint32_t *) cache->objs;
+ for (i = 0; i < cache_len; i++)
+ obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+ rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
+#else
rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+#endif
cache->len = 0;
}
@@ -1334,8 +1353,13 @@ static __rte_always_inline void
rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
unsigned int n, struct rte_mempool_cache *cache)
{
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ uint32_t *cache_objs;
+ void *base_value;
+ uint32_t i;
+#else
void **cache_objs;
-
+#endif
/* increment stat now, adding in mempool always success */
RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);
@@ -1344,7 +1368,13 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
if (unlikely(cache == NULL || n > RTE_MEMPOOL_CACHE_MAX_SIZE))
goto ring_enqueue;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ cache_objs = (uint32_t *) cache->objs;
+ cache_objs = &cache_objs[cache->len];
+ base_value = mp->pool_base_value;
+#else
cache_objs = &cache->objs[cache->len];
+#endif
/*
* The cache follows the following algorithm
@@ -1354,13 +1384,40 @@ rte_mempool_do_generic_put(struct rte_mempool *mp, void * const *obj_table,
*/
/* Add elements back into the cache */
+
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+ uint64x2_t v_obj_table;
+ uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+ uint32x2_t v_cache_objs;
+
+ for (i = 0; i < (n & ~0x1); i += 2) {
+ v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+ v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+ vst1_u32(cache_objs + i, v_cache_objs);
+ }
+ if (n & 0x1) {
+ cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+ }
+#else
+ for (i = 0; i < n; i++) {
+ cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+ }
+#endif
+#else
rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+#endif
cache->len += n;
if (cache->len >= cache->flushthresh) {
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
+ cache->len - cache->size);
+#else
rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
cache->len - cache->size);
+#endif
cache->len = cache->size;
}
@@ -1461,13 +1518,22 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
{
int ret;
uint32_t index, len;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ uint32_t i;
+ uint32_t *cache_objs;
+#else
void **cache_objs;
-
+#endif
/* No cache provided or cannot be satisfied from cache */
if (unlikely(cache == NULL || n >= cache->size))
goto ring_dequeue;
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ void *base_value = mp->pool_base_value;
+ cache_objs = (uint32_t *) cache->objs;
+#else
cache_objs = cache->objs;
+#endif
/* Can this be satisfied from the cache? */
if (cache->len < n) {
@@ -1475,8 +1541,14 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
uint32_t req = n + (cache->size - cache->len);
/* How many do we require i.e. number to fill the cache + the request */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+ ret = rte_mempool_ops_dequeue_bulk(mp,
+ temp_objs, req);
+#else
ret = rte_mempool_ops_dequeue_bulk(mp,
&cache->objs[cache->len], req);
+#endif
if (unlikely(ret < 0)) {
/*
* In the off chance that we are buffer constrained,
@@ -1487,12 +1559,50 @@ rte_mempool_do_generic_get(struct rte_mempool *mp, void **obj_table,
goto ring_dequeue;
}
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ len = cache->len;
+ for (i = 0; i < req; ++i, ++len) {
+ cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
+ base_value);
+ }
+#endif
cache->len += req;
}
/* Now fill in the response ... */
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+#if defined __ARM_NEON
+ uint64x2_t v_obj_table;
+ uint64x2_t v_cache_objs;
+ uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
+ for (index = 0, len = cache->len - 1; index < (n & ~0x3); index += 4,
+ len -= 4, obj_table += 4) {
+ v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+ v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+ vst1q_u64((uint64_t *)obj_table, v_obj_table);
+ v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 3));
+ v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+ vst1q_u64((uint64_t *)(obj_table + 2), v_obj_table);
+ }
+ switch (n & 0x3) {
+ case 3:
+ *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+ /* fallthrough */
+ case 2:
+ *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+ /* fallthrough */
+ case 1:
+ *(obj_table++) = (void *) RTE_PTR_ADD(base_value, cache_objs[len--]);
+ }
+#else
+ for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
+ *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
+#else
for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
*obj_table = cache_objs[len];
+#endif
cache->len -= n;
@@ -127,6 +127,13 @@ rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
obj = va + off;
obj_cb(mp, obj_cb_arg, obj,
(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+#ifdef RTE_MEMPOOL_INDEX_BASED_LCORE_CACHE
+ /* Store pool base value to calculate indices for index-based
+ * lcore cache implementation
+ */
+ if (i == 0)
+ mp->pool_base_value = obj;
+#endif
rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
off += mp->elt_size + mp->trailer_size;
}