
[RFC] mempool: implement index-based per core cache

Message ID 20210930172735.2675627-1-dharmik.thakkar@arm.com (mailing list archive)
State New
Delegated to: Thomas Monjalon
Series [RFC] mempool: implement index-based per core cache

Checks

Context                Check      Description
ci/Intel-compilation   warning    apply issues
ci/checkpatch          warning    coding style issues

Commit Message

Dharmik Thakkar Sept. 30, 2021, 5:27 p.m. UTC
The current mempool per-core cache implementation stores pointers.
On most architectures, each pointer consumes 64 bits.
Replace it with an index-based implementation, wherein each buffer
is addressed by (pool base address + index).
This reduces the cache's memory requirements.

L3Fwd performance testing shows minor improvements in cache
performance and no change in throughput.

Micro-benchmarking the patch using mempool_perf_test shows
significant improvement in the majority of the test cases.

A future plan is to replace the global pool's pointer-based
implementation with an index-based implementation as well.

Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
---
 drivers/mempool/ring/rte_mempool_ring.c |  2 +-
 lib/mempool/rte_mempool.c               |  8 +++
 lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
 3 files changed, 74 insertions(+), 10 deletions(-)
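
To make the addressing scheme concrete, here is a minimal sketch of the idea
(illustrative names and layout, not the patch's actual structures): each cached
object is stored as a 32-bit offset from a per-pool base address and turned
back into a pointer on get.

#include <stdint.h>

/*
 * Minimal sketch of index-based caching, assuming every object lives
 * within 4 GiB of pool_base. Names are illustrative only.
 */
struct idx_cache {
	void *pool_base;     /* base address all offsets are relative to */
	uint32_t len;        /* number of cached entries */
	uint32_t objs[512];  /* 32-bit offsets instead of 64-bit pointers */
};

static inline void
idx_cache_put(struct idx_cache *c, void *obj)
{
	/* store the object as an offset from the pool base */
	c->objs[c->len++] = (uint32_t)((uintptr_t)obj - (uintptr_t)c->pool_base);
}

static inline void *
idx_cache_get(struct idx_cache *c)
{
	/* rebuild the full pointer: base + 32-bit offset */
	return (char *)c->pool_base + c->objs[--c->len];
}

With 4-byte entries, the objs[] array is half the size of the pointer-based
version, which is where the memory saving comes from.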

Comments

Jerin Jacob Oct. 1, 2021, 12:36 p.m. UTC | #1
On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
<dharmik.thakkar@arm.com> wrote:
>
> Current mempool per core cache implementation is based on pointer
> For most architectures, each pointer consumes 64b
> Replace it with index-based implementation, where in each buffer
> is addressed by (pool address + index)
> It will reduce memory requirements
>
> L3Fwd performance testing reveals minor improvements in the cache
> performance and no change in throughput
>
> Micro-benchmarking the patch using mempool_perf_test shows
> significant improvement with majority of the test cases
>
> Future plan involves replacing global pool's pointer-based implementation with index-based implementation
>
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>


Sane idea. Like VPP, we tried to do this for rte_graph, but did not
observe much gain.
Since the lcore cache is typically 512 entries, maybe there is a gain on
the mempool path.
Also, since you are enabling this only for the local cache, it is good:
mempool drivers can work as-is (i.e. HW drivers keep working with 64-bit
pointers).
I think getting more performance numbers for various cases may be the
next step.
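
As a rough sense of the memory saving behind that point (assuming
RTE_MEMPOOL_CACHE_MAX_SIZE = 512 and RTE_MAX_LCORE = 128; actual values
depend on the build configuration):

/*
 * The objs[] array holds RTE_MEMPOOL_CACHE_MAX_SIZE * 3 = 1536 entries:
 *
 *   pointer-based objs[]: 1536 * 8 B = 12288 B per lcore cache
 *   index-based  objs[]:  1536 * 4 B =  6144 B per lcore cache
 *
 *   across all lcore caches: 128 * 6 KiB = 768 KiB saved per mempool
 */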

> ---
>  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
>  lib/mempool/rte_mempool.c               |  8 +++
>  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
>  3 files changed, 74 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
> index b1f09ff28f4d..e55913e47f21 100644
> --- a/drivers/mempool/ring/rte_mempool_ring.c
> +++ b/drivers/mempool/ring/rte_mempool_ring.c
> @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
>                 return -rte_errno;
>
>         mp->pool_data = r;
> -
> +       mp->local_cache_base_addr = &r[1];
>         return 0;
>  }
>
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 59a588425bd6..424bdb19c323 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>         int ret;
>         bool need_iova_contig_obj;
>         size_t max_alloc_size = SIZE_MAX;
> +       unsigned lcore_id;
>
>         ret = mempool_ops_alloc_once(mp);
>         if (ret != 0)
> @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>                 }
>         }
>
> +       /* Init all default caches. */
> +       if (mp->cache_size != 0) {
> +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> +                       mp->local_cache[lcore_id].local_cache_base_value =
> +                               *(void **)mp->local_cache_base_addr;
> +       }
> +
>         rte_mempool_trace_populate_default(mp);
>         return mp->size;
>
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 4235d6f0bf2b..545405c0d3ce 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -51,6 +51,8 @@
>  #include <rte_memcpy.h>
>  #include <rte_common.h>
>
> +#include <arm_neon.h>
> +
>  #include "rte_mempool_trace_fp.h"
>
>  #ifdef __cplusplus
> @@ -91,11 +93,12 @@ struct rte_mempool_cache {
>         uint32_t size;        /**< Size of the cache */
>         uint32_t flushthresh; /**< Threshold before we flush excess elements */
>         uint32_t len;         /**< Current cache count */
> +       void *local_cache_base_value; /**< Base value to calculate indices */
>         /*
>          * Cache is allocated to this size to allow it to overflow in certain
>          * cases to avoid needless emptying of cache.
>          */
> -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>  } __rte_cache_aligned;
>
>  /**
> @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
>   * A list of memory where objects are stored
>   */
>  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> -
>  /**
>   * Callback used to free a memory chunk
>   */
> @@ -244,6 +246,7 @@ struct rte_mempool {
>         int32_t ops_index;
>
>         struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> +       void *local_cache_base_addr; /**< Reference to the base value */
>
>         uint32_t populated_size;         /**< Number of populated objects. */
>         struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
> @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>         if (cache == NULL || cache->len == 0)
>                 return;
>         rte_mempool_trace_cache_flush(cache, mp);
> -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> +
> +       unsigned int i;
> +       unsigned int cache_len = cache->len;
> +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> +       void *base_value = cache->local_cache_base_value;
> +       uint32_t *cache_objs = cache->objs;
> +       for (i = 0; i < cache_len; i++)
> +               obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
>         cache->len = 0;
>  }
>
> @@ -1289,7 +1300,9 @@ static __rte_always_inline void
>  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>                       unsigned int n, struct rte_mempool_cache *cache)
>  {
> -       void **cache_objs;
> +       uint32_t *cache_objs;
> +       void *base_value;
> +       uint32_t i;
>
>         /* increment stat now, adding in mempool always success */
>         __MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> @@ -1301,6 +1314,12 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>
>         cache_objs = &cache->objs[cache->len];
>
> +       base_value = cache->local_cache_base_value;
> +
> +       uint64x2_t v_obj_table;
> +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +       uint32x2_t v_cache_objs;
> +
>         /*
>          * The cache follows the following algorithm
>          *   1. Add the objects to the cache
> @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>          */
>
>         /* Add elements back into the cache */
> -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> +
> +#if defined __ARM_NEON
> +       for (i = 0; i < (n & ~0x1); i+=2) {
> +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
> +               vst1_u32(cache_objs + i, v_cache_objs);
> +       }
> +       if (n & 0x1) {
> +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +       }
> +#else
> +       for (i = 0; i < n; i++) {
> +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +       }
> +#endif
>
>         cache->len += n;
>
>         if (cache->len >= cache->flushthresh) {
> -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> +               rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
>                                 cache->len - cache->size);
>                 cache->len = cache->size;
>         }
> @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>                       unsigned int n, struct rte_mempool_cache *cache)
>  {
>         int ret;
> +       uint32_t i;
>         uint32_t index, len;
> -       void **cache_objs;
> +       uint32_t *cache_objs;
>
>         /* No cache provided or cannot be satisfied from cache */
>         if (unlikely(cache == NULL || n >= cache->size))
>                 goto ring_dequeue;
>
> +       void *base_value = cache->local_cache_base_value;
>         cache_objs = cache->objs;
>
>         /* Can this be satisfied from the cache? */
>         if (cache->len < n) {
>                 /* No. Backfill the cache first, and then fill from it */
>                 uint32_t req = n + (cache->size - cache->len);
> +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>
>                 /* How many do we require i.e. number to fill the cache + the request */
>                 ret = rte_mempool_ops_dequeue_bulk(mp,
> -                       &cache->objs[cache->len], req);
> +                       temp_objs, req);
>                 if (unlikely(ret < 0)) {
>                         /*
>                          * In the off chance that we are buffer constrained,
> @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>                         goto ring_dequeue;
>                 }
>
> +               len = cache->len;
> +               for (i = 0; i < req; ++i, ++len) {
> +                       cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
> +               }
> +
>                 cache->len += req;
>         }
>
> +       uint64x2_t v_obj_table;
> +       uint64x2_t v_cache_objs;
> +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +
>         /* Now fill in the response ... */
> +#if defined __ARM_NEON
> +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> +                                               len-=2, obj_table+=2) {
> +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> +       }
> +       if (n & 0x1)
> +               *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#else
>         for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> -               *obj_table = cache_objs[len];
> +               *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#endif
>
>         cache->len -= n;
>
> --
> 2.17.1
>
Honnappa Nagarahalli Oct. 1, 2021, 3:44 p.m. UTC | #2
<snip>

> 
> On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> <dharmik.thakkar@arm.com> wrote:
> >
> > Current mempool per core cache implementation is based on pointer For
> > most architectures, each pointer consumes 64b Replace it with
> > index-based implementation, where in each buffer is addressed by (pool
> > address + index) It will reduce memory requirements
> >
> > L3Fwd performance testing reveals minor improvements in the cache
> > performance and no change in throughput
> >
> > Micro-benchmarking the patch using mempool_perf_test shows significant
> > improvement with majority of the test cases
> >
> > Future plan involves replacing global pool's pointer-based
> > implementation with index-based implementation
> >
> > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> 
> 
> Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> gain.
> Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> Also, Since you are enabling only for local cache, it is good as mempool
> drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> performance numbers for various cases may be the next step.
The gain is not observed in terms of PPS improvement, but we do see some improvements that the PMUs indicate. This approach definitely results in savings in the number of cache lines utilized.
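
A back-of-the-envelope view of that cache-line saving (assuming 64-byte
cache lines and a typical burst of 32 objects):

/*
 *   32 pointers: 32 * 8 B = 256 B -> 4 cache lines of the objs[] array
 *   32 indices:  32 * 4 B = 128 B -> 2 cache lines of the objs[] array
 *
 * i.e. each get/put burst touches half as many lines of the per-core cache.
 */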

<snip>
Jerin Jacob Oct. 1, 2021, 5:32 p.m. UTC | #3
On Fri, Oct 1, 2021 at 9:14 PM Honnappa Nagarahalli
<Honnappa.Nagarahalli@arm.com> wrote:
>
> <snip>
>
> >
> > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > <dharmik.thakkar@arm.com> wrote:
> > >
> > > Current mempool per core cache implementation is based on pointer For
> > > most architectures, each pointer consumes 64b Replace it with
> > > index-based implementation, where in each buffer is addressed by (pool
> > > address + index) It will reduce memory requirements
> > >
> > > L3Fwd performance testing reveals minor improvements in the cache
> > > performance and no change in throughput
> > >
> > > Micro-benchmarking the patch using mempool_perf_test shows significant
> > > improvement with majority of the test cases
> > >
> > > Future plan involves replacing global pool's pointer-based
> > > implementation with index-based implementation
> > >
> > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> >
> >
> > Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> > gain.
> > Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> > Also, Since you are enabling only for local cache, it is good as mempool
> > drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> > performance numbers for various cases may be the next step.
> The gain is not observed in terms of PPS improvement, but do see some improvements that PMUs indicate. This approach definitely results in savings in number of cache lines utilized.

OK. IMO, if PPS has regressed then this path is not viable; otherwise it may be OK.


<snip>
Honnappa Nagarahalli Oct. 1, 2021, 5:57 p.m. UTC | #4
<snip>
> >
> > >
> > > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > > <dharmik.thakkar@arm.com> wrote:
> > > >
> > > > Current mempool per core cache implementation is based on pointer
> > > > For most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by
> > > > (pool address + index) It will reduce memory requirements
> > > >
> > > > L3Fwd performance testing reveals minor improvements in the cache
> > > > performance and no change in throughput
> > > >
> > > > Micro-benchmarking the patch using mempool_perf_test shows
> > > > significant improvement with majority of the test cases
> > > >
> > > > Future plan involves replacing global pool's pointer-based
> > > > implementation with index-based implementation
> > > >
> > > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > >
> > >
> > > Sane idea. Like VPP, we tried to do this for rte_graph, but not
> > > observed much gain.
> > > Since lcore cache is typically 512, maybe there is a gain on the mempool
> path.
> > > Also, Since you are enabling only for local cache, it is good as
> > > mempool drivers can work as-is.(i.e HW drivers works with 64bit) I
> > > think, getting more performance numbers for various cases may be the
> next step.
> > The gain is not observed in terms of PPS improvement, but do see some
> improvements that PMUs indicate. This approach definitely results in savings
> in number of cache lines utilized.
> 
> OK. IMO, If PPS has regression then this path is not viable, else it may be OK.
PPS has not regressed. It has improved, but not significantly.
Another way to look at this is that we are doing the same work with fewer resources.

<snip>
Jerin Jacob Oct. 1, 2021, 6:21 p.m. UTC | #5
On Fri, Oct 1, 2021 at 11:02 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
>
> On Fri, Oct 1, 2021 at 9:14 PM Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com> wrote:
> >
> > <snip>
> >
> > >
> > > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > > <dharmik.thakkar@arm.com> wrote:
> > > >
> > > > Current mempool per core cache implementation is based on pointer For
> > > > most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by (pool
> > > > address + index) It will reduce memory requirements
> > > >
> > > > L3Fwd performance testing reveals minor improvements in the cache
> > > > performance and no change in throughput
> > > >
> > > > Micro-benchmarking the patch using mempool_perf_test shows significant
> > > > improvement with majority of the test cases
> > > >
> > > > Future plan involves replacing global pool's pointer-based
> > > > implementation with index-based implementation
> > > >
> > > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > >
> > >
> > > Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> > > gain.
> > > Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> > > Also, Since you are enabling only for local cache, it is good as mempool
> > > drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> > > performance numbers for various cases may be the next step.
> > The gain is not observed in terms of PPS improvement, but do see some improvements that PMUs indicate. This approach definitely results in savings in number of cache lines utilized.
>
> OK. IMO, If PPS has regression then this path is not viable, else it may be OK.

Looks good then.

<snip>
Ananyev, Konstantin Oct. 1, 2021, 9:30 p.m. UTC | #6
> Current mempool per core cache implementation is based on pointer
> For most architectures, each pointer consumes 64b
> Replace it with index-based implementation, where in each buffer
> is addressed by (pool address + index)

I don't think it is going to work:
On 64-bit systems, the difference between the pool address and an element's
address could be bigger than 4GB.
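
To illustrate the concern, here is a minimal stand-alone sketch (the addresses are
made up) of how a 32-bit offset silently wraps once an element sits more than 4 GB
away from the pool base:

#include <inttypes.h>
#include <stdio.h>

/* Hypothetical pool base and an element 5 GB above it: the difference
 * no longer fits in 32 bits, so the address rebuilt from the truncated
 * offset does not match the original element.
 */
int main(void)
{
	uint64_t base = 0x100000000ULL;          /* assumed pool base address */
	uint64_t elem = base + (5ULL << 30);     /* element 5 GB above base   */

	uint32_t off  = (uint32_t)(elem - base); /* offset truncated to 32b   */
	uint64_t back = base + off;              /* address rebuilt from it   */

	printf("elem=0x%" PRIx64 " rebuilt=0x%" PRIx64 " -> %s\n",
	       elem, back, elem == back ? "match" : "mismatch");
	return 0;
}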
 
> It will reduce memory requirements
> 
> L3Fwd performance testing reveals minor improvements in the cache
> performance and no change in throughput
> 
> Micro-benchmarking the patch using mempool_perf_test shows
> significant improvement with majority of the test cases
> 
> Future plan involves replacing global pool's pointer-based implementation with index-based implementation
> 
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> ---
>  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
>  lib/mempool/rte_mempool.c               |  8 +++
>  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
>  3 files changed, 74 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
> index b1f09ff28f4d..e55913e47f21 100644
> --- a/drivers/mempool/ring/rte_mempool_ring.c
> +++ b/drivers/mempool/ring/rte_mempool_ring.c
> @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
>  		return -rte_errno;
> 
>  	mp->pool_data = r;
> -
> +	mp->local_cache_base_addr = &r[1];
>  	return 0;
>  }
> 
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 59a588425bd6..424bdb19c323 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>  	int ret;
>  	bool need_iova_contig_obj;
>  	size_t max_alloc_size = SIZE_MAX;
> +	unsigned lcore_id;
> 
>  	ret = mempool_ops_alloc_once(mp);
>  	if (ret != 0)
> @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>  		}
>  	}
> 
> +	/* Init all default caches. */
> +	if (mp->cache_size != 0) {
> +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> +			mp->local_cache[lcore_id].local_cache_base_value =
> +				*(void **)mp->local_cache_base_addr;
> +	}
> +
>  	rte_mempool_trace_populate_default(mp);
>  	return mp->size;
> 
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 4235d6f0bf2b..545405c0d3ce 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -51,6 +51,8 @@
>  #include <rte_memcpy.h>
>  #include <rte_common.h>
> 
> +#include <arm_neon.h>
> +
>  #include "rte_mempool_trace_fp.h"
> 
>  #ifdef __cplusplus
> @@ -91,11 +93,12 @@ struct rte_mempool_cache {
>  	uint32_t size;	      /**< Size of the cache */
>  	uint32_t flushthresh; /**< Threshold before we flush excess elements */
>  	uint32_t len;	      /**< Current cache count */
> +	void *local_cache_base_value; /**< Base value to calculate indices */
>  	/*
>  	 * Cache is allocated to this size to allow it to overflow in certain
>  	 * cases to avoid needless emptying of cache.
>  	 */
> -	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> +	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>  } __rte_cache_aligned;
> 
>  /**
> @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
>   * A list of memory where objects are stored
>   */
>  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> -
>  /**
>   * Callback used to free a memory chunk
>   */
> @@ -244,6 +246,7 @@ struct rte_mempool {
>  	int32_t ops_index;
> 
>  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> +	void *local_cache_base_addr; /**< Reference to the base value */
> 
>  	uint32_t populated_size;         /**< Number of populated objects. */
>  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
> @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>  	if (cache == NULL || cache->len == 0)
>  		return;
>  	rte_mempool_trace_cache_flush(cache, mp);
> -	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> +
> +	unsigned int i;
> +	unsigned int cache_len = cache->len;
> +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> +	void *base_value = cache->local_cache_base_value;
> +	uint32_t *cache_objs = cache->objs;
> +	for (i = 0; i < cache_len; i++)
> +		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> +	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
>  	cache->len = 0;
>  }
> 
> @@ -1289,7 +1300,9 @@ static __rte_always_inline void
>  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  		      unsigned int n, struct rte_mempool_cache *cache)
>  {
> -	void **cache_objs;
> +	uint32_t *cache_objs;
> +	void *base_value;
> +	uint32_t i;
> 
>  	/* increment stat now, adding in mempool always success */
>  	__MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> @@ -1301,6 +1314,12 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> 
>  	cache_objs = &cache->objs[cache->len];
> 
> +	base_value = cache->local_cache_base_value;
> +
> +	uint64x2_t v_obj_table;
> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +	uint32x2_t v_cache_objs;
> +
>  	/*
>  	 * The cache follows the following algorithm
>  	 *   1. Add the objects to the cache
> @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  	 */
> 
>  	/* Add elements back into the cache */
> -	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> +
> +#if defined __ARM_NEON
> +	for (i = 0; i < (n & ~0x1); i+=2) {
> +		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> +		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
> +		vst1_u32(cache_objs + i, v_cache_objs);
> +	}
> +	if (n & 0x1) {
> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +	}
> +#else
> +	for (i = 0; i < n; i++) {
> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +	}
> +#endif
> 
>  	cache->len += n;
> 
>  	if (cache->len >= cache->flushthresh) {
> -		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> +		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
>  				cache->len - cache->size);
>  		cache->len = cache->size;
>  	}
> @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>  		      unsigned int n, struct rte_mempool_cache *cache)
>  {
>  	int ret;
> +	uint32_t i;
>  	uint32_t index, len;
> -	void **cache_objs;
> +	uint32_t *cache_objs;
> 
>  	/* No cache provided or cannot be satisfied from cache */
>  	if (unlikely(cache == NULL || n >= cache->size))
>  		goto ring_dequeue;
> 
> +	void *base_value = cache->local_cache_base_value;
>  	cache_objs = cache->objs;
> 
>  	/* Can this be satisfied from the cache? */
>  	if (cache->len < n) {
>  		/* No. Backfill the cache first, and then fill from it */
>  		uint32_t req = n + (cache->size - cache->len);
> +		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> 
>  		/* How many do we require i.e. number to fill the cache + the request */
>  		ret = rte_mempool_ops_dequeue_bulk(mp,
> -			&cache->objs[cache->len], req);
> +			temp_objs, req);
>  		if (unlikely(ret < 0)) {
>  			/*
>  			 * In the off chance that we are buffer constrained,
> @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>  			goto ring_dequeue;
>  		}
> 
> +		len = cache->len;
> +		for (i = 0; i < req; ++i, ++len) {
> +			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
> +		}
> +
>  		cache->len += req;
>  	}
> 
> +	uint64x2_t v_obj_table;
> +	uint64x2_t v_cache_objs;
> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +
>  	/* Now fill in the response ... */
> +#if defined __ARM_NEON
> +	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> +						len-=2, obj_table+=2) {
> +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> +		vst1q_u64((uint64_t *)obj_table, v_obj_table);
> +	}
> +	if (n & 0x1)
> +		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#else
>  	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> -		*obj_table = cache_objs[len];
> +		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#endif
> 
>  	cache->len -= n;
> 
> --
> 2.17.1
Honnappa Nagarahalli Oct. 2, 2021, 12:07 a.m. UTC | #7
<snip>
> 
> > Current mempool per core cache implementation is based on pointer For
> > most architectures, each pointer consumes 64b Replace it with
> > index-based implementation, where in each buffer is addressed by (pool
> > address + index)
> 
> I don't think it is going to work:
> On 64-bit systems difference between pool address and it's elem address
> could be bigger than 4GB.
Are you talking about a case where the memory pool size is more than 4GB?

> 
> > It will reduce memory requirements
> >
> > L3Fwd performance testing reveals minor improvements in the cache
> > performance and no change in throughput
> >
> > Micro-benchmarking the patch using mempool_perf_test shows significant
> > improvement with majority of the test cases
> >
> > Future plan involves replacing global pool's pointer-based
> > implementation with index-based implementation
> >
> > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > ---
> >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> >  lib/mempool/rte_mempool.c               |  8 +++
> >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> >  3 files changed, 74 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > b/drivers/mempool/ring/rte_mempool_ring.c
> > index b1f09ff28f4d..e55913e47f21 100644
> > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> rg_flags)
> >  		return -rte_errno;
> >
> >  	mp->pool_data = r;
> > -
> > +	mp->local_cache_base_addr = &r[1];
> >  	return 0;
> >  }
> >
> > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > index 59a588425bd6..424bdb19c323 100644
> > --- a/lib/mempool/rte_mempool.c
> > +++ b/lib/mempool/rte_mempool.c
> > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
> >  	int ret;
> >  	bool need_iova_contig_obj;
> >  	size_t max_alloc_size = SIZE_MAX;
> > +	unsigned lcore_id;
> >
> >  	ret = mempool_ops_alloc_once(mp);
> >  	if (ret != 0)
> > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
> >  		}
> >  	}
> >
> > +	/* Init all default caches. */
> > +	if (mp->cache_size != 0) {
> > +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > +			mp->local_cache[lcore_id].local_cache_base_value =
> > +				*(void **)mp->local_cache_base_addr;
> > +	}
> > +
> >  	rte_mempool_trace_populate_default(mp);
> >  	return mp->size;
> >
> > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > index 4235d6f0bf2b..545405c0d3ce 100644
> > --- a/lib/mempool/rte_mempool.h
> > +++ b/lib/mempool/rte_mempool.h
> > @@ -51,6 +51,8 @@
> >  #include <rte_memcpy.h>
> >  #include <rte_common.h>
> >
> > +#include <arm_neon.h>
> > +
> >  #include "rte_mempool_trace_fp.h"
> >
> >  #ifdef __cplusplus
> > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> >  	uint32_t size;	      /**< Size of the cache */
> >  	uint32_t flushthresh; /**< Threshold before we flush excess elements
> */
> >  	uint32_t len;	      /**< Current cache count */
> > +	void *local_cache_base_value; /**< Base value to calculate indices
> > +*/
> >  	/*
> >  	 * Cache is allocated to this size to allow it to overflow in certain
> >  	 * cases to avoid needless emptying of cache.
> >  	 */
> > -	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> objects */
> > +	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> objects */
> >  } __rte_cache_aligned;
> >
> >  /**
> > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> >   * A list of memory where objects are stored
> >   */
> >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > -
> >  /**
> >   * Callback used to free a memory chunk
> >   */
> > @@ -244,6 +246,7 @@ struct rte_mempool {
> >  	int32_t ops_index;
> >
> >  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> > +	void *local_cache_base_addr; /**< Reference to the base value */
> >
> >  	uint32_t populated_size;         /**< Number of populated objects. */
> >  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in
> > pool */ @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct
> rte_mempool_cache *cache,
> >  	if (cache == NULL || cache->len == 0)
> >  		return;
> >  	rte_mempool_trace_cache_flush(cache, mp);
> > -	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > +
> > +	unsigned int i;
> > +	unsigned int cache_len = cache->len;
> > +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > +	void *base_value = cache->local_cache_base_value;
> > +	uint32_t *cache_objs = cache->objs;
> > +	for (i = 0; i < cache_len; i++)
> > +		obj_table[i] = (void *) RTE_PTR_ADD(base_value,
> cache_objs[i]);
> > +	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> >  	cache->len = 0;
> >  }
> >
> > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> >  		      unsigned int n, struct rte_mempool_cache *cache)  {
> > -	void **cache_objs;
> > +	uint32_t *cache_objs;
> > +	void *base_value;
> > +	uint32_t i;
> >
> >  	/* increment stat now, adding in mempool always success */
> >  	__MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12
> @@
> > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> >
> >  	cache_objs = &cache->objs[cache->len];
> >
> > +	base_value = cache->local_cache_base_value;
> > +
> > +	uint64x2_t v_obj_table;
> > +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > +	uint32x2_t v_cache_objs;
> > +
> >  	/*
> >  	 * The cache follows the following algorithm
> >  	 *   1. Add the objects to the cache
> > @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool
> *mp, void * const *obj_table,
> >  	 */
> >
> >  	/* Add elements back into the cache */
> > -	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> > +
> > +#if defined __ARM_NEON
> > +	for (i = 0; i < (n & ~0x1); i+=2) {
> > +		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> > +		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table,
> v_base_value));
> > +		vst1_u32(cache_objs + i, v_cache_objs);
> > +	}
> > +	if (n & 0x1) {
> > +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i],
> base_value);
> > +	}
> > +#else
> > +	for (i = 0; i < n; i++) {
> > +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i],
> base_value);
> > +	}
> > +#endif
> >
> >  	cache->len += n;
> >
> >  	if (cache->len >= cache->flushthresh) {
> > -		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache-
> >size],
> > +		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len
> -
> > +cache->size,
> >  				cache->len - cache->size);
> >  		cache->len = cache->size;
> >  	}
> > @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool
> *mp, void **obj_table,
> >  		      unsigned int n, struct rte_mempool_cache *cache)  {
> >  	int ret;
> > +	uint32_t i;
> >  	uint32_t index, len;
> > -	void **cache_objs;
> > +	uint32_t *cache_objs;
> >
> >  	/* No cache provided or cannot be satisfied from cache */
> >  	if (unlikely(cache == NULL || n >= cache->size))
> >  		goto ring_dequeue;
> >
> > +	void *base_value = cache->local_cache_base_value;
> >  	cache_objs = cache->objs;
> >
> >  	/* Can this be satisfied from the cache? */
> >  	if (cache->len < n) {
> >  		/* No. Backfill the cache first, and then fill from it */
> >  		uint32_t req = n + (cache->size - cache->len);
> > +		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> /**< Cache objects
> > +*/
> >
> >  		/* How many do we require i.e. number to fill the cache + the
> request */
> >  		ret = rte_mempool_ops_dequeue_bulk(mp,
> > -			&cache->objs[cache->len], req);
> > +			temp_objs, req);
> >  		if (unlikely(ret < 0)) {
> >  			/*
> >  			 * In the off chance that we are buffer constrained,
> @@ -1442,12
> > +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void
> **obj_table,
> >  			goto ring_dequeue;
> >  		}
> >
> > +		len = cache->len;
> > +		for (i = 0; i < req; ++i, ++len) {
> > +			cache_objs[len] = (uint32_t)
> RTE_PTR_DIFF(temp_objs[i], base_value);
> > +		}
> > +
> >  		cache->len += req;
> >  	}
> >
> > +	uint64x2_t v_obj_table;
> > +	uint64x2_t v_cache_objs;
> > +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > +
> >  	/* Now fill in the response ... */
> > +#if defined __ARM_NEON
> > +	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> > +						len-=2, obj_table+=2) {
> > +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> > +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> > +		vst1q_u64((uint64_t *)obj_table, v_obj_table);
> > +	}
> > +	if (n & 0x1)
> > +		*obj_table = (void *) RTE_PTR_ADD(base_value,
> cache_objs[len]);
> > +#else
> >  	for (index = 0, len = cache->len - 1; index < n; ++index, len--,
> obj_table++)
> > -		*obj_table = cache_objs[len];
> > +		*obj_table = (void *) RTE_PTR_ADD(base_value,
> cache_objs[len]);
> > +#endif
> >
> >  	cache->len -= n;
> >
> > --
> > 2.17.1
Ananyev, Konstantin Oct. 2, 2021, 6:51 p.m. UTC | #8
> > > Current mempool per core cache implementation is based on pointer For
> > > most architectures, each pointer consumes 64b Replace it with
> > > index-based implementation, where in each buffer is addressed by (pool
> > > address + index)
> >
> > I don't think it is going to work:
> > On 64-bit systems difference between pool address and it's elem address
> > could be bigger than 4GB.
> Are you talking about a case where the memory pool size is more than 4GB?

That is one possible scenario.
Another possibility - the user populates the mempool himself with some external
memory by calling rte_mempool_populate_iova() directly.
I suppose such a situation can even occur with a normal rte_mempool_create(),
though it should be a really rare one.

> 
> >
> > > It will reduce memory requirements
> > >
> > > L3Fwd performance testing reveals minor improvements in the cache
> > > performance and no change in throughput
> > >
> > > Micro-benchmarking the patch using mempool_perf_test shows significant
> > > improvement with majority of the test cases
> > >
> > > Future plan involves replacing global pool's pointer-based
> > > implementation with index-based implementation
> > >
> > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > > ---
> > >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> > >  lib/mempool/rte_mempool.c               |  8 +++
> > >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> > >  3 files changed, 74 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > > b/drivers/mempool/ring/rte_mempool_ring.c
> > > index b1f09ff28f4d..e55913e47f21 100644
> > > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> > rg_flags)
> > >  		return -rte_errno;
> > >
> > >  	mp->pool_data = r;
> > > -
> > > +	mp->local_cache_base_addr = &r[1];
> > >  	return 0;
> > >  }
> > >
> > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > > index 59a588425bd6..424bdb19c323 100644
> > > --- a/lib/mempool/rte_mempool.c
> > > +++ b/lib/mempool/rte_mempool.c
> > > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> > >  	int ret;
> > >  	bool need_iova_contig_obj;
> > >  	size_t max_alloc_size = SIZE_MAX;
> > > +	unsigned lcore_id;
> > >
> > >  	ret = mempool_ops_alloc_once(mp);
> > >  	if (ret != 0)
> > > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> > >  		}
> > >  	}
> > >
> > > +	/* Init all default caches. */
> > > +	if (mp->cache_size != 0) {
> > > +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > > +			mp->local_cache[lcore_id].local_cache_base_value =
> > > +				*(void **)mp->local_cache_base_addr;
> > > +	}
> > > +
> > >  	rte_mempool_trace_populate_default(mp);
> > >  	return mp->size;
> > >
> > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > index 4235d6f0bf2b..545405c0d3ce 100644
> > > --- a/lib/mempool/rte_mempool.h
> > > +++ b/lib/mempool/rte_mempool.h
> > > @@ -51,6 +51,8 @@
> > >  #include <rte_memcpy.h>
> > >  #include <rte_common.h>
> > >
> > > +#include <arm_neon.h>
> > > +
> > >  #include "rte_mempool_trace_fp.h"
> > >
> > >  #ifdef __cplusplus
> > > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> > >  	uint32_t size;	      /**< Size of the cache */
> > >  	uint32_t flushthresh; /**< Threshold before we flush excess elements
> > */
> > >  	uint32_t len;	      /**< Current cache count */
> > > +	void *local_cache_base_value; /**< Base value to calculate indices
> > > +*/
> > >  	/*
> > >  	 * Cache is allocated to this size to allow it to overflow in certain
> > >  	 * cases to avoid needless emptying of cache.
> > >  	 */
> > > -	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > objects */
> > > +	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > objects */
> > >  } __rte_cache_aligned;
> > >
> > >  /**
> > > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> > >   * A list of memory where objects are stored
> > >   */
> > >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > > -
> > >  /**
> > >   * Callback used to free a memory chunk
> > >   */
> > > @@ -244,6 +246,7 @@ struct rte_mempool {
> > >  	int32_t ops_index;
> > >
> > >  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> > > +	void *local_cache_base_addr; /**< Reference to the base value */
> > >
> > >  	uint32_t populated_size;         /**< Number of populated objects. */
> > >  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in
> > > pool */ @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct
> > rte_mempool_cache *cache,
> > >  	if (cache == NULL || cache->len == 0)
> > >  		return;
> > >  	rte_mempool_trace_cache_flush(cache, mp);
> > > -	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > > +
> > > +	unsigned int i;
> > > +	unsigned int cache_len = cache->len;
> > > +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > +	void *base_value = cache->local_cache_base_value;
> > > +	uint32_t *cache_objs = cache->objs;
> > > +	for (i = 0; i < cache_len; i++)
> > > +		obj_table[i] = (void *) RTE_PTR_ADD(base_value,
> > cache_objs[i]);
> > > +	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> > >  	cache->len = 0;
> > >  }
> > >
> > > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> > >  		      unsigned int n, struct rte_mempool_cache *cache)  {
> > > -	void **cache_objs;
> > > +	uint32_t *cache_objs;
> > > +	void *base_value;
> > > +	uint32_t i;
> > >
> > >  	/* increment stat now, adding in mempool always success */
> > >  	__MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12
> > @@
> > > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> > >
> > >  	cache_objs = &cache->objs[cache->len];
> > >
> > > +	base_value = cache->local_cache_base_value;
> > > +
> > > +	uint64x2_t v_obj_table;
> > > +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > +	uint32x2_t v_cache_objs;
> > > +
> > >  	/*
> > >  	 * The cache follows the following algorithm
> > >  	 *   1. Add the objects to the cache
> > > @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool
> > *mp, void * const *obj_table,
> > >  	 */
> > >
> > >  	/* Add elements back into the cache */
> > > -	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> > > +
> > > +#if defined __ARM_NEON
> > > +	for (i = 0; i < (n & ~0x1); i+=2) {
> > > +		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> > > +		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table,
> > v_base_value));
> > > +		vst1_u32(cache_objs + i, v_cache_objs);
> > > +	}
> > > +	if (n & 0x1) {
> > > +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i],
> > base_value);
> > > +	}
> > > +#else
> > > +	for (i = 0; i < n; i++) {
> > > +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i],
> > base_value);
> > > +	}
> > > +#endif
> > >
> > >  	cache->len += n;
> > >
> > >  	if (cache->len >= cache->flushthresh) {
> > > -		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache-
> > >size],
> > > +		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len
> > -
> > > +cache->size,
> > >  				cache->len - cache->size);
> > >  		cache->len = cache->size;
> > >  	}
> > > @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool
> > *mp, void **obj_table,
> > >  		      unsigned int n, struct rte_mempool_cache *cache)  {
> > >  	int ret;
> > > +	uint32_t i;
> > >  	uint32_t index, len;
> > > -	void **cache_objs;
> > > +	uint32_t *cache_objs;
> > >
> > >  	/* No cache provided or cannot be satisfied from cache */
> > >  	if (unlikely(cache == NULL || n >= cache->size))
> > >  		goto ring_dequeue;
> > >
> > > +	void *base_value = cache->local_cache_base_value;
> > >  	cache_objs = cache->objs;
> > >
> > >  	/* Can this be satisfied from the cache? */
> > >  	if (cache->len < n) {
> > >  		/* No. Backfill the cache first, and then fill from it */
> > >  		uint32_t req = n + (cache->size - cache->len);
> > > +		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > /**< Cache objects
> > > +*/
> > >
> > >  		/* How many do we require i.e. number to fill the cache + the
> > request */
> > >  		ret = rte_mempool_ops_dequeue_bulk(mp,
> > > -			&cache->objs[cache->len], req);
> > > +			temp_objs, req);
> > >  		if (unlikely(ret < 0)) {
> > >  			/*
> > >  			 * In the off chance that we are buffer constrained,
> > @@ -1442,12
> > > +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void
> > **obj_table,
> > >  			goto ring_dequeue;
> > >  		}
> > >
> > > +		len = cache->len;
> > > +		for (i = 0; i < req; ++i, ++len) {
> > > +			cache_objs[len] = (uint32_t)
> > RTE_PTR_DIFF(temp_objs[i], base_value);
> > > +		}
> > > +
> > >  		cache->len += req;
> > >  	}
> > >
> > > +	uint64x2_t v_obj_table;
> > > +	uint64x2_t v_cache_objs;
> > > +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > +
> > >  	/* Now fill in the response ... */
> > > +#if defined __ARM_NEON
> > > +	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> > > +						len-=2, obj_table+=2) {
> > > +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> > > +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> > > +		vst1q_u64((uint64_t *)obj_table, v_obj_table);
> > > +	}
> > > +	if (n & 0x1)
> > > +		*obj_table = (void *) RTE_PTR_ADD(base_value,
> > cache_objs[len]);
> > > +#else
> > >  	for (index = 0, len = cache->len - 1; index < n; ++index, len--,
> > obj_table++)
> > > -		*obj_table = cache_objs[len];
> > > +		*obj_table = (void *) RTE_PTR_ADD(base_value,
> > cache_objs[len]);
> > > +#endif
> > >
> > >  	cache->len -= n;
> > >
> > > --
> > > 2.17.1
Honnappa Nagarahalli Oct. 4, 2021, 4:36 p.m. UTC | #9
<snip>
> 
> 
> > > > Current mempool per core cache implementation is based on pointer
> > > > For most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by
> > > > (pool address + index)
> > >
> > > I don't think it is going to work:
> > > On 64-bit systems difference between pool address and it's elem
> > > address could be bigger than 4GB.
> > Are you talking about a case where the memory pool size is more than 4GB?
> 
> That is one possible scenario.
> Another possibility - user populates mempool himself with some external
> memory by calling rte_mempool_populate_iova() directly.
Is the concern that IOVA might not be contiguous for all the memory used by the mempool?

> I suppose such situation can even occur even with normal
> rte_mempool_create(), though it should be a really rare one.
All in all, this feature needs to be configurable during compile time.

> 
> >
> > >
<snip>
Morten Brørup Oct. 30, 2021, 10:23 a.m. UTC | #10
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 4 October 2021 18.36
> 
> <snip>
> >
> >
> > > > > Current mempool per core cache implementation is based on
> pointer
> > > > > For most architectures, each pointer consumes 64b Replace it
> with
> > > > > index-based implementation, where in each buffer is addressed
> by
> > > > > (pool address + index)

I like Dharmik's suggestion very much. CPU cache is a critical and limited resource.

DPDK has a tendency of using pointers where indexes could be used instead. I suppose pointers provide the additional flexibility of mixing entries from different memory pools, e.g. multiple mbuf pools.

> > > >
> > > > I don't think it is going to work:
> > > > On 64-bit systems difference between pool address and it's elem
> > > > address could be bigger than 4GB.
> > > Are you talking about a case where the memory pool size is more
> than 4GB?
> >
> > That is one possible scenario.

That could be solved by making the index an element index instead of a pointer offset: address = (pool address + index * element size).
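
A minimal sketch of that conversion (the helper names are made up for
illustration and are not part of the mempool API; it assumes all objects live
in one contiguous region starting at base):

#include <stddef.h>
#include <stdint.h>

/* Convert between an object pointer and its element index, scaling by
 * the (runtime) element size.
 */
static inline uint32_t
obj_to_index(const void *base, const void *obj, size_t elt_size)
{
	return (uint32_t)(((uintptr_t)obj - (uintptr_t)base) / elt_size);
}

static inline void *
index_to_obj(void *base, uint32_t idx, size_t elt_size)
{
	return (char *)base + (size_t)idx * elt_size;
}

The division by the run-time element size is what the compile-time scaling
discussed below avoids.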

> > Another possibility - user populates mempool himself with some
> external
> > memory by calling rte_mempool_populate_iova() directly.
> Is the concern that IOVA might not be contiguous for all the memory
> used by the mempool?
> 
> > I suppose such situation can even occur even with normal
> > rte_mempool_create(), though it should be a really rare one.
> All in all, this feature needs to be configurable during compile time.
Morten Brørup Oct. 31, 2021, 8:14 a.m. UTC | #11
> From: Morten Brørup
> Sent: Saturday, 30 October 2021 12.24
> 
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> > Nagarahalli
> > Sent: Monday, 4 October 2021 18.36
> >
> > <snip>
> > >
> > >
> > > > > > Current mempool per core cache implementation is based on
> > pointer
> > > > > > For most architectures, each pointer consumes 64b Replace it
> > with
> > > > > > index-based implementation, where in each buffer is addressed
> > by
> > > > > > (pool address + index)
> 
> I like Dharmik's suggestion very much. CPU cache is a critical and
> limited resource.
> 
> DPDK has a tendency of using pointers where indexes could be used
> instead. I suppose pointers provide the additional flexibility of
> mixing entries from different memory pools, e.g. multiple mbuf pools.
> 
> > > > >
> > > > > I don't think it is going to work:
> > > > > On 64-bit systems difference between pool address and it's elem
> > > > > address could be bigger than 4GB.
> > > > Are you talking about a case where the memory pool size is more
> > than 4GB?
> > >
> > > That is one possible scenario.
> 
> That could be solved by making the index an element index instead of a
> pointer offset: address = (pool address + index * element size).

Or instead of scaling the index with the element size, which is only known at runtime, the index could be more efficiently scaled by a compile time constant such as RTE_MEMPOOL_ALIGN (= RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would allow indexing into mempools up to 256 GB in size.
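
A sketch of the same idea with a compile-time scale factor (RTE_CACHE_LINE_SIZE
and RTE_PTR_ADD are existing macros; the helper names are made up), assuming
every object starts on a cache-line boundary:

#include <stddef.h>
#include <stdint.h>
#include <rte_common.h>	/* RTE_CACHE_LINE_SIZE, RTE_PTR_ADD */

/* A 32-bit index scaled by a 64-byte cache line can address
 * 2^32 * 64 bytes = 256 GB of pool memory.
 */
static inline uint32_t
obj_to_cl_index(const void *base, const void *obj)
{
	return (uint32_t)(((uintptr_t)obj - (uintptr_t)base) /
			RTE_CACHE_LINE_SIZE);
}

static inline void *
cl_index_to_obj(void *base, uint32_t idx)
{
	return RTE_PTR_ADD(base, (size_t)idx * RTE_CACHE_LINE_SIZE);
}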

> 
> > > Another possibility - user populates mempool himself with some
> > external
> > > memory by calling rte_mempool_populate_iova() directly.
> > Is the concern that IOVA might not be contiguous for all the memory
> > used by the mempool?
> >
> > > I suppose such situation can even occur even with normal
> > > rte_mempool_create(), though it should be a really rare one.
> > All in all, this feature needs to be configurable during compile
> time.
Dharmik Thakkar Nov. 3, 2021, 3:12 p.m. UTC | #12
Hi,

Thank you everyone for the comments! I am currently working on making the global pool ring’s implementation index-based.
Once done, I will send a patch for community review. I will also make it a compile-time option.

> On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com> wrote:
> 
>> From: Morten Brørup
>> Sent: Saturday, 30 October 2021 12.24
>> 
>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
>>> Nagarahalli
>>> Sent: Monday, 4 October 2021 18.36
>>> 
>>> <snip>
>>>> 
>>>> 
>>>>>>> Current mempool per core cache implementation is based on
>>> pointer
>>>>>>> For most architectures, each pointer consumes 64b Replace it
>>> with
>>>>>>> index-based implementation, where in each buffer is addressed
>>> by
>>>>>>> (pool address + index)
>> 
>> I like Dharmik's suggestion very much. CPU cache is a critical and
>> limited resource.
>> 
>> DPDK has a tendency of using pointers where indexes could be used
>> instead. I suppose pointers provide the additional flexibility of
>> mixing entries from different memory pools, e.g. multiple mbuf pools.
>> 

Agreed, thank you!

>>>>>> 
>>>>>> I don't think it is going to work:
>>>>>> On 64-bit systems difference between pool address and it's elem
>>>>>> address could be bigger than 4GB.
>>>>> Are you talking about a case where the memory pool size is more
>>> than 4GB?
>>>> 
>>>> That is one possible scenario.
>> 
>> That could be solved by making the index an element index instead of a
>> pointer offset: address = (pool address + index * element size).
> 
> Or instead of scaling the index with the element size, which is only known at runtime, the index could be more efficiently scaled by a compile time constant such as RTE_MEMPOOL_ALIGN (= RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would allow indexing into mempools up to 256 GB in size.
> 

Looking at this snippet [1] from rte_mempool_op_populate_helper(), an ‘offset’ is added to keep objects from crossing page boundaries. If my understanding is correct, using the element index instead of a pointer offset will pose a challenge for some of the corner cases.

[1]
        for (i = 0; i < max_objs; i++) {                                           
                /* avoid objects to cross page boundaries */
                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
                                off += total_elt_sz -
                                        (((uintptr_t)(va + off - 1) %
                                                total_elt_sz) + 1);
                }

>> 
>>>> Another possibility - user populates mempool himself with some
>>> external
>>>> memory by calling rte_mempool_populate_iova() directly.
>>> Is the concern that IOVA might not be contiguous for all the memory
>>> used by the mempool?
>>> 
>>>> I suppose such situation can even occur even with normal
>>>> rte_mempool_create(), though it should be a really rare one.
>>> All in all, this feature needs to be configurable during compile
>> time.
>
Morten Brørup Nov. 3, 2021, 3:52 p.m. UTC | #13
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> Sent: Wednesday, 3 November 2021 16.13
> 
> Hi,
> 
> Thank you everyone for the comments! I am currently working on making
> the global pool ring’s implementation as index based.
> Once done, I will send a patch for community review. I will also make
> it as a compile time option.

Sounds good to me.

This could probably be abstracted to other libraries too. E.g. the ring library holds pointers to objects (void *); an alternative ring library could hold indexes to objects (uint32_t). A ring often holds objects from the same mempool, and the application knows which mempool, so indexing would be useful here too.

> 
> > On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com>
> wrote:
> >
> >> From: Morten Brørup
> >> Sent: Saturday, 30 October 2021 12.24
> >>
> >>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> >>> Nagarahalli
> >>> Sent: Monday, 4 October 2021 18.36
> >>>
> >>> <snip>
> >>>>
> >>>>
> >>>>>>> Current mempool per core cache implementation is based on
> >>> pointer
> >>>>>>> For most architectures, each pointer consumes 64b Replace it
> >>> with
> >>>>>>> index-based implementation, where in each buffer is addressed
> >>> by
> >>>>>>> (pool address + index)
> >>
> >> I like Dharmik's suggestion very much. CPU cache is a critical and
> >> limited resource.
> >>
> >> DPDK has a tendency of using pointers where indexes could be used
> >> instead. I suppose pointers provide the additional flexibility of
> >> mixing entries from different memory pools, e.g. multiple mbuf
> pools.
> >>
> 
> Agreed, thank you!
> 
> >>>>>>
> >>>>>> I don't think it is going to work:
> >>>>>> On 64-bit systems difference between pool address and it's elem
> >>>>>> address could be bigger than 4GB.
> >>>>> Are you talking about a case where the memory pool size is more
> >>> than 4GB?
> >>>>
> >>>> That is one possible scenario.
> >>
> >> That could be solved by making the index an element index instead of
> a
> >> pointer offset: address = (pool address + index * element size).
> >
> > Or instead of scaling the index with the element size, which is only
> known at runtime, the index could be more efficiently scaled by a
> compile time constant such as RTE_MEMPOOL_ALIGN (=
> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> allow indexing into mempools up to 256 GB in size.
> >
> 
> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> there is an ‘offset’ added to avoid objects to cross page boundaries.
> If my understanding is correct, using the index of element instead of a
> pointer offset will pose a challenge for some of the corner cases.
> 
> [1]
>         for (i = 0; i < max_objs; i++) {
>                 /* avoid objects to cross page boundaries */
>                 if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
> 0) {
>                         off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> (va + off);
>                         if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
>                                 off += total_elt_sz -
>                                         (((uintptr_t)(va + off - 1) %
>                                                 total_elt_sz) + 1);
>                 }
> 

OK. As an alternative to scaling the index with a cache line size, you can scale it with sizeof(uintptr_t) to be able to address 32 GB or 16 GB mempools on 64-bit and 32-bit architectures, respectively. Both x86 and ARM CPUs have instructions to access memory with an added offset multiplied by 4 or 8. So that should be high performance.
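
As a quick sanity check on those numbers, a throwaway sketch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 32-bit index scaled by the native pointer size:
	 * 2^32 * 8 = 32 GB on 64-bit, 2^32 * 4 = 16 GB on 32-bit.
	 */
	printf("max addressable pool size: %llu GB\n",
	       ((1ULL << 32) * sizeof(uintptr_t)) >> 30);
	return 0;
}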

> >>
> >>>> Another possibility - user populates mempool himself with some
> >>> external
> >>>> memory by calling rte_mempool_populate_iova() directly.
> >>> Is the concern that IOVA might not be contiguous for all the memory
> >>> used by the mempool?
> >>>
> >>>> I suppose such situation can even occur even with normal
> >>>> rte_mempool_create(), though it should be a really rare one.
> >>> All in all, this feature needs to be configurable during compile
> >> time.
> >
Dharmik Thakkar Nov. 4, 2021, 4:42 a.m. UTC | #14
> On Nov 3, 2021, at 10:52 AM, Morten Brørup <mb@smartsharesystems.com> wrote:
> 
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
>> Sent: Wednesday, 3 November 2021 16.13
>> 
>> Hi,
>> 
>> Thank you everyone for the comments! I am currently working on making
>> the global pool ring’s implementation as index based.
>> Once done, I will send a patch for community review. I will also make
>> it as a compile time option.
> 
> Sounds good to me.
> 
> This could probably be abstracted to other libraries too. E.g. the ring library holds pointers to objects (void *); an alternative ring library could hold indexes to objects (uint32_t). A ring often holds objects from the same mempool, and the application knows which mempool, so indexing would be useful here too.
> 

Yes, the ring library within DPDK has APIs to support a configurable element size.
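
For reference, a minimal sketch of that API with 4-byte elements (the ring name,
size and flags are arbitrary; error handling omitted):

#include <stdint.h>
#include <rte_ring.h>
#include <rte_ring_elem.h>

/* Create a ring whose slots hold 4-byte indexes instead of pointers. */
static struct rte_ring *
make_index_ring(void)
{
	return rte_ring_create_elem("idx_ring", sizeof(uint32_t), 1024,
			SOCKET_ID_ANY, RING_F_SP_ENQ | RING_F_SC_DEQ);
}

static void
index_ring_roundtrip(struct rte_ring *r)
{
	uint32_t in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
	uint32_t out[8];

	(void)rte_ring_enqueue_bulk_elem(r, in, sizeof(uint32_t), 8, NULL);
	(void)rte_ring_dequeue_bulk_elem(r, out, sizeof(uint32_t), 8, NULL);
}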

>> 
>>> On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com>
>> wrote:
>>> 
>>>> From: Morten Brørup
>>>> Sent: Saturday, 30 October 2021 12.24
>>>> 
>>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
>>>>> Nagarahalli
>>>>> Sent: Monday, 4 October 2021 18.36
>>>>> 
>>>>> <snip>
>>>>>> 
>>>>>> 
>>>>>>>>> Current mempool per core cache implementation is based on
>>>>> pointer
>>>>>>>>> For most architectures, each pointer consumes 64b Replace it
>>>>> with
>>>>>>>>> index-based implementation, where in each buffer is addressed
>>>>> by
>>>>>>>>> (pool address + index)
>>>> 
>>>> I like Dharmik's suggestion very much. CPU cache is a critical and
>>>> limited resource.
>>>> 
>>>> DPDK has a tendency of using pointers where indexes could be used
>>>> instead. I suppose pointers provide the additional flexibility of
>>>> mixing entries from different memory pools, e.g. multiple mbuf
>> pools.
>>>> 
>> 
>> Agreed, thank you!
>> 
>>>>>>>> 
>>>>>>>> I don't think it is going to work:
>>>>>>>> On 64-bit systems difference between pool address and it's elem
>>>>>>>> address could be bigger than 4GB.
>>>>>>> Are you talking about a case where the memory pool size is more
>>>>> than 4GB?
>>>>>> 
>>>>>> That is one possible scenario.
>>>> 
>>>> That could be solved by making the index an element index instead of
>> a
>>>> pointer offset: address = (pool address + index * element size).
>>> 
>>> Or instead of scaling the index with the element size, which is only
>> known at runtime, the index could be more efficiently scaled by a
>> compile time constant such as RTE_MEMPOOL_ALIGN (=
>> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
>> allow indexing into mempools up to 256 GB in size.
>>> 
>> 
>> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
>> there is an ‘offset’ added to avoid objects to cross page boundaries.
>> If my understanding is correct, using the index of element instead of a
>> pointer offset will pose a challenge for some of the corner cases.
>> 
>> [1]
>>        for (i = 0; i < max_objs; i++) {
>>                /* avoid objects to cross page boundaries */
>>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
>> 0) {
>>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
>> (va + off);
>>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
>>                                off += total_elt_sz -
>>                                        (((uintptr_t)(va + off - 1) %
>>                                                total_elt_sz) + 1);
>>                }
>> 
> 
> OK. Alternatively to scaling the index with a cache line size, you can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB mempools on respectively 64 bit and 32 bit architectures. Both x86 and ARM CPUs have instructions to access memory with an added offset multiplied by 4 or 8. So that should be high performance.

Yes, agreed this can be done.
Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is not enabled.
On a side note, I wanted to better understand the need for having the 'MEMPOOL_F_NO_CACHE_ALIGN' option.

> 
>>>> 
>>>>>> Another possibility - user populates mempool himself with some
>>>>> external
>>>>>> memory by calling rte_mempool_populate_iova() directly.
>>>>> Is the concern that IOVA might not be contiguous for all the memory
>>>>> used by the mempool?
>>>>> 
>>>>>> I suppose such situation can even occur even with normal
>>>>>> rte_mempool_create(), though it should be a really rare one.
>>>>> All in all, this feature needs to be configurable during compile
>>>> time.
>>> 
>
Morten Brørup Nov. 4, 2021, 8:04 a.m. UTC | #15
+ Ring library maintainers (@Honnappa and @Konstantin) for my rants about its documentation.

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> Sent: Thursday, 4 November 2021 05.42
> 
> > On Nov 3, 2021, at 10:52 AM, Morten Brørup <mb@smartsharesystems.com>
> wrote:
> >
> >> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> >> Sent: Wednesday, 3 November 2021 16.13
> >>
> >> Hi,
> >>
> >> Thank you everyone for the comments! I am currently working on
> making
> >> the global pool ring’s implementation as index based.
> >> Once done, I will send a patch for community review. I will also
> make
> >> it as a compile time option.
> >
> > Sounds good to me.
> >
> > This could probably be abstracted to other libraries too. E.g. the
> ring library holds pointers to objects (void *); an alternative ring
> library could hold indexes to objects (uint32_t). A ring often holds
> objects from the same mempool, and the application knows which mempool,
> so indexing would be useful here too.
> >
> 
> Yes, ring library within DPDK has the APIs to support configurable
> element size

I remember seeing that feature proposed on the mailing list too, but I couldn't find it in the API documentation, so I was not sure it was ever accepted.

The containers section of the API documentation (/doc/api/doxy-api-index.md) doesn't contain any references to it. And the description of the RTE Ring library, which the "ring" link in the API documentation refers to, clearly says: The Ring Manager is a fixed-size queue, implemented as a table of *pointers*. (My emphasis.) So I thought it wasn't accepted.

However, searching for it in the source code reveals that it is indeed there! And the Ring Library chapter in the Programmer's Guide does mention that the objects can be something else than pointers.

So the documentation is not all screwed up, just a little sparse. :-)

> 
> >>
> >>> On Oct 31, 2021, at 3:14 AM, Morten Brørup
> <mb@smartsharesystems.com>
> >> wrote:
> >>>
> >>>> From: Morten Brørup
> >>>> Sent: Saturday, 30 October 2021 12.24
> >>>>
> >>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> >>>>> Nagarahalli
> >>>>> Sent: Monday, 4 October 2021 18.36
> >>>>>
> >>>>> <snip>
> >>>>>>
> >>>>>>
> >>>>>>>>> Current mempool per core cache implementation is based on
> >>>>> pointer
> >>>>>>>>> For most architectures, each pointer consumes 64b Replace it
> >>>>> with
> >>>>>>>>> index-based implementation, where in each buffer is addressed
> >>>>> by
> >>>>>>>>> (pool address + index)
> >>>>
> >>>> I like Dharmik's suggestion very much. CPU cache is a critical and
> >>>> limited resource.
> >>>>
> >>>> DPDK has a tendency of using pointers where indexes could be used
> >>>> instead. I suppose pointers provide the additional flexibility of
> >>>> mixing entries from different memory pools, e.g. multiple mbuf
> >> pools.
> >>>>
> >>
> >> Agreed, thank you!
> >>
> >>>>>>>>
> >>>>>>>> I don't think it is going to work:
> >>>>>>>> On 64-bit systems difference between pool address and it's
> elem
> >>>>>>>> address could be bigger than 4GB.
> >>>>>>> Are you talking about a case where the memory pool size is more
> >>>>> than 4GB?
> >>>>>>
> >>>>>> That is one possible scenario.
> >>>>
> >>>> That could be solved by making the index an element index instead
> of
> >> a
> >>>> pointer offset: address = (pool address + index * element size).
> >>>
> >>> Or instead of scaling the index with the element size, which is
> only
> >> known at runtime, the index could be more efficiently scaled by a
> >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> >> allow indexing into mempools up to 256 GB in size.
> >>>
> >>
> >> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> >> there is an ‘offset’ added to avoid objects to cross page
> boundaries.
> >> If my understanding is correct, using the index of element instead
> of a
> >> pointer offset will pose a challenge for some of the corner cases.
> >>
> >> [1]
> >>        for (i = 0; i < max_objs; i++) {
> >>                /* avoid objects to cross page boundaries */
> >>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
> >> 0) {
> >>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> >> (va + off);
> >>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> >>                                off += total_elt_sz -
> >>                                        (((uintptr_t)(va + off - 1) %
> >>                                                total_elt_sz) + 1);
> >>                }
> >>
> >
> > OK. Alternatively to scaling the index with a cache line size, you
> can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB
> mempools on respectively 64 bit and 32 bit architectures. Both x86 and
> ARM CPUs have instructions to access memory with an added offset
> multiplied by 4 or 8. So that should be high performance.
> 
> Yes, agreed this can be done.
> Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is not
> enabled.
> On a side note, I wanted to better understand the need for having the
> 'MEMPOOL_F_NO_CACHE_ALIGN' option.

The description of this field is misleading, and should be corrected.
The correct description would be: Don't need to align objs on cache lines.

It is useful for mempools containing very small objects, to conserve memory.
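
For example, a sketch of a small-object pool created without cache-line
alignment (the name and sizes are arbitrary):

#include <rte_mempool.h>

/* 8-byte objects: without the flag each object would be padded out to a
 * cache-line multiple; with it the per-object footprint is much smaller.
 */
static struct rte_mempool *
make_small_obj_pool(void)
{
	return rte_mempool_create("small_objs", 1 << 16, 8, 256, 0,
			NULL, NULL, NULL, NULL,
			SOCKET_ID_ANY, MEMPOOL_F_NO_CACHE_ALIGN);
}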

> 
> >
> >>>>
> >>>>>> Another possibility - user populates mempool himself with some
> >>>>> external
> >>>>>> memory by calling rte_mempool_populate_iova() directly.
> >>>>> Is the concern that IOVA might not be contiguous for all the
> memory
> >>>>> used by the mempool?
> >>>>>
> >>>>>> I suppose such situation can even occur even with normal
> >>>>>> rte_mempool_create(), though it should be a really rare one.
> >>>>> All in all, this feature needs to be configurable during compile
> >>>> time.
> >>>
> >
Honnappa Nagarahalli Nov. 8, 2021, 4:32 a.m. UTC | #16
<snip>

> > >>>>>>>>> Current mempool per core cache implementation is based on
> > >>>>> pointer
> > >>>>>>>>> For most architectures, each pointer consumes 64b Replace it
> > >>>>> with
> > >>>>>>>>> index-based implementation, where in each buffer is
> > >>>>>>>>> addressed
> > >>>>> by
> > >>>>>>>>> (pool address + index)
> > >>>>
> > >>>> I like Dharmik's suggestion very much. CPU cache is a critical
> > >>>> and limited resource.
> > >>>>
> > >>>> DPDK has a tendency of using pointers where indexes could be used
> > >>>> instead. I suppose pointers provide the additional flexibility of
> > >>>> mixing entries from different memory pools, e.g. multiple mbuf
> > >> pools.
> > >>>>
> > >>
> > >> Agreed, thank you!
> > >>
> > >>>>>>>>
> > >>>>>>>> I don't think it is going to work:
> > >>>>>>>> On 64-bit systems difference between pool address and it's
> > elem
> > >>>>>>>> address could be bigger than 4GB.
> > >>>>>>> Are you talking about a case where the memory pool size is
> > >>>>>>> more
> > >>>>> than 4GB?
> > >>>>>>
> > >>>>>> That is one possible scenario.
> > >>>>
> > >>>> That could be solved by making the index an element index instead
> > of
> > >> a
> > >>>> pointer offset: address = (pool address + index * element size).
> > >>>
> > >>> Or instead of scaling the index with the element size, which is
> > only
> > >> known at runtime, the index could be more efficiently scaled by a
> > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> > >> allow indexing into mempools up to 256 GB in size.
> > >>>
> > >>
> > >> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> > >> there is an ‘offset’ added to avoid objects to cross page
> > boundaries.
> > >> If my understanding is correct, using the index of element instead
> > of a
> > >> pointer offset will pose a challenge for some of the corner cases.
> > >>
> > >> [1]
> > >>        for (i = 0; i < max_objs; i++) {
> > >>                /* avoid objects to cross page boundaries */
> > >>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz)
> > >> <
> > >> 0) {
> > >>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> > >> (va + off);
> > >>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > >>                                off += total_elt_sz -
> > >>                                        (((uintptr_t)(va + off - 1) %
> > >>                                                total_elt_sz) + 1);
> > >>                }
> > >>
> > >
> > > OK. Alternatively to scaling the index with a cache line size, you
> > can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB
> > mempools on respectively 64 bit and 32 bit architectures. Both x86 and
> > ARM CPUs have instructions to access memory with an added offset
> > multiplied by 4 or 8. So that should be high performance.
> >
> > Yes, agreed this can be done.
> > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is
> > not enabled.
> > On a side note, I wanted to better understand the need for having the
> > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> 
> The description of this field is misleading, and should be corrected.
> The correct description would be: Don't need to align objs on cache lines.
> 
> It is useful for mempools containing very small objects, to conserve memory.
I think we can assume that mbuf pools are created with the 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset calculated with cache line size as the unit.

> 
> >
> > >
> > >>>>
> > >>>>>> Another possibility - user populates mempool himself with some
> > >>>>> external
> > >>>>>> memory by calling rte_mempool_populate_iova() directly.
> > >>>>> Is the concern that IOVA might not be contiguous for all the
> > memory
> > >>>>> used by the mempool?
> > >>>>>
> > >>>>>> I suppose such situation can even occur even with normal
> > >>>>>> rte_mempool_create(), though it should be a really rare one.
> > >>>>> All in all, this feature needs to be configurable during compile
> > >>>> time.
> > >>>
> > >
Morten Brørup Nov. 8, 2021, 7:22 a.m. UTC | #17
> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> Sent: Monday, 8 November 2021 05.33
> 
> <snip>
> 
> > > >>>>>>>>> Current mempool per core cache implementation is based on
> > > >>>>> pointer
> > > >>>>>>>>> For most architectures, each pointer consumes 64b Replace
> it
> > > >>>>> with
> > > >>>>>>>>> index-based implementation, where in each buffer is
> > > >>>>>>>>> addressed
> > > >>>>> by
> > > >>>>>>>>> (pool address + index)
> > > >>>>
> > > >>>> I like Dharmik's suggestion very much. CPU cache is a critical
> > > >>>> and limited resource.
> > > >>>>
> > > >>>> DPDK has a tendency of using pointers where indexes could be
> used
> > > >>>> instead. I suppose pointers provide the additional flexibility
> of
> > > >>>> mixing entries from different memory pools, e.g. multiple mbuf
> > > >> pools.
> > > >>>>
> > > >>
> > > >> Agreed, thank you!
> > > >>
> > > >>>>>>>>
> > > >>>>>>>> I don't think it is going to work:
> > > >>>>>>>> On 64-bit systems difference between pool address and it's
> > > elem
> > > >>>>>>>> address could be bigger than 4GB.
> > > >>>>>>> Are you talking about a case where the memory pool size is
> > > >>>>>>> more
> > > >>>>> than 4GB?
> > > >>>>>>
> > > >>>>>> That is one possible scenario.
> > > >>>>
> > > >>>> That could be solved by making the index an element index
> instead
> > > of
> > > >> a
> > > >>>> pointer offset: address = (pool address + index * element
> size).
> > > >>>
> > > >>> Or instead of scaling the index with the element size, which is
> > > only
> > > >> known at runtime, the index could be more efficiently scaled by
> a
> > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that
> would
> > > >> allow indexing into mempools up to 256 GB in size.
> > > >>>
> > > >>
> > > >> Looking at this snippet [1] from
> rte_mempool_op_populate_helper(),
> > > >> there is an ‘offset’ added to avoid objects to cross page
> > > boundaries.
> > > >> If my understanding is correct, using the index of element
> instead
> > > of a
> > > >> pointer offset will pose a challenge for some of the corner
> cases.
> > > >>
> > > >> [1]
> > > >>        for (i = 0; i < max_objs; i++) {
> > > >>                /* avoid objects to cross page boundaries */
> > > >>                if (check_obj_bounds(va + off, pg_sz,
> total_elt_sz)
> > > >> <
> > > >> 0) {
> > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> pg_sz) -
> > > >> (va + off);
> > > >>                        if (flags &
> RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > >>                                off += total_elt_sz -
> > > >>                                        (((uintptr_t)(va + off -
> 1) %
> > > >>                                                total_elt_sz) +
> 1);
> > > >>                }
> > > >>
> > > >
> > > > OK. Alternatively to scaling the index with a cache line size,
> you
> > > can scale it with sizeof(uintptr_t) to be able to address 32 or 16
> GB
> > > mempools on respectively 64 bit and 32 bit architectures. Both x86
> and
> > > ARM CPUs have instructions to access memory with an added offset
> > > multiplied by 4 or 8. So that should be high performance.
> > >
> > > Yes, agreed this can be done.
> > > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is
> > > not enabled.
> > > On a side note, I wanted to better understand the need for having
> the
> > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> >
> > The description of this field is misleading, and should be corrected.
> > The correct description would be: Don't need to align objs on cache
> lines.
> >
> > It is useful for mempools containing very small objects, to conserve
> memory.
> I think we can assume that mbuf pools are created with the
> 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> calculated with cache line size as the unit.

You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)

I agree. And since the flag is a hint only, it can be ignored if the mempool library is scaling the index with the cache line size.

However, a mempool may contain other objects than mbufs, and those objects may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost a lot of memory for such mempools.
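
To make the memory cost concrete, a toy example (not DPDK code) for a
hypothetical 8-byte element: cache aligning pads every object out to a
full cache line:

#include <stdio.h>

#define CACHE_LINE 64

int main(void)
{
	size_t elt = 8;	/* tiny object */
	size_t aligned = (elt + CACHE_LINE - 1) & ~(size_t)(CACHE_LINE - 1);

	/* Prints: unaligned: 8 B, cache aligned: 64 B (8x overhead) */
	printf("unaligned: %zu B, cache aligned: %zu B (%zux overhead)\n",
			elt, aligned, aligned / elt);
	return 0;
}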

> 
> >
> > >
> > > >
> > > >>>>
> > > >>>>>> Another possibility - user populates mempool himself with
> some
> > > >>>>> external
> > > >>>>>> memory by calling rte_mempool_populate_iova() directly.
> > > >>>>> Is the concern that IOVA might not be contiguous for all the
> > > memory
> > > >>>>> used by the mempool?
> > > >>>>>
> > > >>>>>> I suppose such situation can even occur even with normal
> > > >>>>>> rte_mempool_create(), though it should be a really rare one.
> > > >>>>> All in all, this feature needs to be configurable during
> compile
> > > >>>> time.
> > > >>>
> > > >
Honnappa Nagarahalli Nov. 8, 2021, 3:29 p.m. UTC | #18
<snip>

> > > > >>>>>>>>> Current mempool per core cache implementation is based
> > > > >>>>>>>>> on
> > > > >>>>> pointer
> > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > >>>>>>>>> Replace
> > it
> > > > >>>>> with
> > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > >>>>>>>>> addressed
> > > > >>>>> by
> > > > >>>>>>>>> (pool address + index)
> > > > >>>>
> > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > >>>> critical and limited resource.
> > > > >>>>
> > > > >>>> DPDK has a tendency of using pointers where indexes could be
> > used
> > > > >>>> instead. I suppose pointers provide the additional
> > > > >>>> flexibility
> > of
> > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > >>>> mbuf
> > > > >> pools.
> > > > >>>>
> > > > >>
> > > > >> Agreed, thank you!
> > > > >>
> > > > >>>>>>>>
> > > > >>>>>>>> I don't think it is going to work:
> > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > >>>>>>>> it's
> > > > elem
> > > > >>>>>>>> address could be bigger than 4GB.
> > > > >>>>>>> Are you talking about a case where the memory pool size is
> > > > >>>>>>> more
> > > > >>>>> than 4GB?
> > > > >>>>>>
> > > > >>>>>> That is one possible scenario.
> > > > >>>>
> > > > >>>> That could be solved by making the index an element index
> > instead
> > > > of
> > > > >> a
> > > > >>>> pointer offset: address = (pool address + index * element
> > size).
> > > > >>>
> > > > >>> Or instead of scaling the index with the element size, which
> > > > >>> is
> > > > only
> > > > >> known at runtime, the index could be more efficiently scaled by
> > a
> > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that
> > would
> > > > >> allow indexing into mempools up to 256 GB in size.
> > > > >>>
> > > > >>
> > > > >> Looking at this snippet [1] from
> > rte_mempool_op_populate_helper(),
> > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > boundaries.
> > > > >> If my understanding is correct, using the index of element
> > instead
> > > > of a
> > > > >> pointer offset will pose a challenge for some of the corner
> > cases.
> > > > >>
> > > > >> [1]
> > > > >>        for (i = 0; i < max_objs; i++) {
> > > > >>                /* avoid objects to cross page boundaries */
> > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > total_elt_sz)
> > > > >> <
> > > > >> 0) {
> > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > pg_sz) -
> > > > >> (va + off);
> > > > >>                        if (flags &
> > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > >>                                off += total_elt_sz -
> > > > >>                                        (((uintptr_t)(va + off -
> > 1) %
> > > > >>                                                total_elt_sz) +
> > 1);
> > > > >>                }
> > > > >>
> > > > >
> > > > > OK. Alternatively to scaling the index with a cache line size,
> > you
> > > > can scale it with sizeof(uintptr_t) to be able to address 32 or 16
> > GB
> > > > mempools on respectively 64 bit and 32 bit architectures. Both x86
> > and
> > > > ARM CPUs have instructions to access memory with an added offset
> > > > multiplied by 4 or 8. So that should be high performance.
> > > >
> > > > Yes, agreed this can be done.
> > > > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > is not enabled.
> > > > On a side note, I wanted to better understand the need for having
> > the
> > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > >
> > > The description of this field is misleading, and should be corrected.
> > > The correct description would be: Don't need to align objs on cache
> > lines.
> > >
> > > It is useful for mempools containing very small objects, to conserve
> > memory.
> > I think we can assume that mbuf pools are created with the
> > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > calculated with cache line size as the unit.
> 
> You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
Yes 😊

> 
> I agree. And since the flag is a hint only, it can be ignored if the mempool
> library is scaling the index with the cache line size.
I do not think we should ignore the flag, for the reason you mention below.

> 
> However, a mempool may contain other objects than mbufs, and those objects
> may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost a
> lot of memory for such mempools.
We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set, use the unit as 'sizeof(uintptr_t)', if not set use cache line size as the unit.
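
Sketch only (not from the patch) of that suggestion: derive the unit from
the flag once, when the pool is created. The helper name is an assumption
for the example; the flag and RTE_CACHE_LINE_SIZE are the existing DPDK
definitions.

#include <stdint.h>
#include <rte_mempool.h>

static inline uint32_t
mempool_index_unit(unsigned int flags)
{
	/* Small, unaligned objects: finer granularity, smaller max pool.
	 * Cache-aligned objects: 64 B unit, 256 GB max with a 32-bit index.
	 */
	return (flags & MEMPOOL_F_NO_CACHE_ALIGN) ?
			(uint32_t)sizeof(uintptr_t) : RTE_CACHE_LINE_SIZE;
}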

> 
> >
> > >
<snip>
Morten Brørup Nov. 8, 2021, 3:39 p.m. UTC | #19
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 8 November 2021 16.29
> 
> <snip>
> 
> > > > > >>>>>>>>> Current mempool per core cache implementation is
> based
> > > > > >>>>>>>>> on
> > > > > >>>>> pointer
> > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > >>>>>>>>> Replace
> > > it
> > > > > >>>>> with
> > > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > > >>>>>>>>> addressed
> > > > > >>>>> by
> > > > > >>>>>>>>> (pool address + index)
> > > > > >>>>
> > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > >>>> critical and limited resource.
> > > > > >>>>
> > > > > >>>> DPDK has a tendency of using pointers where indexes could
> be
> > > used
> > > > > >>>> instead. I suppose pointers provide the additional
> > > > > >>>> flexibility
> > > of
> > > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > > >>>> mbuf
> > > > > >> pools.
> > > > > >>>>
> > > > > >>
> > > > > >> Agreed, thank you!
> > > > > >>
> > > > > >>>>>>>>
> > > > > >>>>>>>> I don't think it is going to work:
> > > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > > >>>>>>>> it's
> > > > > elem
> > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > >>>>>>> Are you talking about a case where the memory pool size
> is
> > > > > >>>>>>> more
> > > > > >>>>> than 4GB?
> > > > > >>>>>>
> > > > > >>>>>> That is one possible scenario.
> > > > > >>>>
> > > > > >>>> That could be solved by making the index an element index
> > > instead
> > > > > of
> > > > > >> a
> > > > > >>>> pointer offset: address = (pool address + index * element
> > > size).
> > > > > >>>
> > > > > >>> Or instead of scaling the index with the element size,
> which
> > > > > >>> is
> > > > > only
> > > > > >> known at runtime, the index could be more efficiently scaled
> by
> > > a
> > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> that
> > > would
> > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > >>>
> > > > > >>
> > > > > >> Looking at this snippet [1] from
> > > rte_mempool_op_populate_helper(),
> > > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > > boundaries.
> > > > > >> If my understanding is correct, using the index of element
> > > instead
> > > > > of a
> > > > > >> pointer offset will pose a challenge for some of the corner
> > > cases.
> > > > > >>
> > > > > >> [1]
> > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > >>                /* avoid objects to cross page boundaries */
> > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > total_elt_sz)
> > > > > >> <
> > > > > >> 0) {
> > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > > pg_sz) -
> > > > > >> (va + off);
> > > > > >>                        if (flags &
> > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > >>                                off += total_elt_sz -
> > > > > >>                                        (((uintptr_t)(va +
> off -
> > > 1) %
> > > > > >>                                                total_elt_sz)
> +
> > > 1);
> > > > > >>                }
> > > > > >>
> > > > > >
> > > > > > OK. Alternatively to scaling the index with a cache line
> size,
> > > you
> > > > > can scale it with sizeof(uintptr_t) to be able to address 32 or
> 16
> > > GB
> > > > > mempools on respectively 64 bit and 32 bit architectures. Both
> x86
> > > and
> > > > > ARM CPUs have instructions to access memory with an added
> offset
> > > > > multiplied by 4 or 8. So that should be high performance.
> > > > >
> > > > > Yes, agreed this can be done.
> > > > > Cache line size can also be used when
> ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > is not enabled.
> > > > > On a side note, I wanted to better understand the need for
> having
> > > the
> > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > >
> > > > The description of this field is misleading, and should be
> corrected.
> > > > The correct description would be: Don't need to align objs on
> cache
> > > lines.
> > > >
> > > > It is useful for mempools containing very small objects, to
> conserve
> > > memory.
> > > I think we can assume that mbuf pools are created with the
> > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > > calculated with cache line size as the unit.
> >
> > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> Yes 😊
> 
> >
> > I agree. And since the flag is a hint only, it can be ignored if the
> mempool
> > library is scaling the index with the cache line size.
> I do not think we should ignore the flag for reason you mention below.
> 
> >
> > However, a mempool may contain other objects than mbufs, and those
> objects
> > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost
> a
> > lot of memory for such mempools.
> We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set, use
> the unit as 'sizeof(uintptr_t)', if not set use cache line size as the
> unit.
> 

That would require that the indexing multiplier is a runtime parameter instead of a compile time parameter. So it would have a performance penalty.

The indexing multiplier could be compile time configurable, so it is a tradeoff between granularity and maximum mempool size.
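
For reference, with a 32-bit index the maximum pool size is unit * 2^32,
so the tradeoff works out roughly as:

	unit =  4 B (sizeof(uintptr_t), 32-bit arch) -> max pool  16 GB
	unit =  8 B (sizeof(uintptr_t), 64-bit arch) -> max pool  32 GB
	unit = 64 B (RTE_CACHE_LINE_SIZE)            -> max pool 256 GB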
Honnappa Nagarahalli Nov. 8, 2021, 3:46 p.m. UTC | #20
<snip>
> >
> > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > based
> > > > > > >>>>>>>>> on
> > > > > > >>>>> pointer
> > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > >>>>>>>>> Replace
> > > > it
> > > > > > >>>>> with
> > > > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > > > >>>>>>>>> addressed
> > > > > > >>>>> by
> > > > > > >>>>>>>>> (pool address + index)
> > > > > > >>>>
> > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > >>>> critical and limited resource.
> > > > > > >>>>
> > > > > > >>>> DPDK has a tendency of using pointers where indexes could
> > be
> > > > used
> > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > >>>> flexibility
> > > > of
> > > > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > > > >>>> mbuf
> > > > > > >> pools.
> > > > > > >>>>
> > > > > > >>
> > > > > > >> Agreed, thank you!
> > > > > > >>
> > > > > > >>>>>>>>
> > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > > > >>>>>>>> it's
> > > > > > elem
> > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > >>>>>>> size
> > is
> > > > > > >>>>>>> more
> > > > > > >>>>> than 4GB?
> > > > > > >>>>>>
> > > > > > >>>>>> That is one possible scenario.
> > > > > > >>>>
> > > > > > >>>> That could be solved by making the index an element index
> > > > instead
> > > > > > of
> > > > > > >> a
> > > > > > >>>> pointer offset: address = (pool address + index * element
> > > > size).
> > > > > > >>>
> > > > > > >>> Or instead of scaling the index with the element size,
> > which
> > > > > > >>> is
> > > > > > only
> > > > > > >> known at runtime, the index could be more efficiently
> > > > > > >> scaled
> > by
> > > > a
> > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > that
> > > > would
> > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > >>>
> > > > > > >>
> > > > > > >> Looking at this snippet [1] from
> > > > rte_mempool_op_populate_helper(),
> > > > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > > > boundaries.
> > > > > > >> If my understanding is correct, using the index of element
> > > > instead
> > > > > > of a
> > > > > > >> pointer offset will pose a challenge for some of the corner
> > > > cases.
> > > > > > >>
> > > > > > >> [1]
> > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > >>                /* avoid objects to cross page boundaries */
> > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > total_elt_sz)
> > > > > > >> <
> > > > > > >> 0) {
> > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > > > pg_sz) -
> > > > > > >> (va + off);
> > > > > > >>                        if (flags &
> > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > >>                                off += total_elt_sz -
> > > > > > >>                                        (((uintptr_t)(va +
> > off -
> > > > 1) %
> > > > > > >>
> > > > > > >> total_elt_sz)
> > +
> > > > 1);
> > > > > > >>                }
> > > > > > >>
> > > > > > >
> > > > > > > OK. Alternatively to scaling the index with a cache line
> > size,
> > > > you
> > > > > > can scale it with sizeof(uintptr_t) to be able to address 32
> > > > > > or
> > 16
> > > > GB
> > > > > > mempools on respectively 64 bit and 32 bit architectures. Both
> > x86
> > > > and
> > > > > > ARM CPUs have instructions to access memory with an added
> > offset
> > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > >
> > > > > > Yes, agreed this can be done.
> > > > > > Cache line size can also be used when
> > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > is not enabled.
> > > > > > On a side note, I wanted to better understand the need for
> > having
> > > > the
> > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > >
> > > > > The description of this field is misleading, and should be
> > corrected.
> > > > > The correct description would be: Don't need to align objs on
> > cache
> > > > lines.
> > > > >
> > > > > It is useful for mempools containing very small objects, to
> > conserve
> > > > memory.
> > > > I think we can assume that mbuf pools are created with the
> > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > > > calculated with cache line size as the unit.
> > >
> > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > Yes 😊
> >
> > >
> > > I agree. And since the flag is a hint only, it can be ignored if the
> > mempool
> > > library is scaling the index with the cache line size.
> > I do not think we should ignore the flag for reason you mention below.
> >
> > >
> > > However, a mempool may contain other objects than mbufs, and those
> > objects
> > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> cost
> > a
> > > lot of memory for such mempools.
> > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > use the unit as 'sizeof(uintptr_t)', if not set use cache line size as
> > the unit.
> >
> 
> That would require that the indexing multiplier is a runtime parameter instead
> of a compile time parameter. So it would have a performance penalty.
> 
> The indexing multiplier could be compile time configurable, so it is a tradeoff
> between granularity and maximum mempool size.
I meant compile time configurable. i.e.

#ifdef MEMPOOL_F_NO_CACHE_ALIGN
<use sizeof(uintptr_t) as the multiplier>
#else
<use cache line size as the multiplier> /* This should provide enough memory for packet buffers */
#endif
Morten Brørup Nov. 8, 2021, 4:03 p.m. UTC | #21
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 8 November 2021 16.46
> 
> <snip>
> > >
> > > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > > based
> > > > > > > >>>>>>>>> on
> > > > > > > >>>>> pointer
> > > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > > >>>>>>>>> Replace
> > > > > it
> > > > > > > >>>>> with
> > > > > > > >>>>>>>>> index-based implementation, where in each buffer
> is
> > > > > > > >>>>>>>>> addressed
> > > > > > > >>>>> by
> > > > > > > >>>>>>>>> (pool address + index)
> > > > > > > >>>>
> > > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > > >>>> critical and limited resource.
> > > > > > > >>>>
> > > > > > > >>>> DPDK has a tendency of using pointers where indexes
> could
> > > be
> > > > > used
> > > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > > >>>> flexibility
> > > > > of
> > > > > > > >>>> mixing entries from different memory pools, e.g.
> multiple
> > > > > > > >>>> mbuf
> > > > > > > >> pools.
> > > > > > > >>>>
> > > > > > > >>
> > > > > > > >> Agreed, thank you!
> > > > > > > >>
> > > > > > > >>>>>>>>
> > > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > > >>>>>>>> On 64-bit systems difference between pool address
> and
> > > > > > > >>>>>>>> it's
> > > > > > > elem
> > > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > > >>>>>>> size
> > > is
> > > > > > > >>>>>>> more
> > > > > > > >>>>> than 4GB?
> > > > > > > >>>>>>
> > > > > > > >>>>>> That is one possible scenario.
> > > > > > > >>>>
> > > > > > > >>>> That could be solved by making the index an element
> index
> > > > > instead
> > > > > > > of
> > > > > > > >> a
> > > > > > > >>>> pointer offset: address = (pool address + index *
> element
> > > > > size).
> > > > > > > >>>
> > > > > > > >>> Or instead of scaling the index with the element size,
> > > which
> > > > > > > >>> is
> > > > > > > only
> > > > > > > >> known at runtime, the index could be more efficiently
> > > > > > > >> scaled
> > > by
> > > > > a
> > > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > > that
> > > > > would
> > > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > > >>>
> > > > > > > >>
> > > > > > > >> Looking at this snippet [1] from
> > > > > rte_mempool_op_populate_helper(),
> > > > > > > >> there is an ‘offset’ added to avoid objects to cross
> page
> > > > > > > boundaries.
> > > > > > > >> If my understanding is correct, using the index of
> element
> > > > > instead
> > > > > > > of a
> > > > > > > >> pointer offset will pose a challenge for some of the
> corner
> > > > > cases.
> > > > > > > >>
> > > > > > > >> [1]
> > > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > > >>                /* avoid objects to cross page boundaries
> */
> > > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > > total_elt_sz)
> > > > > > > >> <
> > > > > > > >> 0) {
> > > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va +
> off,
> > > > > pg_sz) -
> > > > > > > >> (va + off);
> > > > > > > >>                        if (flags &
> > > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > > >>                                off += total_elt_sz -
> > > > > > > >>                                        (((uintptr_t)(va
> +
> > > off -
> > > > > 1) %
> > > > > > > >>
> > > > > > > >> total_elt_sz)
> > > +
> > > > > 1);
> > > > > > > >>                }
> > > > > > > >>
> > > > > > > >
> > > > > > > > OK. Alternatively to scaling the index with a cache line
> > > size,
> > > > > you
> > > > > > > can scale it with sizeof(uintptr_t) to be able to address
> 32
> > > > > > > or
> > > 16
> > > > > GB
> > > > > > > mempools on respectively 64 bit and 32 bit architectures.
> Both
> > > x86
> > > > > and
> > > > > > > ARM CPUs have instructions to access memory with an added
> > > offset
> > > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > > >
> > > > > > > Yes, agreed this can be done.
> > > > > > > Cache line size can also be used when
> > > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > > is not enabled.
> > > > > > > On a side note, I wanted to better understand the need for
> > > having
> > > > > the
> > > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > > >
> > > > > > The description of this field is misleading, and should be
> > > corrected.
> > > > > > The correct description would be: Don't need to align objs on
> > > cache
> > > > > lines.
> > > > > >
> > > > > > It is useful for mempools containing very small objects, to
> > > conserve
> > > > > memory.
> > > > > I think we can assume that mbuf pools are created with the
> > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use
> offset
> > > > > calculated with cache line size as the unit.
> > > >
> > > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > > Yes 😊
> > >
> > > >
> > > > I agree. And since the flag is a hint only, it can be ignored if
> the
> > > mempool
> > > > library is scaling the index with the cache line size.
> > > I do not think we should ignore the flag for reason you mention
> below.
> > >
> > > >
> > > > However, a mempool may contain other objects than mbufs, and
> those
> > > objects
> > > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> > cost
> > > a
> > > > lot of memory for such mempools.
> > > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > > use the unit as 'sizeof(uintptr_t)', if not set use cache line size
> as
> > > the unit.
> > >
> >
> > That would require that the indexing multiplier is a runtime
> parameter instead
> > of a compile time parameter. So it would have a performance penalty.
> >
> > The indexing multiplier could be compile time configurable, so it is
> a tradeoff
> > between granularity and maximum mempool size.
> I meant compile time configurable. i.e.
> 
> #ifdef MEMPOOL_F_NO_CACHE_ALIGN
> <use sizeof(uintptr_t) as the multiplier>
> #else
> <use cache line size as the multiplier> /* This should provide enough
> memory for packet buffers */
> #endif

Please note that MEMPOOL_F_NO_CACHE_ALIGN is a runtime flag passed when creating a mempool, not a compile time option.
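
For example (parameter values are purely illustrative), the flag is passed
in the flags argument of rte_mempool_create() when the pool is created:

#include <rte_memory.h>
#include <rte_mempool.h>

static struct rte_mempool *
create_small_obj_pool(void)
{
	return rte_mempool_create("small_objs",
			4096,			/* number of elements */
			8,			/* tiny element size */
			256,			/* per-lcore cache size */
			0,			/* private data size */
			NULL, NULL,		/* no pool init callback */
			NULL, NULL,		/* no per-object init callback */
			SOCKET_ID_ANY,
			MEMPOOL_F_NO_CACHE_ALIGN); /* don't pad objs to cache lines */
}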
Jerin Jacob Nov. 8, 2021, 4:47 p.m. UTC | #22
On Mon, Nov 8, 2021 at 9:34 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> > Nagarahalli
> > Sent: Monday, 8 November 2021 16.46
> >
> > <snip>
> > > >
> > > > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > > > based
> > > > > > > > >>>>>>>>> on
> > > > > > > > >>>>> pointer
> > > > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > > > >>>>>>>>> Replace
> > > > > > it
> > > > > > > > >>>>> with
> > > > > > > > >>>>>>>>> index-based implementation, where in each buffer
> > is
> > > > > > > > >>>>>>>>> addressed
> > > > > > > > >>>>> by
> > > > > > > > >>>>>>>>> (pool address + index)
> > > > > > > > >>>>
> > > > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > > > >>>> critical and limited resource.
> > > > > > > > >>>>
> > > > > > > > >>>> DPDK has a tendency of using pointers where indexes
> > could
> > > > be
> > > > > > used
> > > > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > > > >>>> flexibility
> > > > > > of
> > > > > > > > >>>> mixing entries from different memory pools, e.g.
> > multiple
> > > > > > > > >>>> mbuf
> > > > > > > > >> pools.
> > > > > > > > >>>>
> > > > > > > > >>
> > > > > > > > >> Agreed, thank you!
> > > > > > > > >>
> > > > > > > > >>>>>>>>
> > > > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > > > >>>>>>>> On 64-bit systems difference between pool address
> > and
> > > > > > > > >>>>>>>> it's
> > > > > > > > elem
> > > > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > > > >>>>>>> size
> > > > is
> > > > > > > > >>>>>>> more
> > > > > > > > >>>>> than 4GB?
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> That is one possible scenario.
> > > > > > > > >>>>
> > > > > > > > >>>> That could be solved by making the index an element
> > index
> > > > > > instead
> > > > > > > > of
> > > > > > > > >> a
> > > > > > > > >>>> pointer offset: address = (pool address + index *
> > element
> > > > > > size).
> > > > > > > > >>>
> > > > > > > > >>> Or instead of scaling the index with the element size,
> > > > which
> > > > > > > > >>> is
> > > > > > > > only
> > > > > > > > >> known at runtime, the index could be more efficiently
> > > > > > > > >> scaled
> > > > by
> > > > > > a
> > > > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > > > that
> > > > > > would
> > > > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > > > >>>
> > > > > > > > >>
> > > > > > > > >> Looking at this snippet [1] from
> > > > > > rte_mempool_op_populate_helper(),
> > > > > > > > >> there is an ‘offset’ added to avoid objects to cross
> > page
> > > > > > > > boundaries.
> > > > > > > > >> If my understanding is correct, using the index of
> > element
> > > > > > instead
> > > > > > > > of a
> > > > > > > > >> pointer offset will pose a challenge for some of the
> > corner
> > > > > > cases.
> > > > > > > > >>
> > > > > > > > >> [1]
> > > > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > > > >>                /* avoid objects to cross page boundaries
> > */
> > > > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > > > total_elt_sz)
> > > > > > > > >> <
> > > > > > > > >> 0) {
> > > > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va +
> > off,
> > > > > > pg_sz) -
> > > > > > > > >> (va + off);
> > > > > > > > >>                        if (flags &
> > > > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > > > >>                                off += total_elt_sz -
> > > > > > > > >>                                        (((uintptr_t)(va
> > +
> > > > off -
> > > > > > 1) %
> > > > > > > > >>
> > > > > > > > >> total_elt_sz)
> > > > +
> > > > > > 1);
> > > > > > > > >>                }
> > > > > > > > >>
> > > > > > > > >
> > > > > > > > > OK. Alternatively to scaling the index with a cache line
> > > > size,
> > > > > > you
> > > > > > > > can scale it with sizeof(uintptr_t) to be able to address
> > 32
> > > > > > > > or
> > > > 16
> > > > > > GB
> > > > > > > > mempools on respectively 64 bit and 32 bit architectures.
> > Both
> > > > x86
> > > > > > and
> > > > > > > > ARM CPUs have instructions to access memory with an added
> > > > offset
> > > > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > > > >
> > > > > > > > Yes, agreed this can be done.
> > > > > > > > Cache line size can also be used when
> > > > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > > > is not enabled.
> > > > > > > > On a side note, I wanted to better understand the need for
> > > > having
> > > > > > the
> > > > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > > > >
> > > > > > > The description of this field is misleading, and should be
> > > > corrected.
> > > > > > > The correct description would be: Don't need to align objs on
> > > > cache
> > > > > > lines.
> > > > > > >
> > > > > > > It is useful for mempools containing very small objects, to
> > > > conserve
> > > > > > memory.
> > > > > > I think we can assume that mbuf pools are created with the
> > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use
> > offset
> > > > > > calculated with cache line size as the unit.
> > > > >
> > > > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > > > Yes 😊
> > > >
> > > > >
> > > > > I agree. And since the flag is a hint only, it can be ignored if
> > the
> > > > mempool
> > > > > library is scaling the index with the cache line size.
> > > > I do not think we should ignore the flag for reason you mention
> > below.
> > > >
> > > > >
> > > > > However, a mempool may contain other objects than mbufs, and
> > those
> > > > objects
> > > > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> > > cost
> > > > a
> > > > > lot of memory for such mempools.
> > > > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > > > use the unit as 'sizeof(uintptr_t)', if not set use cache line size
> > as
> > > > the unit.
> > > >
> > >
> > > That would require that the indexing multiplier is a runtime
> > parameter instead
> > > of a compile time parameter. So it would have a performance penalty.
> > >
> > > The indexing multiplier could be compile time configurable, so it is
> > a tradeoff
> > > between granularity and maximum mempool size.
> > I meant compile time configurable. i.e.
> >
> > #ifdef MEMPOOL_F_NO_CACHE_ALIGN
> > <use sizeof(uintptr_t) as the multiplier>
> > #else
> > <use cache line size as the multiplier> /* This should provide enough
> > memory for packet buffers */
> > #endif
>
> Please note that MEMPOOL_F_NO_CACHE_ALIGN is a runtime flag passed when creating a mempool, not a compile time option.

Also, please share PMU counter stats on L1 and L2 misses with and
without this scheme after the rework. IMO, we should not have any
regression in
1) per-core Mpps
OR
2) L1 and L2 misses
with l3fwd/testpmd/l2fwd etc.


>
>
diff mbox series

Patch

diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
index b1f09ff28f4d..e55913e47f21 100644
--- a/drivers/mempool/ring/rte_mempool_ring.c
+++ b/drivers/mempool/ring/rte_mempool_ring.c
@@ -101,7 +101,7 @@  ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
 		return -rte_errno;
 
 	mp->pool_data = r;
-
+	mp->local_cache_base_addr = &r[1];
 	return 0;
 }
 
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 59a588425bd6..424bdb19c323 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -480,6 +480,7 @@  rte_mempool_populate_default(struct rte_mempool *mp)
 	int ret;
 	bool need_iova_contig_obj;
 	size_t max_alloc_size = SIZE_MAX;
+	unsigned lcore_id;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -600,6 +601,13 @@  rte_mempool_populate_default(struct rte_mempool *mp)
 		}
 	}
 
+	/* Init all default caches. */
+	if (mp->cache_size != 0) {
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+			mp->local_cache[lcore_id].local_cache_base_value =
+				*(void **)mp->local_cache_base_addr;
+	}
+
 	rte_mempool_trace_populate_default(mp);
 	return mp->size;
 
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 4235d6f0bf2b..545405c0d3ce 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -51,6 +51,8 @@ 
 #include <rte_memcpy.h>
 #include <rte_common.h>
 
+#include <arm_neon.h>
+
 #include "rte_mempool_trace_fp.h"
 
 #ifdef __cplusplus
@@ -91,11 +93,12 @@  struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+	void *local_cache_base_value; /**< Base value to calculate indices */
 	/*
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
 } __rte_cache_aligned;
 
 /**
@@ -172,7 +175,6 @@  struct rte_mempool_objtlr {
  * A list of memory where objects are stored
  */
 STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
-
 /**
  * Callback used to free a memory chunk
  */
@@ -244,6 +246,7 @@  struct rte_mempool {
 	int32_t ops_index;
 
 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+	void *local_cache_base_addr; /**< Reference to the base value */
 
 	uint32_t populated_size;         /**< Number of populated objects. */
 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1269,7 +1272,15 @@  rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	if (cache == NULL || cache->len == 0)
 		return;
 	rte_mempool_trace_cache_flush(cache, mp);
-	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+
+	unsigned int i;
+	unsigned int cache_len = cache->len;
+	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+	void *base_value = cache->local_cache_base_value;
+	uint32_t *cache_objs = cache->objs;
+	for (i = 0; i < cache_len; i++)
+		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
 	cache->len = 0;
 }
 
@@ -1289,7 +1300,9 @@  static __rte_always_inline void
 __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 		      unsigned int n, struct rte_mempool_cache *cache)
 {
-	void **cache_objs;
+	uint32_t *cache_objs;
+	void *base_value;
+	uint32_t i;
 
 	/* increment stat now, adding in mempool always success */
 	__MEMPOOL_STAT_ADD(mp, put_bulk, 1);
@@ -1301,6 +1314,12 @@  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 	cache_objs = &cache->objs[cache->len];
 
+	base_value = cache->local_cache_base_value;
+
+	uint64x2_t v_obj_table;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+	uint32x2_t v_cache_objs;
+
 	/*
 	 * The cache follows the following algorithm
 	 *   1. Add the objects to the cache
@@ -1309,12 +1328,26 @@  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	 */
 
 	/* Add elements back into the cache */
-	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+
+#if defined __ARM_NEON
+	for (i = 0; i < (n & ~0x1); i+=2) {
+		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+		vst1_u32(cache_objs + i, v_cache_objs);
+	}
+	if (n & 0x1) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#else
+	for (i = 0; i < n; i++) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#endif
 
 	cache->len += n;
 
 	if (cache->len >= cache->flushthresh) {
-		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
+		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
 				cache->len - cache->size);
 		cache->len = cache->size;
 	}
@@ -1415,23 +1448,26 @@  __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
 		      unsigned int n, struct rte_mempool_cache *cache)
 {
 	int ret;
+	uint32_t i;
 	uint32_t index, len;
-	void **cache_objs;
+	uint32_t *cache_objs;
 
 	/* No cache provided or cannot be satisfied from cache */
 	if (unlikely(cache == NULL || n >= cache->size))
 		goto ring_dequeue;
 
+	void *base_value = cache->local_cache_base_value;
 	cache_objs = cache->objs;
 
 	/* Can this be satisfied from the cache? */
 	if (cache->len < n) {
 		/* No. Backfill the cache first, and then fill from it */
 		uint32_t req = n + (cache->size - cache->len);
+		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
 
 		/* How many do we require i.e. number to fill the cache + the request */
 		ret = rte_mempool_ops_dequeue_bulk(mp,
-			&cache->objs[cache->len], req);
+			temp_objs, req);
 		if (unlikely(ret < 0)) {
 			/*
 			 * In the off chance that we are buffer constrained,
@@ -1442,12 +1478,32 @@  __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
 			goto ring_dequeue;
 		}
 
+		len = cache->len;
+		for (i = 0; i < req; ++i, ++len) {
+			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
+		}
+
 		cache->len += req;
 	}
 
+	uint64x2_t v_obj_table;
+	uint64x2_t v_cache_objs;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
 	/* Now fill in the response ... */
+#if defined __ARM_NEON
+	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
+						len-=2, obj_table+=2) {
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)obj_table, v_obj_table);
+	}
+	if (n & 0x1)
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#else
 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
-		*obj_table = cache_objs[len];
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
 
 	cache->len -= n;