[RFC] mempool: implement index-based per core cache

Message ID 20210930172735.2675627-1-dharmik.thakkar@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series [RFC] mempool: implement index-based per core cache

Checks

Context               Check     Description
ci/checkpatch         warning   coding style issues
ci/Intel-compilation  warning   apply issues

Commit Message

Dharmik Thakkar Sept. 30, 2021, 5:27 p.m. UTC
  The current mempool per-core cache implementation is based on pointers.
For most architectures, each pointer consumes 64 bits.
Replace it with an index-based implementation, wherein each buffer
is addressed by (pool address + index).
This will reduce the memory requirements of the per-core caches.

L3Fwd performance testing reveals minor improvements in cache
performance and no change in throughput.

Micro-benchmarking the patch using mempool_perf_test shows
significant improvement in the majority of the test cases.

The future plan is to replace the global pool's pointer-based implementation with an index-based implementation as well.

Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
---
 drivers/mempool/ring/rte_mempool_ring.c |  2 +-
 lib/mempool/rte_mempool.c               |  8 +++
 lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
 3 files changed, 74 insertions(+), 10 deletions(-)
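
The core of the change is that each object in the per-lcore cache is stored as a 32-bit offset from a per-pool base address instead of as a full 64-bit pointer. A minimal sketch of the conversion, using DPDK's RTE_PTR_DIFF()/RTE_PTR_ADD() as the patch does (the helper names below are illustrative, not part of the patch):

#include <stdint.h>
#include <rte_common.h>	/* RTE_PTR_ADD, RTE_PTR_DIFF */

/* Store: pointer -> 32-bit offset relative to the pool's base address. */
static inline uint32_t
obj_to_index(void *obj, void *base_value)
{
	return (uint32_t)RTE_PTR_DIFF(obj, base_value);
}

/* Load: 32-bit offset -> pointer, by adding the base address back. */
static inline void *
index_to_obj(uint32_t idx, void *base_value)
{
	return RTE_PTR_ADD(base_value, idx);
}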
  

Comments

Jerin Jacob Oct. 1, 2021, 12:36 p.m. UTC | #1
On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
<dharmik.thakkar@arm.com> wrote:
>
> Current mempool per core cache implementation is based on pointer
> For most architectures, each pointer consumes 64b
> Replace it with index-based implementation, where in each buffer
> is addressed by (pool address + index)
> It will reduce memory requirements
>
> L3Fwd performance testing reveals minor improvements in the cache
> performance and no change in throughput
>
> Micro-benchmarking the patch using mempool_perf_test shows
> significant improvement with majority of the test cases
>
> Future plan involves replacing global pool's pointer-based implementation with index-based implementation
>
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>


Sane idea. Like VPP, we tried to do this for rte_graph, but did not
observe much gain.
Since the lcore cache is typically 512 entries, maybe there is a gain on the mempool path.
Also, since you are enabling this only for the local cache, it is good: the
mempool drivers can work as-is (i.e. HW drivers keep working with 64-bit pointers).
I think getting more performance numbers for various cases may be the
next step.
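
For reference, with the default RTE_MEMPOOL_CACHE_MAX_SIZE of 512 and 64-bit pointers, the per-lcore objs[] array shrinks roughly as follows (back-of-the-envelope numbers, not from the patch):

/* objs[] is dimensioned RTE_MEMPOOL_CACHE_MAX_SIZE * 3 = 1536 entries:
 *   pointer-based: 1536 * sizeof(void *)   = 12288 B (12 KiB) per lcore
 *   index-based:   1536 * sizeof(uint32_t) =  6144 B  (6 KiB) per lcore
 */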

> ---
>  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
>  lib/mempool/rte_mempool.c               |  8 +++
>  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
>  3 files changed, 74 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
> index b1f09ff28f4d..e55913e47f21 100644
> --- a/drivers/mempool/ring/rte_mempool_ring.c
> +++ b/drivers/mempool/ring/rte_mempool_ring.c
> @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
>                 return -rte_errno;
>
>         mp->pool_data = r;
> -
> +       mp->local_cache_base_addr = &r[1];
>         return 0;
>  }
>
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 59a588425bd6..424bdb19c323 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>         int ret;
>         bool need_iova_contig_obj;
>         size_t max_alloc_size = SIZE_MAX;
> +       unsigned lcore_id;
>
>         ret = mempool_ops_alloc_once(mp);
>         if (ret != 0)
> @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>                 }
>         }
>
> +       /* Init all default caches. */
> +       if (mp->cache_size != 0) {
> +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> +                       mp->local_cache[lcore_id].local_cache_base_value =
> +                               *(void **)mp->local_cache_base_addr;
> +       }
> +
>         rte_mempool_trace_populate_default(mp);
>         return mp->size;
>
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 4235d6f0bf2b..545405c0d3ce 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -51,6 +51,8 @@
>  #include <rte_memcpy.h>
>  #include <rte_common.h>
>
> +#include <arm_neon.h>
> +
>  #include "rte_mempool_trace_fp.h"
>
>  #ifdef __cplusplus
> @@ -91,11 +93,12 @@ struct rte_mempool_cache {
>         uint32_t size;        /**< Size of the cache */
>         uint32_t flushthresh; /**< Threshold before we flush excess elements */
>         uint32_t len;         /**< Current cache count */
> +       void *local_cache_base_value; /**< Base value to calculate indices */
>         /*
>          * Cache is allocated to this size to allow it to overflow in certain
>          * cases to avoid needless emptying of cache.
>          */
> -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>  } __rte_cache_aligned;
>
>  /**
> @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
>   * A list of memory where objects are stored
>   */
>  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> -
>  /**
>   * Callback used to free a memory chunk
>   */
> @@ -244,6 +246,7 @@ struct rte_mempool {
>         int32_t ops_index;
>
>         struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> +       void *local_cache_base_addr; /**< Reference to the base value */
>
>         uint32_t populated_size;         /**< Number of populated objects. */
>         struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
> @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>         if (cache == NULL || cache->len == 0)
>                 return;
>         rte_mempool_trace_cache_flush(cache, mp);
> -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> +
> +       unsigned int i;
> +       unsigned int cache_len = cache->len;
> +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> +       void *base_value = cache->local_cache_base_value;
> +       uint32_t *cache_objs = cache->objs;
> +       for (i = 0; i < cache_len; i++)
> +               obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
>         cache->len = 0;
>  }
>
> @@ -1289,7 +1300,9 @@ static __rte_always_inline void
>  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>                       unsigned int n, struct rte_mempool_cache *cache)
>  {
> -       void **cache_objs;
> +       uint32_t *cache_objs;
> +       void *base_value;
> +       uint32_t i;
>
>         /* increment stat now, adding in mempool always success */
>         __MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> @@ -1301,6 +1314,12 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>
>         cache_objs = &cache->objs[cache->len];
>
> +       base_value = cache->local_cache_base_value;
> +
> +       uint64x2_t v_obj_table;
> +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +       uint32x2_t v_cache_objs;
> +
>         /*
>          * The cache follows the following algorithm
>          *   1. Add the objects to the cache
> @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>          */
>
>         /* Add elements back into the cache */
> -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> +
> +#if defined __ARM_NEON
> +       for (i = 0; i < (n & ~0x1); i+=2) {
> +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
> +               vst1_u32(cache_objs + i, v_cache_objs);
> +       }
> +       if (n & 0x1) {
> +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +       }
> +#else
> +       for (i = 0; i < n; i++) {
> +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +       }
> +#endif
>
>         cache->len += n;
>
>         if (cache->len >= cache->flushthresh) {
> -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> +               rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
>                                 cache->len - cache->size);
>                 cache->len = cache->size;
>         }
> @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>                       unsigned int n, struct rte_mempool_cache *cache)
>  {
>         int ret;
> +       uint32_t i;
>         uint32_t index, len;
> -       void **cache_objs;
> +       uint32_t *cache_objs;
>
>         /* No cache provided or cannot be satisfied from cache */
>         if (unlikely(cache == NULL || n >= cache->size))
>                 goto ring_dequeue;
>
> +       void *base_value = cache->local_cache_base_value;
>         cache_objs = cache->objs;
>
>         /* Can this be satisfied from the cache? */
>         if (cache->len < n) {
>                 /* No. Backfill the cache first, and then fill from it */
>                 uint32_t req = n + (cache->size - cache->len);
> +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>
>                 /* How many do we require i.e. number to fill the cache + the request */
>                 ret = rte_mempool_ops_dequeue_bulk(mp,
> -                       &cache->objs[cache->len], req);
> +                       temp_objs, req);
>                 if (unlikely(ret < 0)) {
>                         /*
>                          * In the off chance that we are buffer constrained,
> @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>                         goto ring_dequeue;
>                 }
>
> +               len = cache->len;
> +               for (i = 0; i < req; ++i, ++len) {
> +                       cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
> +               }
> +
>                 cache->len += req;
>         }
>
> +       uint64x2_t v_obj_table;
> +       uint64x2_t v_cache_objs;
> +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +
>         /* Now fill in the response ... */
> +#if defined __ARM_NEON
> +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> +                                               len-=2, obj_table+=2) {
> +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> +       }
> +       if (n & 0x1)
> +               *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#else
>         for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> -               *obj_table = cache_objs[len];
> +               *obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#endif
>
>         cache->len -= n;
>
> --
> 2.17.1
>
  
Honnappa Nagarahalli Oct. 1, 2021, 3:44 p.m. UTC | #2
<snip>

> 
> On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> <dharmik.thakkar@arm.com> wrote:
> >
> > Current mempool per core cache implementation is based on pointer For
> > most architectures, each pointer consumes 64b Replace it with
> > index-based implementation, where in each buffer is addressed by (pool
> > address + index) It will reduce memory requirements
> >
> > L3Fwd performance testing reveals minor improvements in the cache
> > performance and no change in throughput
> >
> > Micro-benchmarking the patch using mempool_perf_test shows significant
> > improvement with majority of the test cases
> >
> > Future plan involves replacing global pool's pointer-based
> > implementation with index-based implementation
> >
> > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> 
> 
> Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> gain.
> Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> Also, Since you are enabling only for local cache, it is good as mempool
> drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> performance numbers for various cases may be the next step.
The gain is not observed in terms of PPS improvement, but we do see some improvements that the PMUs indicate. This approach definitely results in savings in the number of cache lines utilized.
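
To put a rough number on the cache-line point (assuming 64-byte cache lines):

/* 64 / sizeof(void *)   =  8 cached objects per cache line (pointer-based)
 * 64 / sizeof(uint32_t) = 16 cached objects per cache line (index-based)
 * so walking the lcore cache touches roughly half as many cache lines.
 */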

> 
> > ---
> >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> >  lib/mempool/rte_mempool.c               |  8 +++
> >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> >  3 files changed, 74 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > b/drivers/mempool/ring/rte_mempool_ring.c
> > index b1f09ff28f4d..e55913e47f21 100644
> > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> rg_flags)
> >                 return -rte_errno;
> >
> >         mp->pool_data = r;
> > -
> > +       mp->local_cache_base_addr = &r[1];
> >         return 0;
> >  }
> >
> > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > index 59a588425bd6..424bdb19c323 100644
> > --- a/lib/mempool/rte_mempool.c
> > +++ b/lib/mempool/rte_mempool.c
> > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
> >         int ret;
> >         bool need_iova_contig_obj;
> >         size_t max_alloc_size = SIZE_MAX;
> > +       unsigned lcore_id;
> >
> >         ret = mempool_ops_alloc_once(mp);
> >         if (ret != 0)
> > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
> >                 }
> >         }
> >
> > +       /* Init all default caches. */
> > +       if (mp->cache_size != 0) {
> > +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > +                       mp->local_cache[lcore_id].local_cache_base_value =
> > +                               *(void **)mp->local_cache_base_addr;
> > +       }
> > +
> >         rte_mempool_trace_populate_default(mp);
> >         return mp->size;
> >
> > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > index 4235d6f0bf2b..545405c0d3ce 100644
> > --- a/lib/mempool/rte_mempool.h
> > +++ b/lib/mempool/rte_mempool.h
> > @@ -51,6 +51,8 @@
> >  #include <rte_memcpy.h>
> >  #include <rte_common.h>
> >
> > +#include <arm_neon.h>
> > +
> >  #include "rte_mempool_trace_fp.h"
> >
> >  #ifdef __cplusplus
> > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> >         uint32_t size;        /**< Size of the cache */
> >         uint32_t flushthresh; /**< Threshold before we flush excess elements
> */
> >         uint32_t len;         /**< Current cache count */
> > +       void *local_cache_base_value; /**< Base value to calculate
> > + indices */
> >         /*
> >          * Cache is allocated to this size to allow it to overflow in certain
> >          * cases to avoid needless emptying of cache.
> >          */
> > -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects
> */
> > +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > + objects */
> >  } __rte_cache_aligned;
> >
> >  /**
> > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> >   * A list of memory where objects are stored
> >   */
> >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > -
> >  /**
> >   * Callback used to free a memory chunk
> >   */
> > @@ -244,6 +246,7 @@ struct rte_mempool {
> >         int32_t ops_index;
> >
> >         struct rte_mempool_cache *local_cache; /**< Per-lcore local
> > cache */
> > +       void *local_cache_base_addr; /**< Reference to the base value
> > + */
> >
> >         uint32_t populated_size;         /**< Number of populated objects. */
> >         struct rte_mempool_objhdr_list elt_list; /**< List of objects
> > in pool */ @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct
> rte_mempool_cache *cache,
> >         if (cache == NULL || cache->len == 0)
> >                 return;
> >         rte_mempool_trace_cache_flush(cache, mp);
> > -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > +
> > +       unsigned int i;
> > +       unsigned int cache_len = cache->len;
> > +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > +       void *base_value = cache->local_cache_base_value;
> > +       uint32_t *cache_objs = cache->objs;
> > +       for (i = 0; i < cache_len; i++)
> > +               obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> > +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> >         cache->len = 0;
> >  }
> >
> > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> >                       unsigned int n, struct rte_mempool_cache *cache)
> > {
> > -       void **cache_objs;
> > +       uint32_t *cache_objs;
> > +       void *base_value;
> > +       uint32_t i;
> >
> >         /* increment stat now, adding in mempool always success */
> >         __MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12 @@
> > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> >
> >         cache_objs = &cache->objs[cache->len];
> >
> > +       base_value = cache->local_cache_base_value;
> > +
> > +       uint64x2_t v_obj_table;
> > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > +       uint32x2_t v_cache_objs;
> > +
> >         /*
> >          * The cache follows the following algorithm
> >          *   1. Add the objects to the cache
> > @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool
> *mp, void * const *obj_table,
> >          */
> >
> >         /* Add elements back into the cache */
> > -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> > +
> > +#if defined __ARM_NEON
> > +       for (i = 0; i < (n & ~0x1); i+=2) {
> > +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> > +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table,
> v_base_value));
> > +               vst1_u32(cache_objs + i, v_cache_objs);
> > +       }
> > +       if (n & 0x1) {
> > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> > +       }
> > +#else
> > +       for (i = 0; i < n; i++) {
> > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> > +       }
> > +#endif
> >
> >         cache->len += n;
> >
> >         if (cache->len >= cache->flushthresh) {
> > -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> > +               rte_mempool_ops_enqueue_bulk(mp, obj_table +
> > + cache->len - cache->size,
> >                                 cache->len - cache->size);
> >                 cache->len = cache->size;
> >         }
> > @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool
> *mp, void **obj_table,
> >                       unsigned int n, struct rte_mempool_cache *cache)
> > {
> >         int ret;
> > +       uint32_t i;
> >         uint32_t index, len;
> > -       void **cache_objs;
> > +       uint32_t *cache_objs;
> >
> >         /* No cache provided or cannot be satisfied from cache */
> >         if (unlikely(cache == NULL || n >= cache->size))
> >                 goto ring_dequeue;
> >
> > +       void *base_value = cache->local_cache_base_value;
> >         cache_objs = cache->objs;
> >
> >         /* Can this be satisfied from the cache? */
> >         if (cache->len < n) {
> >                 /* No. Backfill the cache first, and then fill from it */
> >                 uint32_t req = n + (cache->size - cache->len);
> > +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**<
> > + Cache objects */
> >
> >                 /* How many do we require i.e. number to fill the cache + the
> request */
> >                 ret = rte_mempool_ops_dequeue_bulk(mp,
> > -                       &cache->objs[cache->len], req);
> > +                       temp_objs, req);
> >                 if (unlikely(ret < 0)) {
> >                         /*
> >                          * In the off chance that we are buffer
> > constrained, @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct
> rte_mempool *mp, void **obj_table,
> >                         goto ring_dequeue;
> >                 }
> >
> > +               len = cache->len;
> > +               for (i = 0; i < req; ++i, ++len) {
> > +                       cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
> base_value);
> > +               }
> > +
> >                 cache->len += req;
> >         }
> >
> > +       uint64x2_t v_obj_table;
> > +       uint64x2_t v_cache_objs;
> > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > +
> >         /* Now fill in the response ... */
> > +#if defined __ARM_NEON
> > +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> > +                                               len-=2, obj_table+=2) {
> > +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> > +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> > +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> > +       }
> > +       if (n & 0x1)
> > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > +cache_objs[len]); #else
> >         for (index = 0, len = cache->len - 1; index < n; ++index, len--,
> obj_table++)
> > -               *obj_table = cache_objs[len];
> > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > +cache_objs[len]); #endif
> >
> >         cache->len -= n;
> >
> > --
> > 2.17.1
> >
  
Jerin Jacob Oct. 1, 2021, 5:32 p.m. UTC | #3
On Fri, Oct 1, 2021 at 9:14 PM Honnappa Nagarahalli
<Honnappa.Nagarahalli@arm.com> wrote:
>
> <snip>
>
> >
> > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > <dharmik.thakkar@arm.com> wrote:
> > >
> > > Current mempool per core cache implementation is based on pointer For
> > > most architectures, each pointer consumes 64b Replace it with
> > > index-based implementation, where in each buffer is addressed by (pool
> > > address + index) It will reduce memory requirements
> > >
> > > L3Fwd performance testing reveals minor improvements in the cache
> > > performance and no change in throughput
> > >
> > > Micro-benchmarking the patch using mempool_perf_test shows significant
> > > improvement with majority of the test cases
> > >
> > > Future plan involves replacing global pool's pointer-based
> > > implementation with index-based implementation
> > >
> > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> >
> >
> > Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> > gain.
> > Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> > Also, Since you are enabling only for local cache, it is good as mempool
> > drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> > performance numbers for various cases may be the next step.
> The gain is not observed in terms of PPS improvement, but do see some improvements that PMUs indicate. This approach definitely results in savings in number of cache lines utilized.

OK. IMO, if PPS has a regression then this path is not viable; otherwise it may be OK.


>
> >
> > > ---
> > >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> > >  lib/mempool/rte_mempool.c               |  8 +++
> > >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> > >  3 files changed, 74 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > > b/drivers/mempool/ring/rte_mempool_ring.c
> > > index b1f09ff28f4d..e55913e47f21 100644
> > > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> > rg_flags)
> > >                 return -rte_errno;
> > >
> > >         mp->pool_data = r;
> > > -
> > > +       mp->local_cache_base_addr = &r[1];
> > >         return 0;
> > >  }
> > >
> > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > > index 59a588425bd6..424bdb19c323 100644
> > > --- a/lib/mempool/rte_mempool.c
> > > +++ b/lib/mempool/rte_mempool.c
> > > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> > >         int ret;
> > >         bool need_iova_contig_obj;
> > >         size_t max_alloc_size = SIZE_MAX;
> > > +       unsigned lcore_id;
> > >
> > >         ret = mempool_ops_alloc_once(mp);
> > >         if (ret != 0)
> > > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> > >                 }
> > >         }
> > >
> > > +       /* Init all default caches. */
> > > +       if (mp->cache_size != 0) {
> > > +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > > +                       mp->local_cache[lcore_id].local_cache_base_value =
> > > +                               *(void **)mp->local_cache_base_addr;
> > > +       }
> > > +
> > >         rte_mempool_trace_populate_default(mp);
> > >         return mp->size;
> > >
> > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > index 4235d6f0bf2b..545405c0d3ce 100644
> > > --- a/lib/mempool/rte_mempool.h
> > > +++ b/lib/mempool/rte_mempool.h
> > > @@ -51,6 +51,8 @@
> > >  #include <rte_memcpy.h>
> > >  #include <rte_common.h>
> > >
> > > +#include <arm_neon.h>
> > > +
> > >  #include "rte_mempool_trace_fp.h"
> > >
> > >  #ifdef __cplusplus
> > > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> > >         uint32_t size;        /**< Size of the cache */
> > >         uint32_t flushthresh; /**< Threshold before we flush excess elements
> > */
> > >         uint32_t len;         /**< Current cache count */
> > > +       void *local_cache_base_value; /**< Base value to calculate
> > > + indices */
> > >         /*
> > >          * Cache is allocated to this size to allow it to overflow in certain
> > >          * cases to avoid needless emptying of cache.
> > >          */
> > > -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects
> > */
> > > +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > > + objects */
> > >  } __rte_cache_aligned;
> > >
> > >  /**
> > > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> > >   * A list of memory where objects are stored
> > >   */
> > >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > > -
> > >  /**
> > >   * Callback used to free a memory chunk
> > >   */
> > > @@ -244,6 +246,7 @@ struct rte_mempool {
> > >         int32_t ops_index;
> > >
> > >         struct rte_mempool_cache *local_cache; /**< Per-lcore local
> > > cache */
> > > +       void *local_cache_base_addr; /**< Reference to the base value
> > > + */
> > >
> > >         uint32_t populated_size;         /**< Number of populated objects. */
> > >         struct rte_mempool_objhdr_list elt_list; /**< List of objects
> > > in pool */ @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct
> > rte_mempool_cache *cache,
> > >         if (cache == NULL || cache->len == 0)
> > >                 return;
> > >         rte_mempool_trace_cache_flush(cache, mp);
> > > -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > > +
> > > +       unsigned int i;
> > > +       unsigned int cache_len = cache->len;
> > > +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > +       void *base_value = cache->local_cache_base_value;
> > > +       uint32_t *cache_objs = cache->objs;
> > > +       for (i = 0; i < cache_len; i++)
> > > +               obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> > > +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> > >         cache->len = 0;
> > >  }
> > >
> > > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> > >                       unsigned int n, struct rte_mempool_cache *cache)
> > > {
> > > -       void **cache_objs;
> > > +       uint32_t *cache_objs;
> > > +       void *base_value;
> > > +       uint32_t i;
> > >
> > >         /* increment stat now, adding in mempool always success */
> > >         __MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12 @@
> > > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> > >
> > >         cache_objs = &cache->objs[cache->len];
> > >
> > > +       base_value = cache->local_cache_base_value;
> > > +
> > > +       uint64x2_t v_obj_table;
> > > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > +       uint32x2_t v_cache_objs;
> > > +
> > >         /*
> > >          * The cache follows the following algorithm
> > >          *   1. Add the objects to the cache
> > > @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool
> > *mp, void * const *obj_table,
> > >          */
> > >
> > >         /* Add elements back into the cache */
> > > -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> > > +
> > > +#if defined __ARM_NEON
> > > +       for (i = 0; i < (n & ~0x1); i+=2) {
> > > +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> > > +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table,
> > v_base_value));
> > > +               vst1_u32(cache_objs + i, v_cache_objs);
> > > +       }
> > > +       if (n & 0x1) {
> > > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> > > +       }
> > > +#else
> > > +       for (i = 0; i < n; i++) {
> > > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> > > +       }
> > > +#endif
> > >
> > >         cache->len += n;
> > >
> > >         if (cache->len >= cache->flushthresh) {
> > > -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> > > +               rte_mempool_ops_enqueue_bulk(mp, obj_table +
> > > + cache->len - cache->size,
> > >                                 cache->len - cache->size);
> > >                 cache->len = cache->size;
> > >         }
> > > @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool
> > *mp, void **obj_table,
> > >                       unsigned int n, struct rte_mempool_cache *cache)
> > > {
> > >         int ret;
> > > +       uint32_t i;
> > >         uint32_t index, len;
> > > -       void **cache_objs;
> > > +       uint32_t *cache_objs;
> > >
> > >         /* No cache provided or cannot be satisfied from cache */
> > >         if (unlikely(cache == NULL || n >= cache->size))
> > >                 goto ring_dequeue;
> > >
> > > +       void *base_value = cache->local_cache_base_value;
> > >         cache_objs = cache->objs;
> > >
> > >         /* Can this be satisfied from the cache? */
> > >         if (cache->len < n) {
> > >                 /* No. Backfill the cache first, and then fill from it */
> > >                 uint32_t req = n + (cache->size - cache->len);
> > > +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**<
> > > + Cache objects */
> > >
> > >                 /* How many do we require i.e. number to fill the cache + the
> > request */
> > >                 ret = rte_mempool_ops_dequeue_bulk(mp,
> > > -                       &cache->objs[cache->len], req);
> > > +                       temp_objs, req);
> > >                 if (unlikely(ret < 0)) {
> > >                         /*
> > >                          * In the off chance that we are buffer
> > > constrained, @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct
> > rte_mempool *mp, void **obj_table,
> > >                         goto ring_dequeue;
> > >                 }
> > >
> > > +               len = cache->len;
> > > +               for (i = 0; i < req; ++i, ++len) {
> > > +                       cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i],
> > base_value);
> > > +               }
> > > +
> > >                 cache->len += req;
> > >         }
> > >
> > > +       uint64x2_t v_obj_table;
> > > +       uint64x2_t v_cache_objs;
> > > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > +
> > >         /* Now fill in the response ... */
> > > +#if defined __ARM_NEON
> > > +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> > > +                                               len-=2, obj_table+=2) {
> > > +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> > > +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> > > +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> > > +       }
> > > +       if (n & 0x1)
> > > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > > +cache_objs[len]); #else
> > >         for (index = 0, len = cache->len - 1; index < n; ++index, len--,
> > obj_table++)
> > > -               *obj_table = cache_objs[len];
> > > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > > +cache_objs[len]); #endif
> > >
> > >         cache->len -= n;
> > >
> > > --
> > > 2.17.1
> > >
  
Honnappa Nagarahalli Oct. 1, 2021, 5:57 p.m. UTC | #4
<snip>
> >
> > >
> > > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > > <dharmik.thakkar@arm.com> wrote:
> > > >
> > > > Current mempool per core cache implementation is based on pointer
> > > > For most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by
> > > > (pool address + index) It will reduce memory requirements
> > > >
> > > > L3Fwd performance testing reveals minor improvements in the cache
> > > > performance and no change in throughput
> > > >
> > > > Micro-benchmarking the patch using mempool_perf_test shows
> > > > significant improvement with majority of the test cases
> > > >
> > > > Future plan involves replacing global pool's pointer-based
> > > > implementation with index-based implementation
> > > >
> > > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > >
> > >
> > > Sane idea. Like VPP, we tried to do this for rte_graph, but not
> > > observed much gain.
> > > Since lcore cache is typically 512, maybe there is a gain on the mempool
> path.
> > > Also, Since you are enabling only for local cache, it is good as
> > > mempool drivers can work as-is.(i.e HW drivers works with 64bit) I
> > > think, getting more performance numbers for various cases may be the
> next step.
> > The gain is not observed in terms of PPS improvement, but do see some
> improvements that PMUs indicate. This approach definitely results in savings
> in number of cache lines utilized.
> 
> OK. IMO, If PPS has regression then this path is not viable, else it may be OK.
PPS has not regressed. It has improved, but not significantly.
Another way to look at this is that we are doing the same work with fewer resources.

> 
> 
> >
> > >
> > > > ---
> > > >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> > > >  lib/mempool/rte_mempool.c               |  8 +++
> > > >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> > > >  3 files changed, 74 insertions(+), 10 deletions(-)
> > > >
> > > > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > > > b/drivers/mempool/ring/rte_mempool_ring.c
> > > > index b1f09ff28f4d..e55913e47f21 100644
> > > > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > > > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > > > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> > > rg_flags)
> > > >                 return -rte_errno;
> > > >
> > > >         mp->pool_data = r;
> > > > -
> > > > +       mp->local_cache_base_addr = &r[1];
> > > >         return 0;
> > > >  }
> > > >
> > > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > > > index 59a588425bd6..424bdb19c323 100644
> > > > --- a/lib/mempool/rte_mempool.c
> > > > +++ b/lib/mempool/rte_mempool.c
> > > > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > > >         int ret;
> > > >         bool need_iova_contig_obj;
> > > >         size_t max_alloc_size = SIZE_MAX;
> > > > +       unsigned lcore_id;
> > > >
> > > >         ret = mempool_ops_alloc_once(mp);
> > > >         if (ret != 0)
> > > > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > > >                 }
> > > >         }
> > > >
> > > > +       /* Init all default caches. */
> > > > +       if (mp->cache_size != 0) {
> > > > +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > > > +                       mp->local_cache[lcore_id].local_cache_base_value =
> > > > +                               *(void **)mp->local_cache_base_addr;
> > > > +       }
> > > > +
> > > >         rte_mempool_trace_populate_default(mp);
> > > >         return mp->size;
> > > >
> > > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > > index 4235d6f0bf2b..545405c0d3ce 100644
> > > > --- a/lib/mempool/rte_mempool.h
> > > > +++ b/lib/mempool/rte_mempool.h
> > > > @@ -51,6 +51,8 @@
> > > >  #include <rte_memcpy.h>
> > > >  #include <rte_common.h>
> > > >
> > > > +#include <arm_neon.h>
> > > > +
> > > >  #include "rte_mempool_trace_fp.h"
> > > >
> > > >  #ifdef __cplusplus
> > > > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> > > >         uint32_t size;        /**< Size of the cache */
> > > >         uint32_t flushthresh; /**< Threshold before we flush
> > > > excess elements
> > > */
> > > >         uint32_t len;         /**< Current cache count */
> > > > +       void *local_cache_base_value; /**< Base value to calculate
> > > > + indices */
> > > >         /*
> > > >          * Cache is allocated to this size to allow it to overflow in certain
> > > >          * cases to avoid needless emptying of cache.
> > > >          */
> > > > -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> objects
> > > */
> > > > +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > > > + objects */
> > > >  } __rte_cache_aligned;
> > > >
> > > >  /**
> > > > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> > > >   * A list of memory where objects are stored
> > > >   */
> > > >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > > > -
> > > >  /**
> > > >   * Callback used to free a memory chunk
> > > >   */
> > > > @@ -244,6 +246,7 @@ struct rte_mempool {
> > > >         int32_t ops_index;
> > > >
> > > >         struct rte_mempool_cache *local_cache; /**< Per-lcore
> > > > local cache */
> > > > +       void *local_cache_base_addr; /**< Reference to the base
> > > > + value */
> > > >
> > > >         uint32_t populated_size;         /**< Number of populated objects. */
> > > >         struct rte_mempool_objhdr_list elt_list; /**< List of
> > > > objects in pool */ @@ -1269,7 +1272,15 @@
> > > > rte_mempool_cache_flush(struct
> > > rte_mempool_cache *cache,
> > > >         if (cache == NULL || cache->len == 0)
> > > >                 return;
> > > >         rte_mempool_trace_cache_flush(cache, mp);
> > > > -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > > > +
> > > > +       unsigned int i;
> > > > +       unsigned int cache_len = cache->len;
> > > > +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > > +       void *base_value = cache->local_cache_base_value;
> > > > +       uint32_t *cache_objs = cache->objs;
> > > > +       for (i = 0; i < cache_len; i++)
> > > > +               obj_table[i] = (void *) RTE_PTR_ADD(base_value,
> cache_objs[i]);
> > > > +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> > > >         cache->len = 0;
> > > >  }
> > > >
> > > > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > > > __mempool_generic_put(struct rte_mempool *mp, void * const
> *obj_table,
> > > >                       unsigned int n, struct rte_mempool_cache
> > > > *cache) {
> > > > -       void **cache_objs;
> > > > +       uint32_t *cache_objs;
> > > > +       void *base_value;
> > > > +       uint32_t i;
> > > >
> > > >         /* increment stat now, adding in mempool always success */
> > > >         __MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12
> > > > @@ __mempool_generic_put(struct rte_mempool *mp, void * const
> > > > *obj_table,
> > > >
> > > >         cache_objs = &cache->objs[cache->len];
> > > >
> > > > +       base_value = cache->local_cache_base_value;
> > > > +
> > > > +       uint64x2_t v_obj_table;
> > > > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > > +       uint32x2_t v_cache_objs;
> > > > +
> > > >         /*
> > > >          * The cache follows the following algorithm
> > > >          *   1. Add the objects to the cache
> > > > @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct
> rte_mempool
> > > *mp, void * const *obj_table,
> > > >          */
> > > >
> > > >         /* Add elements back into the cache */
> > > > -       rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> > > > +
> > > > +#if defined __ARM_NEON
> > > > +       for (i = 0; i < (n & ~0x1); i+=2) {
> > > > +               v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> > > > +               v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table,
> > > v_base_value));
> > > > +               vst1_u32(cache_objs + i, v_cache_objs);
> > > > +       }
> > > > +       if (n & 0x1) {
> > > > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i],
> base_value);
> > > > +       }
> > > > +#else
> > > > +       for (i = 0; i < n; i++) {
> > > > +               cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i],
> base_value);
> > > > +       }
> > > > +#endif
> > > >
> > > >         cache->len += n;
> > > >
> > > >         if (cache->len >= cache->flushthresh) {
> > > > -               rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache-
> >size],
> > > > +               rte_mempool_ops_enqueue_bulk(mp, obj_table +
> > > > + cache->len - cache->size,
> > > >                                 cache->len - cache->size);
> > > >                 cache->len = cache->size;
> > > >         }
> > > > @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct
> rte_mempool
> > > *mp, void **obj_table,
> > > >                       unsigned int n, struct rte_mempool_cache
> > > > *cache) {
> > > >         int ret;
> > > > +       uint32_t i;
> > > >         uint32_t index, len;
> > > > -       void **cache_objs;
> > > > +       uint32_t *cache_objs;
> > > >
> > > >         /* No cache provided or cannot be satisfied from cache */
> > > >         if (unlikely(cache == NULL || n >= cache->size))
> > > >                 goto ring_dequeue;
> > > >
> > > > +       void *base_value = cache->local_cache_base_value;
> > > >         cache_objs = cache->objs;
> > > >
> > > >         /* Can this be satisfied from the cache? */
> > > >         if (cache->len < n) {
> > > >                 /* No. Backfill the cache first, and then fill from it */
> > > >                 uint32_t req = n + (cache->size - cache->len);
> > > > +               void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > > + /**< Cache objects */
> > > >
> > > >                 /* How many do we require i.e. number to fill the
> > > > cache + the
> > > request */
> > > >                 ret = rte_mempool_ops_dequeue_bulk(mp,
> > > > -                       &cache->objs[cache->len], req);
> > > > +                       temp_objs, req);
> > > >                 if (unlikely(ret < 0)) {
> > > >                         /*
> > > >                          * In the off chance that we are buffer
> > > > constrained, @@ -1442,12 +1478,32 @@
> __mempool_generic_get(struct
> > > rte_mempool *mp, void **obj_table,
> > > >                         goto ring_dequeue;
> > > >                 }
> > > >
> > > > +               len = cache->len;
> > > > +               for (i = 0; i < req; ++i, ++len) {
> > > > +                       cache_objs[len] = (uint32_t)
> > > > + RTE_PTR_DIFF(temp_objs[i],
> > > base_value);
> > > > +               }
> > > > +
> > > >                 cache->len += req;
> > > >         }
> > > >
> > > > +       uint64x2_t v_obj_table;
> > > > +       uint64x2_t v_cache_objs;
> > > > +       uint64x2_t v_base_value =
> > > > + vdupq_n_u64((uint64_t)base_value);
> > > > +
> > > >         /* Now fill in the response ... */
> > > > +#if defined __ARM_NEON
> > > > +       for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> > > > +                                               len-=2, obj_table+=2) {
> > > > +               v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> > > > +               v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> > > > +               vst1q_u64((uint64_t *)obj_table, v_obj_table);
> > > > +       }
> > > > +       if (n & 0x1)
> > > > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > > > +cache_objs[len]); #else
> > > >         for (index = 0, len = cache->len - 1; index < n; ++index,
> > > > len--,
> > > obj_table++)
> > > > -               *obj_table = cache_objs[len];
> > > > +               *obj_table = (void *) RTE_PTR_ADD(base_value,
> > > > +cache_objs[len]); #endif
> > > >
> > > >         cache->len -= n;
> > > >
> > > > --
> > > > 2.17.1
> > > >
  
Jerin Jacob Oct. 1, 2021, 6:21 p.m. UTC | #5
On Fri, Oct 1, 2021 at 11:02 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
>
> On Fri, Oct 1, 2021 at 9:14 PM Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com> wrote:
> >
> > <snip>
> >
> > >
> > > On Thu, Sep 30, 2021 at 10:57 PM Dharmik Thakkar
> > > <dharmik.thakkar@arm.com> wrote:
> > > >
> > > > Current mempool per core cache implementation is based on pointer For
> > > > most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by (pool
> > > > address + index) It will reduce memory requirements
> > > >
> > > > L3Fwd performance testing reveals minor improvements in the cache
> > > > performance and no change in throughput
> > > >
> > > > Micro-benchmarking the patch using mempool_perf_test shows significant
> > > > improvement with majority of the test cases
> > > >
> > > > Future plan involves replacing global pool's pointer-based
> > > > implementation with index-based implementation
> > > >
> > > > Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> > >
> > >
> > > Sane idea. Like VPP, we tried to do this for rte_graph, but not observed much
> > > gain.
> > > Since lcore cache is typically 512, maybe there is a gain on the mempool path.
> > > Also, Since you are enabling only for local cache, it is good as mempool
> > > drivers can work as-is.(i.e HW drivers works with 64bit) I think, getting more
> > > performance numbers for various cases may be the next step.
> > The gain is not observed in terms of PPS improvement, but do see some improvements that PMUs indicate. This approach definitely results in savings in number of cache lines utilized.
>
> OK. IMO, If PPS has regression then this path is not viable, else it may be OK.

Looks good then.

>
>
> >
> > >
> > > > ---
> > > >  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
> > > >  lib/mempool/rte_mempool.c               |  8 +++
> > > >  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
> > > >  3 files changed, 74 insertions(+), 10 deletions(-)
> > > >
> > > > diff --git a/drivers/mempool/ring/rte_mempool_ring.c
> > > > b/drivers/mempool/ring/rte_mempool_ring.c
> > > > index b1f09ff28f4d..e55913e47f21 100644
> > > > --- a/drivers/mempool/ring/rte_mempool_ring.c
> > > > +++ b/drivers/mempool/ring/rte_mempool_ring.c
> > > > @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t
> > > rg_flags)
> > > >                 return -rte_errno;
> > > >
> > > >         mp->pool_data = r;
> > > > -
> > > > +       mp->local_cache_base_addr = &r[1];
> > > >         return 0;
> > > >  }
> > > >
> > > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > > > index 59a588425bd6..424bdb19c323 100644
> > > > --- a/lib/mempool/rte_mempool.c
> > > > +++ b/lib/mempool/rte_mempool.c
> > > > @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > > >         int ret;
> > > >         bool need_iova_contig_obj;
> > > >         size_t max_alloc_size = SIZE_MAX;
> > > > +       unsigned lcore_id;
> > > >
> > > >         ret = mempool_ops_alloc_once(mp);
> > > >         if (ret != 0)
> > > > @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > > >                 }
> > > >         }
> > > >
> > > > +       /* Init all default caches. */
> > > > +       if (mp->cache_size != 0) {
> > > > +               for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> > > > +                       mp->local_cache[lcore_id].local_cache_base_value =
> > > > +                               *(void **)mp->local_cache_base_addr;
> > > > +       }
> > > > +
> > > >         rte_mempool_trace_populate_default(mp);
> > > >         return mp->size;
> > > >
> > > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > > index 4235d6f0bf2b..545405c0d3ce 100644
> > > > --- a/lib/mempool/rte_mempool.h
> > > > +++ b/lib/mempool/rte_mempool.h
> > > > @@ -51,6 +51,8 @@
> > > >  #include <rte_memcpy.h>
> > > >  #include <rte_common.h>
> > > >
> > > > +#include <arm_neon.h>
> > > > +
> > > >  #include "rte_mempool_trace_fp.h"
> > > >
> > > >  #ifdef __cplusplus
> > > > @@ -91,11 +93,12 @@ struct rte_mempool_cache {
> > > >         uint32_t size;        /**< Size of the cache */
> > > >         uint32_t flushthresh; /**< Threshold before we flush excess elements
> > > */
> > > >         uint32_t len;         /**< Current cache count */
> > > > +       void *local_cache_base_value; /**< Base value to calculate
> > > > + indices */
> > > >         /*
> > > >          * Cache is allocated to this size to allow it to overflow in certain
> > > >          * cases to avoid needless emptying of cache.
> > > >          */
> > > > -       void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects
> > > */
> > > > +       uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache
> > > > + objects */
> > > >  } __rte_cache_aligned;
> > > >
> > > >  /**
> > > > @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
> > > >   * A list of memory where objects are stored
> > > >   */
> > > >  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> > > > -
> > > >  /**
> > > >   * Callback used to free a memory chunk
> > > >   */
> > > > @@ -244,6 +246,7 @@ struct rte_mempool {
> > > >         int32_t ops_index;
> > > >
> > > >         struct rte_mempool_cache *local_cache; /**< Per-lcore local
> > > > cache */
> > > > +       void *local_cache_base_addr; /**< Reference to the base value
> > > > + */
> > > >
> > > >         uint32_t populated_size;         /**< Number of populated objects. */
> > > >         struct rte_mempool_objhdr_list elt_list; /**< List of objects
> > > > in pool */ @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct
> > > rte_mempool_cache *cache,
> > > >         if (cache == NULL || cache->len == 0)
> > > >                 return;
> > > >         rte_mempool_trace_cache_flush(cache, mp);
> > > > -       rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> > > > +
> > > > +       unsigned int i;
> > > > +       unsigned int cache_len = cache->len;
> > > > +       void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> > > > +       void *base_value = cache->local_cache_base_value;
> > > > +       uint32_t *cache_objs = cache->objs;
> > > > +       for (i = 0; i < cache_len; i++)
> > > > +               obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> > > > +       rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
> > > >         cache->len = 0;
> > > >  }
> > > >
> > > > @@ -1289,7 +1300,9 @@ static __rte_always_inline void
> > > > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> > > >                       unsigned int n, struct rte_mempool_cache *cache)
> > > > {
> > > > -       void **cache_objs;
> > > > +       uint32_t *cache_objs;
> > > > +       void *base_value;
> > > > +       uint32_t i;
> > > >
> > > >         /* increment stat now, adding in mempool always success */
> > > >         __MEMPOOL_STAT_ADD(mp, put_bulk, 1); @@ -1301,6 +1314,12 @@
> > > > __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> > > >
> > > >         cache_objs = &cache->objs[cache->len];
> > > >
> > > > +       base_value = cache->local_cache_base_value;
> > > > +
> > > > +       uint64x2_t v_obj_table;
> > > > +       uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> > > > +       uint32x2_t v_cache_objs;
> > > > +
> > > >         /*
> > > >          * The cache follows the following algorithm
> > > >          *   1. Add the objects to the cache
> > > > <snip>
  
Ananyev, Konstantin Oct. 1, 2021, 9:30 p.m. UTC | #6
> Current mempool per core cache implementation is based on pointer
> For most architectures, each pointer consumes 64b
> Replace it with index-based implementation, where in each buffer
> is addressed by (pool address + index)

I don't think it is going to work:
On 64-bit systems the difference between the pool address and an element's
address could be bigger than 4GB.
 
> It will reduce memory requirements
> 
> L3Fwd performance testing reveals minor improvements in the cache
> performance and no change in throughput
> 
> Micro-benchmarking the patch using mempool_perf_test shows
> significant improvement with majority of the test cases
> 
> Future plan involves replacing global pool's pointer-based implementation with index-based implementation
> 
> Signed-off-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
> ---
>  drivers/mempool/ring/rte_mempool_ring.c |  2 +-
>  lib/mempool/rte_mempool.c               |  8 +++
>  lib/mempool/rte_mempool.h               | 74 ++++++++++++++++++++++---
>  3 files changed, 74 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
> index b1f09ff28f4d..e55913e47f21 100644
> --- a/drivers/mempool/ring/rte_mempool_ring.c
> +++ b/drivers/mempool/ring/rte_mempool_ring.c
> @@ -101,7 +101,7 @@ ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
>  		return -rte_errno;
> 
>  	mp->pool_data = r;
> -
> +	mp->local_cache_base_addr = &r[1];
>  	return 0;
>  }
> 
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 59a588425bd6..424bdb19c323 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -480,6 +480,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>  	int ret;
>  	bool need_iova_contig_obj;
>  	size_t max_alloc_size = SIZE_MAX;
> +	unsigned lcore_id;
> 
>  	ret = mempool_ops_alloc_once(mp);
>  	if (ret != 0)
> @@ -600,6 +601,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>  		}
>  	}
> 
> +	/* Init all default caches. */
> +	if (mp->cache_size != 0) {
> +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> +			mp->local_cache[lcore_id].local_cache_base_value =
> +				*(void **)mp->local_cache_base_addr;
> +	}
> +
>  	rte_mempool_trace_populate_default(mp);
>  	return mp->size;
> 
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 4235d6f0bf2b..545405c0d3ce 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -51,6 +51,8 @@
>  #include <rte_memcpy.h>
>  #include <rte_common.h>
> 
> +#include <arm_neon.h>
> +
>  #include "rte_mempool_trace_fp.h"
> 
>  #ifdef __cplusplus
> @@ -91,11 +93,12 @@ struct rte_mempool_cache {
>  	uint32_t size;	      /**< Size of the cache */
>  	uint32_t flushthresh; /**< Threshold before we flush excess elements */
>  	uint32_t len;	      /**< Current cache count */
> +	void *local_cache_base_value; /**< Base value to calculate indices */
>  	/*
>  	 * Cache is allocated to this size to allow it to overflow in certain
>  	 * cases to avoid needless emptying of cache.
>  	 */
> -	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> +	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
>  } __rte_cache_aligned;
> 
>  /**
> @@ -172,7 +175,6 @@ struct rte_mempool_objtlr {
>   * A list of memory where objects are stored
>   */
>  STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
> -
>  /**
>   * Callback used to free a memory chunk
>   */
> @@ -244,6 +246,7 @@ struct rte_mempool {
>  	int32_t ops_index;
> 
>  	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
> +	void *local_cache_base_addr; /**< Reference to the base value */
> 
>  	uint32_t populated_size;         /**< Number of populated objects. */
>  	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
> @@ -1269,7 +1272,15 @@ rte_mempool_cache_flush(struct rte_mempool_cache *cache,
>  	if (cache == NULL || cache->len == 0)
>  		return;
>  	rte_mempool_trace_cache_flush(cache, mp);
> -	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
> +
> +	unsigned int i;
> +	unsigned int cache_len = cache->len;
> +	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
> +	void *base_value = cache->local_cache_base_value;
> +	uint32_t *cache_objs = cache->objs;
> +	for (i = 0; i < cache_len; i++)
> +		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
> +	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
>  	cache->len = 0;
>  }
> 
> @@ -1289,7 +1300,9 @@ static __rte_always_inline void
>  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  		      unsigned int n, struct rte_mempool_cache *cache)
>  {
> -	void **cache_objs;
> +	uint32_t *cache_objs;
> +	void *base_value;
> +	uint32_t i;
> 
>  	/* increment stat now, adding in mempool always success */
>  	__MEMPOOL_STAT_ADD(mp, put_bulk, 1);
> @@ -1301,6 +1314,12 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
> 
>  	cache_objs = &cache->objs[cache->len];
> 
> +	base_value = cache->local_cache_base_value;
> +
> +	uint64x2_t v_obj_table;
> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +	uint32x2_t v_cache_objs;
> +
>  	/*
>  	 * The cache follows the following algorithm
>  	 *   1. Add the objects to the cache
> @@ -1309,12 +1328,26 @@ __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
>  	 */
> 
>  	/* Add elements back into the cache */
> -	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
> +
> +#if defined __ARM_NEON
> +	for (i = 0; i < (n & ~0x1); i+=2) {
> +		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
> +		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
> +		vst1_u32(cache_objs + i, v_cache_objs);
> +	}
> +	if (n & 0x1) {
> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +	}
> +#else
> +	for (i = 0; i < n; i++) {
> +		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
> +	}
> +#endif
> 
>  	cache->len += n;
> 
>  	if (cache->len >= cache->flushthresh) {
> -		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
> +		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
>  				cache->len - cache->size);
>  		cache->len = cache->size;
>  	}
> @@ -1415,23 +1448,26 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>  		      unsigned int n, struct rte_mempool_cache *cache)
>  {
>  	int ret;
> +	uint32_t i;
>  	uint32_t index, len;
> -	void **cache_objs;
> +	uint32_t *cache_objs;
> 
>  	/* No cache provided or cannot be satisfied from cache */
>  	if (unlikely(cache == NULL || n >= cache->size))
>  		goto ring_dequeue;
> 
> +	void *base_value = cache->local_cache_base_value;
>  	cache_objs = cache->objs;
> 
>  	/* Can this be satisfied from the cache? */
>  	if (cache->len < n) {
>  		/* No. Backfill the cache first, and then fill from it */
>  		uint32_t req = n + (cache->size - cache->len);
> +		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
> 
>  		/* How many do we require i.e. number to fill the cache + the request */
>  		ret = rte_mempool_ops_dequeue_bulk(mp,
> -			&cache->objs[cache->len], req);
> +			temp_objs, req);
>  		if (unlikely(ret < 0)) {
>  			/*
>  			 * In the off chance that we are buffer constrained,
> @@ -1442,12 +1478,32 @@ __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
>  			goto ring_dequeue;
>  		}
> 
> +		len = cache->len;
> +		for (i = 0; i < req; ++i, ++len) {
> +			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
> +		}
> +
>  		cache->len += req;
>  	}
> 
> +	uint64x2_t v_obj_table;
> +	uint64x2_t v_cache_objs;
> +	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
> +
>  	/* Now fill in the response ... */
> +#if defined __ARM_NEON
> +	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
> +						len-=2, obj_table+=2) {
> +		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
> +		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
> +		vst1q_u64((uint64_t *)obj_table, v_obj_table);
> +	}
> +	if (n & 0x1)
> +		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#else
>  	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
> -		*obj_table = cache_objs[len];
> +		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
> +#endif
> 
>  	cache->len -= n;
> 
> --
> 2.17.1
  
Honnappa Nagarahalli Oct. 2, 2021, 12:07 a.m. UTC | #7
<snip>
> 
> > Current mempool per core cache implementation is based on pointer For
> > most architectures, each pointer consumes 64b Replace it with
> > index-based implementation, where in each buffer is addressed by (pool
> > address + index)
> 
> I don't think it is going to work:
> On 64-bit systems difference between pool address and it's elem address
> could be bigger than 4GB.
Are you talking about a case where the memory pool size is more than 4GB?

<snip>
  
Ananyev, Konstantin Oct. 2, 2021, 6:51 p.m. UTC | #8
> > > Current mempool per core cache implementation is based on pointer For
> > > most architectures, each pointer consumes 64b Replace it with
> > > index-based implementation, where in each buffer is addressed by (pool
> > > address + index)
> >
> > I don't think it is going to work:
> > On 64-bit systems difference between pool address and it's elem address
> > could be bigger than 4GB.
> Are you talking about a case where the memory pool size is more than 4GB?

That is one possible scenario.
Another possibility - the user populates the mempool himself with some external
memory by calling rte_mempool_populate_iova() directly.
I suppose such a situation can occur even with a normal rte_mempool_create(),
though it should be a really rare one.

<snip>
  
Honnappa Nagarahalli Oct. 4, 2021, 4:36 p.m. UTC | #9
<snip>
> 
> 
> > > > Current mempool per core cache implementation is based on pointer
> > > > For most architectures, each pointer consumes 64b Replace it with
> > > > index-based implementation, where in each buffer is addressed by
> > > > (pool address + index)
> > >
> > > I don't think it is going to work:
> > > On 64-bit systems difference between pool address and it's elem
> > > address could be bigger than 4GB.
> > Are you talking about a case where the memory pool size is more than 4GB?
> 
> That is one possible scenario.
> Another possibility - user populates mempool himself with some external
> memory by calling rte_mempool_populate_iova() directly.
Is the concern that IOVA might not be contiguous for all the memory used by the mempool?

> I suppose such situation can even occur even with normal
> rte_mempool_create(), though it should be a really rare one.
All in all, this feature needs to be configurable at compile time.
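E.g. something along these lines (only a sketch; the RTE_MEMPOOL_INDEX_CACHE
name below is a placeholder, not a proposal, and the struct mirrors the one
touched by the patch):

    struct rte_mempool_cache {
            uint32_t size;        /* Size of the cache */
            uint32_t flushthresh; /* Threshold before we flush excess elements */
            uint32_t len;         /* Current cache count */
    #ifdef RTE_MEMPOOL_INDEX_CACHE
            void *local_cache_base_value;                  /* base for indices */
            uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /* object indices   */
    #else
            void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];    /* object pointers  */
    #endif
    } __rte_cache_aligned;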

> 
> >
> > >
<snip>
  
Morten Brørup Oct. 30, 2021, 10:23 a.m. UTC | #10
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 4 October 2021 18.36
> 
> <snip>
> >
> >
> > > > > Current mempool per core cache implementation is based on
> pointer
> > > > > For most architectures, each pointer consumes 64b Replace it
> with
> > > > > index-based implementation, where in each buffer is addressed
> by
> > > > > (pool address + index)

I like Dharmik's suggestion very much. CPU cache is a critical and limited resource.

DPDK has a tendency of using pointers where indexes could be used instead. I suppose pointers provide the additional flexibility of mixing entries from different memory pools, e.g. multiple mbuf pools.

> > > >
> > > > I don't think it is going to work:
> > > > On 64-bit systems difference between pool address and it's elem
> > > > address could be bigger than 4GB.
> > > Are you talking about a case where the memory pool size is more
> than 4GB?
> >
> > That is one possible scenario.

That could be solved by making the index an element index instead of a pointer offset: address = (pool address + index * element size).
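Something like this (only a sketch; the helper names and the elt_stride
parameter are mine, not from the patch - elt_stride being whatever per-object
stride the pool was populated with):

    #include <stdint.h>
    #include <rte_common.h>

    /* index -> address: up to 2^32 objects, regardless of total pool size */
    static inline void *
    idx2obj(void *pool_base, uint32_t idx, size_t elt_stride)
    {
            return RTE_PTR_ADD(pool_base, (size_t)idx * elt_stride);
    }

    /* address -> index: the division is the extra cost on the put path */
    static inline uint32_t
    obj2idx(void *pool_base, const void *obj, size_t elt_stride)
    {
            return (uint32_t)(RTE_PTR_DIFF(obj, pool_base) / elt_stride);
    }

It does assume that all objects sit at a constant stride from the pool base,
though.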

> > Another possibility - user populates mempool himself with some
> external
> > memory by calling rte_mempool_populate_iova() directly.
> Is the concern that IOVA might not be contiguous for all the memory
> used by the mempool?
> 
> > I suppose such situation can even occur even with normal
> > rte_mempool_create(), though it should be a really rare one.
> All in all, this feature needs to be configurable during compile time.
  
Morten Brørup Oct. 31, 2021, 8:14 a.m. UTC | #11
> From: Morten Brørup
> Sent: Saturday, 30 October 2021 12.24
> 
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> > Nagarahalli
> > Sent: Monday, 4 October 2021 18.36
> >
> > <snip>
> > >
> > >
> > > > > > Current mempool per core cache implementation is based on
> > pointer
> > > > > > For most architectures, each pointer consumes 64b Replace it
> > with
> > > > > > index-based implementation, where in each buffer is addressed
> > by
> > > > > > (pool address + index)
> 
> I like Dharmik's suggestion very much. CPU cache is a critical and
> limited resource.
> 
> DPDK has a tendency of using pointers where indexes could be used
> instead. I suppose pointers provide the additional flexibility of
> mixing entries from different memory pools, e.g. multiple mbuf pools.
> 
> > > > >
> > > > > I don't think it is going to work:
> > > > > On 64-bit systems difference between pool address and it's elem
> > > > > address could be bigger than 4GB.
> > > > Are you talking about a case where the memory pool size is more
> > than 4GB?
> > >
> > > That is one possible scenario.
> 
> That could be solved by making the index an element index instead of a
> pointer offset: address = (pool address + index * element size).

Or instead of scaling the index with the element size, which is only known at runtime, the index could be more efficiently scaled by a compile-time constant such as RTE_MEMPOOL_ALIGN (= RTE_CACHE_LINE_SIZE). With a cache line size of 64 bytes, that would allow indexing into mempools up to 256 GB in size.
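Roughly (a sketch, assuming every object starts on a cache line boundary;
RTE_CACHE_LINE_SIZE is a power-of-two compile-time constant, so the compiler
turns the multiply/divide into shifts):

    #include <stdint.h>
    #include <rte_common.h>

    static inline void *
    idx2obj_cl(void *pool_base, uint32_t idx)
    {
            return RTE_PTR_ADD(pool_base, (uint64_t)idx * RTE_CACHE_LINE_SIZE);
    }

    static inline uint32_t
    obj2idx_cl(void *pool_base, const void *obj)
    {
            return (uint32_t)(RTE_PTR_DIFF(obj, pool_base) / RTE_CACHE_LINE_SIZE);
    }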

> 
> > > Another possibility - user populates mempool himself with some
> > external
> > > memory by calling rte_mempool_populate_iova() directly.
> > Is the concern that IOVA might not be contiguous for all the memory
> > used by the mempool?
> >
> > > I suppose such situation can even occur even with normal
> > > rte_mempool_create(), though it should be a really rare one.
> > All in all, this feature needs to be configurable during compile
> time.
  
Dharmik Thakkar Nov. 3, 2021, 3:12 p.m. UTC | #12
Hi,

Thank you everyone for the comments! I am currently working on making the global pool ring’s implementation index-based.
Once done, I will send a patch for community review. I will also make it a compile-time option.

> On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com> wrote:
> 
>> From: Morten Brørup
>> Sent: Saturday, 30 October 2021 12.24
>> 
>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
>>> Nagarahalli
>>> Sent: Monday, 4 October 2021 18.36
>>> 
>>> <snip>
>>>> 
>>>> 
>>>>>>> Current mempool per core cache implementation is based on
>>> pointer
>>>>>>> For most architectures, each pointer consumes 64b Replace it
>>> with
>>>>>>> index-based implementation, where in each buffer is addressed
>>> by
>>>>>>> (pool address + index)
>> 
>> I like Dharmik's suggestion very much. CPU cache is a critical and
>> limited resource.
>> 
>> DPDK has a tendency of using pointers where indexes could be used
>> instead. I suppose pointers provide the additional flexibility of
>> mixing entries from different memory pools, e.g. multiple mbuf pools.
>> 

Agreed, thank you!

>>>>>> 
>>>>>> I don't think it is going to work:
>>>>>> On 64-bit systems difference between pool address and it's elem
>>>>>> address could be bigger than 4GB.
>>>>> Are you talking about a case where the memory pool size is more
>>> than 4GB?
>>>> 
>>>> That is one possible scenario.
>> 
>> That could be solved by making the index an element index instead of a
>> pointer offset: address = (pool address + index * element size).
> 
> Or instead of scaling the index with the element size, which is only known at runtime, the index could be more efficiently scaled by a compile time constant such as RTE_MEMPOOL_ALIGN (= RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would allow indexing into mempools up to 256 GB in size.
> 

Looking at this snippet [1] from rte_mempool_op_populate_helper(), an ‘offset’ is added to avoid objects crossing page boundaries. If my understanding is correct, using the element index instead of a pointer offset will pose a challenge for such corner cases, because an object that has been shifted past a page boundary no longer sits at (base address + index * element size).

[1]
        for (i = 0; i < max_objs; i++) {                                           
                /* avoid objects to cross page boundaries */
                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
                                off += total_elt_sz -
                                        (((uintptr_t)(va + off - 1) %
                                                total_elt_sz) + 1);
                }

>> 
>>>> Another possibility - user populates mempool himself with some
>>> external
>>>> memory by calling rte_mempool_populate_iova() directly.
>>> Is the concern that IOVA might not be contiguous for all the memory
>>> used by the mempool?
>>> 
>>>> I suppose such situation can even occur even with normal
>>>> rte_mempool_create(), though it should be a really rare one.
>>> All in all, this feature needs to be configurable during compile
>> time.
>
  
Morten Brørup Nov. 3, 2021, 3:52 p.m. UTC | #13
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> Sent: Wednesday, 3 November 2021 16.13
> 
> Hi,
> 
> Thank you everyone for the comments! I am currently working on making
> the global pool ring’s implementation as index based.
> Once done, I will send a patch for community review. I will also make
> it as a compile time option.

Sounds good to me.

This could probably be abstracted to other libraries too. E.g. the ring library holds pointers to objects (void *); an alternative ring library could hold indexes to objects (uint32_t). A ring often holds objects from the same mempool, and the application knows which mempool, so indexing would be useful here too.

> 
> > On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com>
> wrote:
> >
> >> From: Morten Brørup
> >> Sent: Saturday, 30 October 2021 12.24
> >>
> >>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> >>> Nagarahalli
> >>> Sent: Monday, 4 October 2021 18.36
> >>>
> >>> <snip>
> >>>>
> >>>>
> >>>>>>> Current mempool per core cache implementation is based on
> >>> pointer
> >>>>>>> For most architectures, each pointer consumes 64b Replace it
> >>> with
> >>>>>>> index-based implementation, where in each buffer is addressed
> >>> by
> >>>>>>> (pool address + index)
> >>
> >> I like Dharmik's suggestion very much. CPU cache is a critical and
> >> limited resource.
> >>
> >> DPDK has a tendency of using pointers where indexes could be used
> >> instead. I suppose pointers provide the additional flexibility of
> >> mixing entries from different memory pools, e.g. multiple mbuf
> pools.
> >>
> 
> Agreed, thank you!
> 
> >>>>>>
> >>>>>> I don't think it is going to work:
> >>>>>> On 64-bit systems difference between pool address and it's elem
> >>>>>> address could be bigger than 4GB.
> >>>>> Are you talking about a case where the memory pool size is more
> >>> than 4GB?
> >>>>
> >>>> That is one possible scenario.
> >>
> >> That could be solved by making the index an element index instead of
> a
> >> pointer offset: address = (pool address + index * element size).
> >
> > Or instead of scaling the index with the element size, which is only
> known at runtime, the index could be more efficiently scaled by a
> compile time constant such as RTE_MEMPOOL_ALIGN (=
> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> allow indexing into mempools up to 256 GB in size.
> >
> 
> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> there is an ‘offset’ added to avoid objects to cross page boundaries.
> If my understanding is correct, using the index of element instead of a
> pointer offset will pose a challenge for some of the corner cases.
> 
> [1]
>         for (i = 0; i < max_objs; i++) {
>                 /* avoid objects to cross page boundaries */
>                 if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
> 0) {
>                         off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> (va + off);
>                         if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
>                                 off += total_elt_sz -
>                                         (((uintptr_t)(va + off - 1) %
>                                                 total_elt_sz) + 1);
>                 }
> 

OK. As an alternative to scaling the index with a cache line size, you can scale it with sizeof(uintptr_t), to be able to address 32 GB or 16 GB mempools on 64-bit and 32-bit architectures, respectively. Both x86 and ARM CPUs have instructions to access memory with an added offset multiplied by 4 or 8, so that should be high performance.

> >>
> >>>> Another possibility - user populates mempool himself with some
> >>> external
> >>>> memory by calling rte_mempool_populate_iova() directly.
> >>> Is the concern that IOVA might not be contiguous for all the memory
> >>> used by the mempool?
> >>>
> >>>> I suppose such situation can even occur even with normal
> >>>> rte_mempool_create(), though it should be a really rare one.
> >>> All in all, this feature needs to be configurable during compile
> >> time.
> >
  
Dharmik Thakkar Nov. 4, 2021, 4:42 a.m. UTC | #14
> On Nov 3, 2021, at 10:52 AM, Morten Brørup <mb@smartsharesystems.com> wrote:
> 
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
>> Sent: Wednesday, 3 November 2021 16.13
>> 
>> Hi,
>> 
>> Thank you everyone for the comments! I am currently working on making
>> the global pool ring’s implementation as index based.
>> Once done, I will send a patch for community review. I will also make
>> it as a compile time option.
> 
> Sounds good to me.
> 
> This could probably be abstracted to other libraries too. E.g. the ring library holds pointers to objects (void *); an alternative ring library could hold indexes to objects (uint32_t). A ring often holds objects from the same mempool, and the application knows which mempool, so indexing would be useful here too.
> 

Yes, the ring library within DPDK has APIs to support a configurable element size.
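E.g. something like this (just a sketch; ring name, size and flags picked
arbitrarily):

    #include <stdint.h>
    #include <rte_ring_elem.h>
    #include <rte_lcore.h>

    /* A ring whose slots are 4-byte object indexes instead of pointers. */
    static struct rte_ring *
    make_index_ring(void)
    {
            return rte_ring_create_elem("idx_ring", sizeof(uint32_t), 1024,
                            rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ);
    }

    static unsigned int
    put_indexes(struct rte_ring *r, const uint32_t *idx, unsigned int n)
    {
            /* esize must match the value used at ring creation time */
            return rte_ring_enqueue_bulk_elem(r, idx, sizeof(uint32_t), n, NULL);
    }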

>> 
>>> On Oct 31, 2021, at 3:14 AM, Morten Brørup <mb@smartsharesystems.com>
>> wrote:
>>> 
>>>> From: Morten Brørup
>>>> Sent: Saturday, 30 October 2021 12.24
>>>> 
>>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
>>>>> Nagarahalli
>>>>> Sent: Monday, 4 October 2021 18.36
>>>>> 
>>>>> <snip>
>>>>>> 
>>>>>> 
>>>>>>>>> Current mempool per core cache implementation is based on
>>>>> pointer
>>>>>>>>> For most architectures, each pointer consumes 64b Replace it
>>>>> with
>>>>>>>>> index-based implementation, where in each buffer is addressed
>>>>> by
>>>>>>>>> (pool address + index)
>>>> 
>>>> I like Dharmik's suggestion very much. CPU cache is a critical and
>>>> limited resource.
>>>> 
>>>> DPDK has a tendency of using pointers where indexes could be used
>>>> instead. I suppose pointers provide the additional flexibility of
>>>> mixing entries from different memory pools, e.g. multiple mbuf
>> pools.
>>>> 
>> 
>> Agreed, thank you!
>> 
>>>>>>>> 
>>>>>>>> I don't think it is going to work:
>>>>>>>> On 64-bit systems difference between pool address and it's elem
>>>>>>>> address could be bigger than 4GB.
>>>>>>> Are you talking about a case where the memory pool size is more
>>>>> than 4GB?
>>>>>> 
>>>>>> That is one possible scenario.
>>>> 
>>>> That could be solved by making the index an element index instead of
>> a
>>>> pointer offset: address = (pool address + index * element size).
>>> 
>>> Or instead of scaling the index with the element size, which is only
>> known at runtime, the index could be more efficiently scaled by a
>> compile time constant such as RTE_MEMPOOL_ALIGN (=
>> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
>> allow indexing into mempools up to 256 GB in size.
>>> 
>> 
>> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
>> there is an ‘offset’ added to avoid objects to cross page boundaries.
>> If my understanding is correct, using the index of element instead of a
>> pointer offset will pose a challenge for some of the corner cases.
>> 
>> [1]
>>        for (i = 0; i < max_objs; i++) {
>>                /* avoid objects to cross page boundaries */
>>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
>> 0) {
>>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
>> (va + off);
>>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
>>                                off += total_elt_sz -
>>                                        (((uintptr_t)(va + off - 1) %
>>                                                total_elt_sz) + 1);
>>                }
>> 
> 
> OK. Alternatively to scaling the index with a cache line size, you can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB mempools on respectively 64 bit and 32 bit architectures. Both x86 and ARM CPUs have instructions to access memory with an added offset multiplied by 4 or 8. So that should be high performance.

Yes, agreed this can be done.
Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is not enabled.
On a side note, I wanted to better understand the need for having the 'MEMPOOL_F_NO_CACHE_ALIGN' option.

> 
>>>> 
>>>>>> Another possibility - user populates mempool himself with some
>>>>> external
>>>>>> memory by calling rte_mempool_populate_iova() directly.
>>>>> Is the concern that IOVA might not be contiguous for all the memory
>>>>> used by the mempool?
>>>>> 
>>>>>> I suppose such situation can even occur even with normal
>>>>>> rte_mempool_create(), though it should be a really rare one.
>>>>> All in all, this feature needs to be configurable during compile
>>>> time.
>>> 
>
  
Morten Brørup Nov. 4, 2021, 8:04 a.m. UTC | #15
+ Ring library maintainers (@Honnappa and @Konstantin) for my rants about its documentation.

> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> Sent: Thursday, 4 November 2021 05.42
> 
> > On Nov 3, 2021, at 10:52 AM, Morten Brørup <mb@smartsharesystems.com>
> wrote:
> >
> >> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Dharmik Thakkar
> >> Sent: Wednesday, 3 November 2021 16.13
> >>
> >> Hi,
> >>
> >> Thank you everyone for the comments! I am currently working on
> making
> >> the global pool ring’s implementation as index based.
> >> Once done, I will send a patch for community review. I will also
> make
> >> it as a compile time option.
> >
> > Sounds good to me.
> >
> > This could probably be abstracted to other libraries too. E.g. the
> ring library holds pointers to objects (void *); an alternative ring
> library could hold indexes to objects (uint32_t). A ring often holds
> objects from the same mempool, and the application knows which mempool,
> so indexing would be useful here too.
> >
> 
> Yes, ring library within DPDK has the APIs to support configurable
> element size

I remember seeing that feature proposed on the mailing list too, but I couldn't find it in the API documentation, so I was not sure it was ever accepted.

The containers section of the API documentation (/doc/api/doxy-api-index.md) doesn't contain any references to it. And the description of the RTE Ring library, which the "ring" link in the API documentation refers to, clearly says: The Ring Manager is a fixed-size queue, implemented as a table of *pointers*. (My emphasis.) So I thought it wasn't accepted.

However, searching for it in the source code reveals that it is indeed there! And the Ring Library chapter in the Programmer's Guide does mention that the objects can be something else than pointers.

So the documentation is not all screwed up, just a little sparse. :-)

> 
> >>
> >>> On Oct 31, 2021, at 3:14 AM, Morten Brørup
> <mb@smartsharesystems.com>
> >> wrote:
> >>>
> >>>> From: Morten Brørup
> >>>> Sent: Saturday, 30 October 2021 12.24
> >>>>
> >>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> >>>>> Nagarahalli
> >>>>> Sent: Monday, 4 October 2021 18.36
> >>>>>
> >>>>> <snip>
> >>>>>>
> >>>>>>
> >>>>>>>>> Current mempool per core cache implementation is based on
> >>>>> pointer
> >>>>>>>>> For most architectures, each pointer consumes 64b Replace it
> >>>>> with
> >>>>>>>>> index-based implementation, where in each buffer is addressed
> >>>>> by
> >>>>>>>>> (pool address + index)
> >>>>
> >>>> I like Dharmik's suggestion very much. CPU cache is a critical and
> >>>> limited resource.
> >>>>
> >>>> DPDK has a tendency of using pointers where indexes could be used
> >>>> instead. I suppose pointers provide the additional flexibility of
> >>>> mixing entries from different memory pools, e.g. multiple mbuf
> >> pools.
> >>>>
> >>
> >> Agreed, thank you!
> >>
> >>>>>>>>
> >>>>>>>> I don't think it is going to work:
> >>>>>>>> On 64-bit systems difference between pool address and it's
> elem
> >>>>>>>> address could be bigger than 4GB.
> >>>>>>> Are you talking about a case where the memory pool size is more
> >>>>> than 4GB?
> >>>>>>
> >>>>>> That is one possible scenario.
> >>>>
> >>>> That could be solved by making the index an element index instead
> of
> >> a
> >>>> pointer offset: address = (pool address + index * element size).
> >>>
> >>> Or instead of scaling the index with the element size, which is
> only
> >> known at runtime, the index could be more efficiently scaled by a
> >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> >> allow indexing into mempools up to 256 GB in size.
> >>>
> >>
> >> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> >> there is an ‘offset’ added to avoid objects to cross page
> boundaries.
> >> If my understanding is correct, using the index of element instead
> of a
> >> pointer offset will pose a challenge for some of the corner cases.
> >>
> >> [1]
> >>        for (i = 0; i < max_objs; i++) {
> >>                /* avoid objects to cross page boundaries */
> >>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz) <
> >> 0) {
> >>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> >> (va + off);
> >>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> >>                                off += total_elt_sz -
> >>                                        (((uintptr_t)(va + off - 1) %
> >>                                                total_elt_sz) + 1);
> >>                }
> >>
> >
> > OK. Alternatively to scaling the index with a cache line size, you
> can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB
> mempools on respectively 64 bit and 32 bit architectures. Both x86 and
> ARM CPUs have instructions to access memory with an added offset
> multiplied by 4 or 8. So that should be high performance.
> 
> Yes, agreed this can be done.
> Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is not
> enabled.
> On a side note, I wanted to better understand the need for having the
> 'MEMPOOL_F_NO_CACHE_ALIGN' option.

The description of this field is misleading, and should be corrected.
The correct description would be: Don't need to align objs on cache lines.

It is useful for mempools containing very small objects, to conserve memory.
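For example (a sketch, sizes picked arbitrarily): a pool of 8-byte objects,
where padding every object out to a full cache line would waste most of the
memory:

    #include <rte_mempool.h>

    static struct rte_mempool *
    make_tiny_pool(void)
    {
            return rte_mempool_create("tiny_pool",
                            8192,   /* number of objects */
                            8,      /* element size in bytes */
                            256,    /* per-lcore cache size */
                            0,      /* private data size */
                            NULL, NULL, NULL, NULL, /* no init callbacks */
                            SOCKET_ID_ANY,
                            MEMPOOL_F_NO_CACHE_ALIGN);
    }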

> 
> >
> >>>>
> >>>>>> Another possibility - user populates mempool himself with some
> >>>>> external
> >>>>>> memory by calling rte_mempool_populate_iova() directly.
> >>>>> Is the concern that IOVA might not be contiguous for all the
> memory
> >>>>> used by the mempool?
> >>>>>
> >>>>>> I suppose such situation can even occur even with normal
> >>>>>> rte_mempool_create(), though it should be a really rare one.
> >>>>> All in all, this feature needs to be configurable during compile
> >>>> time.
> >>>
> >
  
Honnappa Nagarahalli Nov. 8, 2021, 4:32 a.m. UTC | #16
<snip>

> > >>>>>>>>> Current mempool per core cache implementation is based on
> > >>>>> pointer
> > >>>>>>>>> For most architectures, each pointer consumes 64b Replace it
> > >>>>> with
> > >>>>>>>>> index-based implementation, where in each buffer is
> > >>>>>>>>> addressed
> > >>>>> by
> > >>>>>>>>> (pool address + index)
> > >>>>
> > >>>> I like Dharmik's suggestion very much. CPU cache is a critical
> > >>>> and limited resource.
> > >>>>
> > >>>> DPDK has a tendency of using pointers where indexes could be used
> > >>>> instead. I suppose pointers provide the additional flexibility of
> > >>>> mixing entries from different memory pools, e.g. multiple mbuf
> > >> pools.
> > >>>>
> > >>
> > >> Agreed, thank you!
> > >>
> > >>>>>>>>
> > >>>>>>>> I don't think it is going to work:
> > >>>>>>>> On 64-bit systems difference between pool address and it's
> > elem
> > >>>>>>>> address could be bigger than 4GB.
> > >>>>>>> Are you talking about a case where the memory pool size is
> > >>>>>>> more
> > >>>>> than 4GB?
> > >>>>>>
> > >>>>>> That is one possible scenario.
> > >>>>
> > >>>> That could be solved by making the index an element index instead
> > of
> > >> a
> > >>>> pointer offset: address = (pool address + index * element size).
> > >>>
> > >>> Or instead of scaling the index with the element size, which is
> > only
> > >> known at runtime, the index could be more efficiently scaled by a
> > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that would
> > >> allow indexing into mempools up to 256 GB in size.
> > >>>
> > >>
> > >> Looking at this snippet [1] from rte_mempool_op_populate_helper(),
> > >> there is an ‘offset’ added to avoid objects to cross page
> > boundaries.
> > >> If my understanding is correct, using the index of element instead
> > of a
> > >> pointer offset will pose a challenge for some of the corner cases.
> > >>
> > >> [1]
> > >>        for (i = 0; i < max_objs; i++) {
> > >>                /* avoid objects to cross page boundaries */
> > >>                if (check_obj_bounds(va + off, pg_sz, total_elt_sz)
> > >> <
> > >> 0) {
> > >>                        off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) -
> > >> (va + off);
> > >>                        if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > >>                                off += total_elt_sz -
> > >>                                        (((uintptr_t)(va + off - 1) %
> > >>                                                total_elt_sz) + 1);
> > >>                }
> > >>
> > >
> > > OK. Alternatively to scaling the index with a cache line size, you
> > can scale it with sizeof(uintptr_t) to be able to address 32 or 16 GB
> > mempools on respectively 64 bit and 32 bit architectures. Both x86 and
> > ARM CPUs have instructions to access memory with an added offset
> > multiplied by 4 or 8. So that should be high performance.
> >
> > Yes, agreed this can be done.
> > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is
> > not enabled.
> > On a side note, I wanted to better understand the need for having the
> > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> 
> The description of this field is misleading, and should be corrected.
> The correct description would be: Don't need to align objs on cache lines.
> 
> It is useful for mempools containing very small objects, to conserve memory.
I think we can assume that mbuf pools are created with the 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this, we can use an offset calculated with the cache line size as the unit.

> 
> >
> > >
> > >>>>
> > >>>>>> Another possibility - user populates mempool himself with some
> > >>>>> external
> > >>>>>> memory by calling rte_mempool_populate_iova() directly.
> > >>>>> Is the concern that IOVA might not be contiguous for all the
> > memory
> > >>>>> used by the mempool?
> > >>>>>
> > >>>>>> I suppose such situation can even occur even with normal
> > >>>>>> rte_mempool_create(), though it should be a really rare one.
> > >>>>> All in all, this feature needs to be configurable during compile
> > >>>> time.
> > >>>
> > >
  
Morten Brørup Nov. 8, 2021, 7:22 a.m. UTC | #17
> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> Sent: Monday, 8 November 2021 05.33
> 
> <snip>
> 
> > > >>>>>>>>> Current mempool per core cache implementation is based on
> > > >>>>> pointer
> > > >>>>>>>>> For most architectures, each pointer consumes 64b Replace
> it
> > > >>>>> with
> > > >>>>>>>>> index-based implementation, where in each buffer is
> > > >>>>>>>>> addressed
> > > >>>>> by
> > > >>>>>>>>> (pool address + index)
> > > >>>>
> > > >>>> I like Dharmik's suggestion very much. CPU cache is a critical
> > > >>>> and limited resource.
> > > >>>>
> > > >>>> DPDK has a tendency of using pointers where indexes could be
> used
> > > >>>> instead. I suppose pointers provide the additional flexibility
> of
> > > >>>> mixing entries from different memory pools, e.g. multiple mbuf
> > > >> pools.
> > > >>>>
> > > >>
> > > >> Agreed, thank you!
> > > >>
> > > >>>>>>>>
> > > >>>>>>>> I don't think it is going to work:
> > > >>>>>>>> On 64-bit systems difference between pool address and it's
> > > elem
> > > >>>>>>>> address could be bigger than 4GB.
> > > >>>>>>> Are you talking about a case where the memory pool size is
> > > >>>>>>> more
> > > >>>>> than 4GB?
> > > >>>>>>
> > > >>>>>> That is one possible scenario.
> > > >>>>
> > > >>>> That could be solved by making the index an element index
> instead
> > > of
> > > >> a
> > > >>>> pointer offset: address = (pool address + index * element
> size).
> > > >>>
> > > >>> Or instead of scaling the index with the element size, which is
> > > only
> > > >> known at runtime, the index could be more efficiently scaled by
> a
> > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that
> would
> > > >> allow indexing into mempools up to 256 GB in size.
> > > >>>
> > > >>
> > > >> Looking at this snippet [1] from
> rte_mempool_op_populate_helper(),
> > > >> there is an ‘offset’ added to avoid objects to cross page
> > > boundaries.
> > > >> If my understanding is correct, using the index of element
> instead
> > > of a
> > > >> pointer offset will pose a challenge for some of the corner
> cases.
> > > >>
> > > >> [1]
> > > >>        for (i = 0; i < max_objs; i++) {
> > > >>                /* avoid objects to cross page boundaries */
> > > >>                if (check_obj_bounds(va + off, pg_sz,
> total_elt_sz)
> > > >> <
> > > >> 0) {
> > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> pg_sz) -
> > > >> (va + off);
> > > >>                        if (flags &
> RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > >>                                off += total_elt_sz -
> > > >>                                        (((uintptr_t)(va + off -
> 1) %
> > > >>                                                total_elt_sz) +
> 1);
> > > >>                }
> > > >>
> > > >
> > > > OK. Alternatively to scaling the index with a cache line size,
> you
> > > can scale it with sizeof(uintptr_t) to be able to address 32 or 16
> GB
> > > mempools on respectively 64 bit and 32 bit architectures. Both x86
> and
> > > ARM CPUs have instructions to access memory with an added offset
> > > multiplied by 4 or 8. So that should be high performance.
> > >
> > > Yes, agreed this can be done.
> > > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’ is
> > > not enabled.
> > > On a side note, I wanted to better understand the need for having
> the
> > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> >
> > The description of this field is misleading, and should be corrected.
> > The correct description would be: Don't need to align objs on cache
> lines.
> >
> > It is useful for mempools containing very small objects, to conserve
> memory.
> I think we can assume that mbuf pools are created with the
> 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> calculated with cache line size as the unit.

You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)

I agree. And since the flag is a hint only, it can be ignored if the mempool library is scaling the index with the cache line size.

However, a mempool may contain other objects than mbufs, and those objects may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost a lot of memory for such mempools.
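As a rough worked example of that cost (the numbers are purely illustrative):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE 64 /* assumed cache line size */

int main(void)
{
	uint64_t nb_objs = 1 << 20;  /* 1M small, non-mbuf objects */
	uint64_t elt_sz = 16;
	/* round the element size up to a full cache line */
	uint64_t padded = (elt_sz + CACHE_LINE - 1) & ~(uint64_t)(CACHE_LINE - 1);

	printf("packed: %" PRIu64 " MiB, cache-line aligned: %" PRIu64 " MiB\n",
		nb_objs * elt_sz >> 20, nb_objs * padded >> 20);
	return 0;
}

which prints 16 MiB vs. 64 MiB, i.e. a 4x blow-up for 16-byte objects.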

> 
> >
> > >
> > > >
> > > >>>>
> > > >>>>>> Another possibility - user populates mempool himself with
> some
> > > >>>>> external
> > > >>>>>> memory by calling rte_mempool_populate_iova() directly.
> > > >>>>> Is the concern that IOVA might not be contiguous for all the
> > > memory
> > > >>>>> used by the mempool?
> > > >>>>>
> > > >>>>>> I suppose such situation can even occur even with normal
> > > >>>>>> rte_mempool_create(), though it should be a really rare one.
> > > >>>>> All in all, this feature needs to be configurable during
> compile
> > > >>>> time.
> > > >>>
> > > >
  
Honnappa Nagarahalli Nov. 8, 2021, 3:29 p.m. UTC | #18
<snip>

> > > > >>>>>>>>> Current mempool per core cache implementation is based
> > > > >>>>>>>>> on
> > > > >>>>> pointer
> > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > >>>>>>>>> Replace
> > it
> > > > >>>>> with
> > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > >>>>>>>>> addressed
> > > > >>>>> by
> > > > >>>>>>>>> (pool address + index)
> > > > >>>>
> > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > >>>> critical and limited resource.
> > > > >>>>
> > > > >>>> DPDK has a tendency of using pointers where indexes could be
> > used
> > > > >>>> instead. I suppose pointers provide the additional
> > > > >>>> flexibility
> > of
> > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > >>>> mbuf
> > > > >> pools.
> > > > >>>>
> > > > >>
> > > > >> Agreed, thank you!
> > > > >>
> > > > >>>>>>>>
> > > > >>>>>>>> I don't think it is going to work:
> > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > >>>>>>>> it's
> > > > elem
> > > > >>>>>>>> address could be bigger than 4GB.
> > > > >>>>>>> Are you talking about a case where the memory pool size is
> > > > >>>>>>> more
> > > > >>>>> than 4GB?
> > > > >>>>>>
> > > > >>>>>> That is one possible scenario.
> > > > >>>>
> > > > >>>> That could be solved by making the index an element index
> > instead
> > > > of
> > > > >> a
> > > > >>>> pointer offset: address = (pool address + index * element
> > size).
> > > > >>>
> > > > >>> Or instead of scaling the index with the element size, which
> > > > >>> is
> > > > only
> > > > >> known at runtime, the index could be more efficiently scaled by
> > a
> > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte, that
> > would
> > > > >> allow indexing into mempools up to 256 GB in size.
> > > > >>>
> > > > >>
> > > > >> Looking at this snippet [1] from
> > rte_mempool_op_populate_helper(),
> > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > boundaries.
> > > > >> If my understanding is correct, using the index of element
> > instead
> > > > of a
> > > > >> pointer offset will pose a challenge for some of the corner
> > cases.
> > > > >>
> > > > >> [1]
> > > > >>        for (i = 0; i < max_objs; i++) {
> > > > >>                /* avoid objects to cross page boundaries */
> > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > total_elt_sz)
> > > > >> <
> > > > >> 0) {
> > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > pg_sz) -
> > > > >> (va + off);
> > > > >>                        if (flags &
> > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > >>                                off += total_elt_sz -
> > > > >>                                        (((uintptr_t)(va + off -
> > 1) %
> > > > >>                                                total_elt_sz) +
> > 1);
> > > > >>                }
> > > > >>
> > > > >
> > > > > OK. Alternatively to scaling the index with a cache line size,
> > you
> > > > can scale it with sizeof(uintptr_t) to be able to address 32 or 16
> > GB
> > > > mempools on respectively 64 bit and 32 bit architectures. Both x86
> > and
> > > > ARM CPUs have instructions to access memory with an added offset
> > > > multiplied by 4 or 8. So that should be high performance.
> > > >
> > > > Yes, agreed this can be done.
> > > > Cache line size can also be used when ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > is not enabled.
> > > > On a side note, I wanted to better understand the need for having
> > the
> > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > >
> > > The description of this field is misleading, and should be corrected.
> > > The correct description would be: Don't need to align objs on cache
> > lines.
> > >
> > > It is useful for mempools containing very small objects, to conserve
> > memory.
> > I think we can assume that mbuf pools are created with the
> > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > calculated with cache line size as the unit.
> 
> You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
Yes 😊

> 
> I agree. And since the flag is a hint only, it can be ignored if the mempool
> library is scaling the index with the cache line size.
I do not think we should ignore the flag, for the reason you mention below.

> 
> However, a mempool may contain other objects than mbufs, and those objects
> may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost a
> lot of memory for such mempools.
We could use different methods: if MEMPOOL_F_NO_CACHE_ALIGN is set, use 'sizeof(uintptr_t)' as the unit; if it is not set, use the cache line size as the unit.
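A minimal sketch of that per-mempool choice (the 'obj_unit' helper is hypothetical, and the following replies point out that it turns the multiplier into a runtime value):

#include <stdint.h>
#include <rte_common.h>
#include <rte_mempool.h>

/* Hypothetical: pick the unit from the pool's runtime flags. */
static inline uint32_t
obj_unit(const struct rte_mempool *mp)
{
	return (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN) ?
			sizeof(uintptr_t) : RTE_CACHE_LINE_SIZE;
}

static inline void *
cache_idx_to_obj(const struct rte_mempool *mp, void *base, uint32_t idx)
{
	/* runtime multiply: this is the performance concern raised below */
	return RTE_PTR_ADD(base, (size_t)idx * obj_unit(mp));
}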

> 
> >
> > >
<snip>
  
Morten Brørup Nov. 8, 2021, 3:39 p.m. UTC | #19
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 8 November 2021 16.29
> 
> <snip>
> 
> > > > > >>>>>>>>> Current mempool per core cache implementation is
> based
> > > > > >>>>>>>>> on
> > > > > >>>>> pointer
> > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > >>>>>>>>> Replace
> > > it
> > > > > >>>>> with
> > > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > > >>>>>>>>> addressed
> > > > > >>>>> by
> > > > > >>>>>>>>> (pool address + index)
> > > > > >>>>
> > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > >>>> critical and limited resource.
> > > > > >>>>
> > > > > >>>> DPDK has a tendency of using pointers where indexes could
> be
> > > used
> > > > > >>>> instead. I suppose pointers provide the additional
> > > > > >>>> flexibility
> > > of
> > > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > > >>>> mbuf
> > > > > >> pools.
> > > > > >>>>
> > > > > >>
> > > > > >> Agreed, thank you!
> > > > > >>
> > > > > >>>>>>>>
> > > > > >>>>>>>> I don't think it is going to work:
> > > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > > >>>>>>>> it's
> > > > > elem
> > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > >>>>>>> Are you talking about a case where the memory pool size
> is
> > > > > >>>>>>> more
> > > > > >>>>> than 4GB?
> > > > > >>>>>>
> > > > > >>>>>> That is one possible scenario.
> > > > > >>>>
> > > > > >>>> That could be solved by making the index an element index
> > > instead
> > > > > of
> > > > > >> a
> > > > > >>>> pointer offset: address = (pool address + index * element
> > > size).
> > > > > >>>
> > > > > >>> Or instead of scaling the index with the element size,
> which
> > > > > >>> is
> > > > > only
> > > > > >> known at runtime, the index could be more efficiently scaled
> by
> > > a
> > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> that
> > > would
> > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > >>>
> > > > > >>
> > > > > >> Looking at this snippet [1] from
> > > rte_mempool_op_populate_helper(),
> > > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > > boundaries.
> > > > > >> If my understanding is correct, using the index of element
> > > instead
> > > > > of a
> > > > > >> pointer offset will pose a challenge for some of the corner
> > > cases.
> > > > > >>
> > > > > >> [1]
> > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > >>                /* avoid objects to cross page boundaries */
> > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > total_elt_sz)
> > > > > >> <
> > > > > >> 0) {
> > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > > pg_sz) -
> > > > > >> (va + off);
> > > > > >>                        if (flags &
> > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > >>                                off += total_elt_sz -
> > > > > >>                                        (((uintptr_t)(va +
> off -
> > > 1) %
> > > > > >>                                                total_elt_sz)
> +
> > > 1);
> > > > > >>                }
> > > > > >>
> > > > > >
> > > > > > OK. Alternatively to scaling the index with a cache line
> size,
> > > you
> > > > > can scale it with sizeof(uintptr_t) to be able to address 32 or
> 16
> > > GB
> > > > > mempools on respectively 64 bit and 32 bit architectures. Both
> x86
> > > and
> > > > > ARM CPUs have instructions to access memory with an added
> offset
> > > > > multiplied by 4 or 8. So that should be high performance.
> > > > >
> > > > > Yes, agreed this can be done.
> > > > > Cache line size can also be used when
> ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > is not enabled.
> > > > > On a side note, I wanted to better understand the need for
> having
> > > the
> > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > >
> > > > The description of this field is misleading, and should be
> corrected.
> > > > The correct description would be: Don't need to align objs on
> cache
> > > lines.
> > > >
> > > > It is useful for mempools containing very small objects, to
> conserve
> > > memory.
> > > I think we can assume that mbuf pools are created with the
> > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > > calculated with cache line size as the unit.
> >
> > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> Yes 😊
> 
> >
> > I agree. And since the flag is a hint only, it can be ignored if the
> mempool
> > library is scaling the index with the cache line size.
> I do not think we should ignore the flag for reason you mention below.
> 
> >
> > However, a mempool may contain other objects than mbufs, and those
> objects
> > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may cost
> a
> > lot of memory for such mempools.
> We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set, use
> the unit as 'sizeof(uintptr_t)', if not set use cache line size as the
> unit.
> 

That would require the indexing multiplier to be a runtime parameter instead of a compile-time parameter, so it would have a performance penalty.

The indexing multiplier could be compile-time configurable, so it is a tradeoff between granularity and maximum mempool size.
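For illustration, what a compile-time multiplier could look like (the macro and helper names are hypothetical, not an agreed design):

#include <stdint.h>
#include <rte_common.h>

/*
 * Hypothetical build-time constant. With a 32-bit index the maximum
 * addressable pool size is 2^32 * multiplier:
 *   multiplier  1 ->   4 GB
 *   multiplier  8 ->  32 GB (sizeof(uintptr_t) on 64-bit)
 *   multiplier 64 -> 256 GB (RTE_CACHE_LINE_SIZE)
 */
#ifndef RTE_MEMPOOL_CACHE_INDEX_UNIT
#define RTE_MEMPOOL_CACHE_INDEX_UNIT RTE_CACHE_LINE_SIZE
#endif

static inline void *
cache_index_to_obj(void *base, uint32_t idx)
{
	/* constant multiply: compiles to a shift or scaled addressing mode */
	return RTE_PTR_ADD(base, (size_t)idx * RTE_MEMPOOL_CACHE_INDEX_UNIT);
}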
  
Honnappa Nagarahalli Nov. 8, 2021, 3:46 p.m. UTC | #20
<snip>
> >
> > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > based
> > > > > > >>>>>>>>> on
> > > > > > >>>>> pointer
> > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > >>>>>>>>> Replace
> > > > it
> > > > > > >>>>> with
> > > > > > >>>>>>>>> index-based implementation, where in each buffer is
> > > > > > >>>>>>>>> addressed
> > > > > > >>>>> by
> > > > > > >>>>>>>>> (pool address + index)
> > > > > > >>>>
> > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > >>>> critical and limited resource.
> > > > > > >>>>
> > > > > > >>>> DPDK has a tendency of using pointers where indexes could
> > be
> > > > used
> > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > >>>> flexibility
> > > > of
> > > > > > >>>> mixing entries from different memory pools, e.g. multiple
> > > > > > >>>> mbuf
> > > > > > >> pools.
> > > > > > >>>>
> > > > > > >>
> > > > > > >> Agreed, thank you!
> > > > > > >>
> > > > > > >>>>>>>>
> > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > >>>>>>>> On 64-bit systems difference between pool address and
> > > > > > >>>>>>>> it's
> > > > > > elem
> > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > >>>>>>> size
> > is
> > > > > > >>>>>>> more
> > > > > > >>>>> than 4GB?
> > > > > > >>>>>>
> > > > > > >>>>>> That is one possible scenario.
> > > > > > >>>>
> > > > > > >>>> That could be solved by making the index an element index
> > > > instead
> > > > > > of
> > > > > > >> a
> > > > > > >>>> pointer offset: address = (pool address + index * element
> > > > size).
> > > > > > >>>
> > > > > > >>> Or instead of scaling the index with the element size,
> > which
> > > > > > >>> is
> > > > > > only
> > > > > > >> known at runtime, the index could be more efficiently
> > > > > > >> scaled
> > by
> > > > a
> > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > that
> > > > would
> > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > >>>
> > > > > > >>
> > > > > > >> Looking at this snippet [1] from
> > > > rte_mempool_op_populate_helper(),
> > > > > > >> there is an ‘offset’ added to avoid objects to cross page
> > > > > > boundaries.
> > > > > > >> If my understanding is correct, using the index of element
> > > > instead
> > > > > > of a
> > > > > > >> pointer offset will pose a challenge for some of the corner
> > > > cases.
> > > > > > >>
> > > > > > >> [1]
> > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > >>                /* avoid objects to cross page boundaries */
> > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > total_elt_sz)
> > > > > > >> <
> > > > > > >> 0) {
> > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va + off,
> > > > pg_sz) -
> > > > > > >> (va + off);
> > > > > > >>                        if (flags &
> > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > >>                                off += total_elt_sz -
> > > > > > >>                                        (((uintptr_t)(va +
> > off -
> > > > 1) %
> > > > > > >>
> > > > > > >> total_elt_sz)
> > +
> > > > 1);
> > > > > > >>                }
> > > > > > >>
> > > > > > >
> > > > > > > OK. Alternatively to scaling the index with a cache line
> > size,
> > > > you
> > > > > > can scale it with sizeof(uintptr_t) to be able to address 32
> > > > > > or
> > 16
> > > > GB
> > > > > > mempools on respectively 64 bit and 32 bit architectures. Both
> > x86
> > > > and
> > > > > > ARM CPUs have instructions to access memory with an added
> > offset
> > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > >
> > > > > > Yes, agreed this can be done.
> > > > > > Cache line size can also be used when
> > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > is not enabled.
> > > > > > On a side note, I wanted to better understand the need for
> > having
> > > > the
> > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > >
> > > > > The description of this field is misleading, and should be
> > corrected.
> > > > > The correct description would be: Don't need to align objs on
> > cache
> > > > lines.
> > > > >
> > > > > It is useful for mempools containing very small objects, to
> > conserve
> > > > memory.
> > > > I think we can assume that mbuf pools are created with the
> > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use offset
> > > > calculated with cache line size as the unit.
> > >
> > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > Yes 😊
> >
> > >
> > > I agree. And since the flag is a hint only, it can be ignored if the
> > mempool
> > > library is scaling the index with the cache line size.
> > I do not think we should ignore the flag for reason you mention below.
> >
> > >
> > > However, a mempool may contain other objects than mbufs, and those
> > objects
> > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> cost
> > a
> > > lot of memory for such mempools.
> > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > use the unit as 'sizeof(uintptr_t)', if not set use cache line size as
> > the unit.
> >
> 
> That would require that the indexing multiplier is a runtime parameter instead
> of a compile time parameter. So it would have a performance penalty.
> 
> The indexing multiplier could be compile time configurable, so it is a tradeoff
> between granularity and maximum mempool size.
I meant compile-time configurable, i.e.:

#ifdef MEMPOOL_F_NO_CACHE_ALIGN
<use sizeof(uintptr_t) as the multiplier>
#else
<use cache line size as the multiplier> /* This should provide enough memory for packet buffers */
#endif
  
Morten Brørup Nov. 8, 2021, 4:03 p.m. UTC | #21
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> Nagarahalli
> Sent: Monday, 8 November 2021 16.46
> 
> <snip>
> > >
> > > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > > based
> > > > > > > >>>>>>>>> on
> > > > > > > >>>>> pointer
> > > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > > >>>>>>>>> Replace
> > > > > it
> > > > > > > >>>>> with
> > > > > > > >>>>>>>>> index-based implementation, where in each buffer
> is
> > > > > > > >>>>>>>>> addressed
> > > > > > > >>>>> by
> > > > > > > >>>>>>>>> (pool address + index)
> > > > > > > >>>>
> > > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > > >>>> critical and limited resource.
> > > > > > > >>>>
> > > > > > > >>>> DPDK has a tendency of using pointers where indexes
> could
> > > be
> > > > > used
> > > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > > >>>> flexibility
> > > > > of
> > > > > > > >>>> mixing entries from different memory pools, e.g.
> multiple
> > > > > > > >>>> mbuf
> > > > > > > >> pools.
> > > > > > > >>>>
> > > > > > > >>
> > > > > > > >> Agreed, thank you!
> > > > > > > >>
> > > > > > > >>>>>>>>
> > > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > > >>>>>>>> On 64-bit systems difference between pool address
> and
> > > > > > > >>>>>>>> it's
> > > > > > > elem
> > > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > > >>>>>>> size
> > > is
> > > > > > > >>>>>>> more
> > > > > > > >>>>> than 4GB?
> > > > > > > >>>>>>
> > > > > > > >>>>>> That is one possible scenario.
> > > > > > > >>>>
> > > > > > > >>>> That could be solved by making the index an element
> index
> > > > > instead
> > > > > > > of
> > > > > > > >> a
> > > > > > > >>>> pointer offset: address = (pool address + index *
> element
> > > > > size).
> > > > > > > >>>
> > > > > > > >>> Or instead of scaling the index with the element size,
> > > which
> > > > > > > >>> is
> > > > > > > only
> > > > > > > >> known at runtime, the index could be more efficiently
> > > > > > > >> scaled
> > > by
> > > > > a
> > > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > > that
> > > > > would
> > > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > > >>>
> > > > > > > >>
> > > > > > > >> Looking at this snippet [1] from
> > > > > rte_mempool_op_populate_helper(),
> > > > > > > >> there is an ‘offset’ added to avoid objects to cross
> page
> > > > > > > boundaries.
> > > > > > > >> If my understanding is correct, using the index of
> element
> > > > > instead
> > > > > > > of a
> > > > > > > >> pointer offset will pose a challenge for some of the
> corner
> > > > > cases.
> > > > > > > >>
> > > > > > > >> [1]
> > > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > > >>                /* avoid objects to cross page boundaries
> */
> > > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > > total_elt_sz)
> > > > > > > >> <
> > > > > > > >> 0) {
> > > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va +
> off,
> > > > > pg_sz) -
> > > > > > > >> (va + off);
> > > > > > > >>                        if (flags &
> > > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > > >>                                off += total_elt_sz -
> > > > > > > >>                                        (((uintptr_t)(va
> +
> > > off -
> > > > > 1) %
> > > > > > > >>
> > > > > > > >> total_elt_sz)
> > > +
> > > > > 1);
> > > > > > > >>                }
> > > > > > > >>
> > > > > > > >
> > > > > > > > OK. Alternatively to scaling the index with a cache line
> > > size,
> > > > > you
> > > > > > > can scale it with sizeof(uintptr_t) to be able to address
> 32
> > > > > > > or
> > > 16
> > > > > GB
> > > > > > > mempools on respectively 64 bit and 32 bit architectures.
> Both
> > > x86
> > > > > and
> > > > > > > ARM CPUs have instructions to access memory with an added
> > > offset
> > > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > > >
> > > > > > > Yes, agreed this can be done.
> > > > > > > Cache line size can also be used when
> > > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > > is not enabled.
> > > > > > > On a side note, I wanted to better understand the need for
> > > having
> > > > > the
> > > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > > >
> > > > > > The description of this field is misleading, and should be
> > > corrected.
> > > > > > The correct description would be: Don't need to align objs on
> > > cache
> > > > > lines.
> > > > > >
> > > > > > It is useful for mempools containing very small objects, to
> > > conserve
> > > > > memory.
> > > > > I think we can assume that mbuf pools are created with the
> > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use
> offset
> > > > > calculated with cache line size as the unit.
> > > >
> > > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > > Yes 😊
> > >
> > > >
> > > > I agree. And since the flag is a hint only, it can be ignored if
> the
> > > mempool
> > > > library is scaling the index with the cache line size.
> > > I do not think we should ignore the flag for reason you mention
> below.
> > >
> > > >
> > > > However, a mempool may contain other objects than mbufs, and
> those
> > > objects
> > > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> > cost
> > > a
> > > > lot of memory for such mempools.
> > > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > > use the unit as 'sizeof(uintptr_t)', if not set use cache line size
> as
> > > the unit.
> > >
> >
> > That would require that the indexing multiplier is a runtime
> parameter instead
> > of a compile time parameter. So it would have a performance penalty.
> >
> > The indexing multiplier could be compile time configurable, so it is
> a tradeoff
> > between granularity and maximum mempool size.
> I meant compile time configurable. i.e.
> 
> #ifdef MEMPOOL_F_NO_CACHE_ALIGN
> <use sizeof(uintptr_t) as the multiplier>
> #else
> <use cache line size as the multiplier> /* This should provide enough
> memory for packet buffers */
> #endif

Please note that MEMPOOL_F_NO_CACHE_ALIGN is a runtime flag passed when creating a mempool, not a compile time option.
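For reference, a minimal example of passing that flag at creation time (the pool parameters are illustrative):

#include <rte_memory.h>   /* SOCKET_ID_ANY */
#include <rte_mempool.h>

/* MEMPOOL_F_NO_CACHE_ALIGN goes into the 'flags' argument of
 * rte_mempool_create() at run time, so it cannot drive an #ifdef
 * inside the library.
 */
static struct rte_mempool *
create_small_obj_pool(void)
{
	return rte_mempool_create("small_objs",
			8192,		/* number of elements */
			16,		/* element size: small, non-mbuf objects */
			256,		/* per-lcore cache size */
			0,		/* private data size */
			NULL, NULL,	/* pool constructor */
			NULL, NULL,	/* object constructor */
			SOCKET_ID_ANY,
			MEMPOOL_F_NO_CACHE_ALIGN);
}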
  
Jerin Jacob Nov. 8, 2021, 4:47 p.m. UTC | #22
On Mon, Nov 8, 2021 at 9:34 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Honnappa
> > Nagarahalli
> > Sent: Monday, 8 November 2021 16.46
> >
> > <snip>
> > > >
> > > > > > > > >>>>>>>>> Current mempool per core cache implementation is
> > > > based
> > > > > > > > >>>>>>>>> on
> > > > > > > > >>>>> pointer
> > > > > > > > >>>>>>>>> For most architectures, each pointer consumes 64b
> > > > > > > > >>>>>>>>> Replace
> > > > > > it
> > > > > > > > >>>>> with
> > > > > > > > >>>>>>>>> index-based implementation, where in each buffer
> > is
> > > > > > > > >>>>>>>>> addressed
> > > > > > > > >>>>> by
> > > > > > > > >>>>>>>>> (pool address + index)
> > > > > > > > >>>>
> > > > > > > > >>>> I like Dharmik's suggestion very much. CPU cache is a
> > > > > > > > >>>> critical and limited resource.
> > > > > > > > >>>>
> > > > > > > > >>>> DPDK has a tendency of using pointers where indexes
> > could
> > > > be
> > > > > > used
> > > > > > > > >>>> instead. I suppose pointers provide the additional
> > > > > > > > >>>> flexibility
> > > > > > of
> > > > > > > > >>>> mixing entries from different memory pools, e.g.
> > multiple
> > > > > > > > >>>> mbuf
> > > > > > > > >> pools.
> > > > > > > > >>>>
> > > > > > > > >>
> > > > > > > > >> Agreed, thank you!
> > > > > > > > >>
> > > > > > > > >>>>>>>>
> > > > > > > > >>>>>>>> I don't think it is going to work:
> > > > > > > > >>>>>>>> On 64-bit systems difference between pool address
> > and
> > > > > > > > >>>>>>>> it's
> > > > > > > > elem
> > > > > > > > >>>>>>>> address could be bigger than 4GB.
> > > > > > > > >>>>>>> Are you talking about a case where the memory pool
> > > > > > > > >>>>>>> size
> > > > is
> > > > > > > > >>>>>>> more
> > > > > > > > >>>>> than 4GB?
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> That is one possible scenario.
> > > > > > > > >>>>
> > > > > > > > >>>> That could be solved by making the index an element
> > index
> > > > > > instead
> > > > > > > > of
> > > > > > > > >> a
> > > > > > > > >>>> pointer offset: address = (pool address + index *
> > element
> > > > > > size).
> > > > > > > > >>>
> > > > > > > > >>> Or instead of scaling the index with the element size,
> > > > which
> > > > > > > > >>> is
> > > > > > > > only
> > > > > > > > >> known at runtime, the index could be more efficiently
> > > > > > > > >> scaled
> > > > by
> > > > > > a
> > > > > > > > >> compile time constant such as RTE_MEMPOOL_ALIGN (=
> > > > > > > > >> RTE_CACHE_LINE_SIZE). With a cache line size of 64 byte,
> > > > that
> > > > > > would
> > > > > > > > >> allow indexing into mempools up to 256 GB in size.
> > > > > > > > >>>
> > > > > > > > >>
> > > > > > > > >> Looking at this snippet [1] from
> > > > > > rte_mempool_op_populate_helper(),
> > > > > > > > >> there is an ‘offset’ added to avoid objects to cross
> > page
> > > > > > > > boundaries.
> > > > > > > > >> If my understanding is correct, using the index of
> > element
> > > > > > instead
> > > > > > > > of a
> > > > > > > > >> pointer offset will pose a challenge for some of the
> > corner
> > > > > > cases.
> > > > > > > > >>
> > > > > > > > >> [1]
> > > > > > > > >>        for (i = 0; i < max_objs; i++) {
> > > > > > > > >>                /* avoid objects to cross page boundaries
> > */
> > > > > > > > >>                if (check_obj_bounds(va + off, pg_sz,
> > > > > > total_elt_sz)
> > > > > > > > >> <
> > > > > > > > >> 0) {
> > > > > > > > >>                        off += RTE_PTR_ALIGN_CEIL(va +
> > off,
> > > > > > pg_sz) -
> > > > > > > > >> (va + off);
> > > > > > > > >>                        if (flags &
> > > > > > RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
> > > > > > > > >>                                off += total_elt_sz -
> > > > > > > > >>                                        (((uintptr_t)(va
> > +
> > > > off -
> > > > > > 1) %
> > > > > > > > >>
> > > > > > > > >> total_elt_sz)
> > > > +
> > > > > > 1);
> > > > > > > > >>                }
> > > > > > > > >>
> > > > > > > > >
> > > > > > > > > OK. Alternatively to scaling the index with a cache line
> > > > size,
> > > > > > you
> > > > > > > > can scale it with sizeof(uintptr_t) to be able to address
> > 32
> > > > > > > > or
> > > > 16
> > > > > > GB
> > > > > > > > mempools on respectively 64 bit and 32 bit architectures.
> > Both
> > > > x86
> > > > > > and
> > > > > > > > ARM CPUs have instructions to access memory with an added
> > > > offset
> > > > > > > > multiplied by 4 or 8. So that should be high performance.
> > > > > > > >
> > > > > > > > Yes, agreed this can be done.
> > > > > > > > Cache line size can also be used when
> > > > ‘MEMPOOL_F_NO_CACHE_ALIGN’
> > > > > > > > is not enabled.
> > > > > > > > On a side note, I wanted to better understand the need for
> > > > having
> > > > > > the
> > > > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' option.
> > > > > > >
> > > > > > > The description of this field is misleading, and should be
> > > > corrected.
> > > > > > > The correct description would be: Don't need to align objs on
> > > > cache
> > > > > > lines.
> > > > > > >
> > > > > > > It is useful for mempools containing very small objects, to
> > > > conserve
> > > > > > memory.
> > > > > > I think we can assume that mbuf pools are created with the
> > > > > > 'MEMPOOL_F_NO_CACHE_ALIGN' flag set. With this we can use
> > offset
> > > > > > calculated with cache line size as the unit.
> > > > >
> > > > > You mean MEMPOOL_F_NO_CACHE_ALIGN flag not set. ;-)
> > > > Yes 😊
> > > >
> > > > >
> > > > > I agree. And since the flag is a hint only, it can be ignored if
> > the
> > > > mempool
> > > > > library is scaling the index with the cache line size.
> > > > I do not think we should ignore the flag for reason you mention
> > below.
> > > >
> > > > >
> > > > > However, a mempool may contain other objects than mbufs, and
> > those
> > > > objects
> > > > > may be small, so ignoring the MEMPOOL_F_NO_CACHE_ALIGN flag may
> > > cost
> > > > a
> > > > > lot of memory for such mempools.
> > > > We could use different methods. If MEMPOOL_F_NO_CACHE_ALIGN is set,
> > > > use the unit as 'sizeof(uintptr_t)', if not set use cache line size
> > as
> > > > the unit.
> > > >
> > >
> > > That would require that the indexing multiplier is a runtime
> > parameter instead
> > > of a compile time parameter. So it would have a performance penalty.
> > >
> > > The indexing multiplier could be compile time configurable, so it is
> > a tradeoff
> > > between granularity and maximum mempool size.
> > I meant compile time configurable. i.e.
> >
> > #ifdef MEMPOOL_F_NO_CACHE_ALIGN
> > <use sizeof(uintptr_t) as the multiplier>
> > #else
> > <use cache line size as the multiplier> /* This should provide enough
> > memory for packet buffers */
> > #endif
>
> Please note that MEMPOOL_F_NO_CACHE_ALIGN is a runtime flag passed when creating a mempool, not a compile time option.

Also, please share PMU counter stats on L1 and L2 misses with and
without this scheme after the rework. IMO, we should not have any
regression in
1) per-core mpps
OR
2) L1 and L2 misses
with l3fwd/testpmd/l2fwd etc.


>
>
  
Thomas Monjalon July 31, 2023, 12:23 p.m. UTC | #23
The v2 was not sent, and Stephen dropped the patch from patchwork.

Do we abandon this feature?
Should I remove it from the roadmap?


06/07/2023 19:43, Stephen Hemminger:
> On Thu, 13 Jan 2022 05:31:18 +0000
> Dharmik Thakkar <Dharmik.Thakkar@arm.com> wrote:
> 
> > Hi,
> > 
> > Thank you for your valuable review comments and suggestions!
> > 
> > I will be sending out a v2 in which I have increased the size of the mempool to 32GB by using division by sizeof(uintptr_t).
> > However, I am seeing ~5% performance degradation with mempool_perf_autotest (for bulk size of 32) with this change
> > when compared to the base performance.
> > Earlier, without this change, I was seeing an improvement of ~13% compared to base performance. So, this is a significant degradation.
> > I would appreciate your review comments on v2.
> > 
> > Thank you!
> > 
> > > On Jan 10, 2022, at 12:38 AM, Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > 
> > > On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup <mb@smartsharesystems.com> wrote:  
> > >>   
> > >>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > >>> Sent: Friday, 7 January 2022 14.51
> > >>> 
> > >>> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:  
> > >>>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > >>>>> Sent: Friday, 7 January 2022 12.16
> > >>>>> 
> > >>>>> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:  
> > >>>>>>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:  
> > >>>>> Friday, 24  
> > >>>>>>> December 2021 23.59
> > >>>>>>> 
> > >>>>>>> Current mempool per core cache implementation stores pointers  
> > >>> to  
> > >>>>> mbufs  
> > >>>>>>> On 64b architectures, each pointer consumes 8B This patch  
> > >>> replaces  
> > >>>>> it  
> > >>>>>>> with index-based implementation, where in each buffer is  
> > >>> addressed  
> > >>>>> by  
> > >>>>>>> (pool base address + index) It reduces the amount of  
> > >>> memory/cache  
> > >>>>>>> required for per core cache
> > >>>>>>> 
> > >>>>>>> L3Fwd performance testing reveals minor improvements in the  
> > >>> cache  
> > >>>>>>> performance (L1 and L2 misses reduced by 0.60%) with no change  
> > >>> in  
> > >>>>>>> throughput
> > >>>>>>> 
> > >>>>>>> Micro-benchmarking the patch using mempool_perf_test shows  
> > >>>>> significant  
> > >>>>>>> improvement with majority of the test cases
> > >>>>>>>   
> > >>>>>> 
> > >>>>>> I still think this is very interesting. And your performance  
> > >>> numbers  
> > >>>>> are  
> > >>>>>> looking good.
> > >>>>>> 
> > >>>>>> However, it limits the size of a mempool to 4 GB. As previously
> > >>>>>> discussed, the max mempool size can be increased by multiplying  
> > >>> the  
> > >>>>> index  
> > >>>>>> with a constant.
> > >>>>>> 
> > >>>>>> I would suggest using sizeof(uintptr_t) as the constant  
> > >>> multiplier,  
> > >>>>> so  
> > >>>>>> the mempool can hold objects of any size divisible by  
> > >>>>> sizeof(uintptr_t).  
> > >>>>>> And it would be silly to use a mempool to hold objects smaller  
> > >>> than  
> > >>>>>> sizeof(uintptr_t).
> > >>>>>> 
> > >>>>>> How does the performance look if you multiply the index by
> > >>>>>> sizeof(uintptr_t)?
> > >>>>>>   
> > >>>>> 
> > >>>>> Each mempool entry is cache aligned, so we can use that if we want  
> > >>> a  
> > >>>>> bigger
> > >>>>> multiplier.  
> > >>>> 
> > >>>> Thanks for chiming in, Bruce.
> > >>>> 
> > >>>> Please also read this discussion about the multiplier:
> > >>>> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@mail.gmail.com/
> > >>>>   
> > >>> 
> > >>> I actually wondered after I had sent the email whether we had indeed an
> > >>> option to disable the cache alignment or not! Thanks for pointing out
> > >>> that
> > >>> we do. This brings a couple additional thoughts:
> > >>> 
> > >>> * Using indexes for the cache should probably be a runtime flag rather
> > >>> than
> > >>>  a build-time one.
> > >>> * It would seem reasonable to me to disallow use of the indexed-cache
> > >>> flag
> > >>>  and the non-cache aligned flag simultaneously.
> > >>> * On the offchance that that restriction is unacceptable, then we can
> > >>>  make things a little more complicated by doing a runtime computation
> > >>> of
> > >>>  the "index-shiftwidth" to use.
> > >>> 
> > >>> Overall, I think defaulting to cacheline shiftwidth and disallowing
> > >>> index-based addressing when using unaligned buffers is simplest and
> > >>> easiest
> > >>> unless we can come up with a valid usecase for needing more than that.
> > >>> 
> > >>> /Bruce  
> > >> 
> > >> This feature is a performance optimization.
> > >> 
> > >> With that in mind, it should not introduce function pointers or similar run-time checks or in the fast path, to determine what kind of cache to use per mempool. And if an index multiplier is implemented, it should be a compile time constant, probably something between sizeof(uintptr_t) or RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
> > >> 
> > >> The patch comes with a tradeoff between better performance and limited mempool size, and possibly some limitations regarding very small objects that are not cache line aligned to avoid wasting memory (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
> > >> 
> > >> With no multiplier, the only tradeoff is that the mempool size is limited to 4 GB.
> > >> 
> > >> If the multiplier is small (i.e. 8 bytes) the only tradeoff is that the mempool size is limited to 32 GB. (And a waste of memory for objects smaller than 8 byte; but I don't think anyone would use a mempool to hold objects smaller than 8 byte.)
> > >> 
> > >> If the multiplier is larger (i.e. 64 bytes cache line size), the mempool size is instead limited to 256 GB, but RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
> > >> 
> > >> Note: 32 bit platforms have no benefit from this patch: The pointer already only uses 4 bytes, so replacing the pointer with a 4 byte index makes no difference.
> > >> 
> > >> 
> > >> Since this feature is a performance optimization only, and doesn't provide any new features, I don't mind it being a compile time option.
> > >> 
> > >> If this feature is a compile time option, and the mempool library is compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ could be made undefined in the public header file, so compilation of applications using the flag will fail. And rte_mempool_create() could RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its flags parameter, or emit a warning about the flag being ignored. Obviously, rte_mempool_create() should also RTE_ASSERT() that the mempool is not larger than the library supports, possibly emitting a message that the mempool library should be built without this feature to support the larger mempool.
> > >> 
> > >> Here is another thought: If only exotic applications use mempools larger than 32 GB, this would be a generally acceptable limit, and DPDK should use index-based cache as default, making the opposite (i.e. pointer-based cache) a compile time option instead. A similar decision was recently made for limiting the RTE_MAX_LCORE default.
> > >> 
> > >> 
> > >> Although DPDK is moving away from compile time options in order to better support Linux distros, there should be a general exception for performance and memory optimizations. Otherwise, network appliance vendors will inherit the increasing amount of DPDK bloat, and we (network appliance vendors) will eventually be forced to fork DPDK to get rid of the bloat and achieve the goals originally intended by DPDK.  
> > > 
> > > Agree with Morten's view on this.
> > >   
> > >> If anyone disagrees with the principle about a general exception for performance and memory optimizations, I would like to pass on the decision to the Techboard!
> > >>   
> 
> NAK
> Having compile time stuff like this means one side or the other is not tested
> by CI infrastructure.  There never was sufficient justification, and lots of objections.
> Dropping the patch.
> 
>
  
Morten Brørup July 31, 2023, 12:33 p.m. UTC | #24
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Monday, 31 July 2023 14.24
> 
> The v2 was not sent, and Stephen dropped the patch from patchwork.
> 
> Do we abandon this feature?

+1, because I think that the zero-copy mempool cache access functions make this patch irrelevant.

> Should I remove it from the roadmap?

+1

> 
> 
> 06/07/2023 19:43, Stephen Hemminger:
> > On Thu, 13 Jan 2022 05:31:18 +0000
> > Dharmik Thakkar <Dharmik.Thakkar@arm.com> wrote:
> >
> > > Hi,
> > >
> > > Thank you for your valuable review comments and suggestions!
> > >
> > > I will be sending out a v2 in which I have increased the size of the
> mempool to 32GB by using division by sizeof(uintptr_t).
> > > However, I am seeing ~5% performance degradation with
> mempool_perf_autotest (for bulk size of 32) with this change
> > > when compared to the base performance.
> > > Earlier, without this change, I was seeing an improvement of ~13% compared
> to base performance. So, this is a significant degradation.
> > > I would appreciate your review comments on v2.
> > >
> > > Thank you!
> > >
> > > > On Jan 10, 2022, at 12:38 AM, Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > > >
> > > > On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup <mb@smartsharesystems.com>
> wrote:
> > > >>
> > > >>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > >>> Sent: Friday, 7 January 2022 14.51
> > > >>>
> > > >>> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:
> > > >>>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > >>>>> Sent: Friday, 7 January 2022 12.16
> > > >>>>>
> > > >>>>> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > > >>>>>>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com] Sent:
> > > >>>>> Friday, 24
> > > >>>>>>> December 2021 23.59
> > > >>>>>>>
> > > >>>>>>> Current mempool per core cache implementation stores pointers
> > > >>> to
> > > >>>>> mbufs
> > > >>>>>>> On 64b architectures, each pointer consumes 8B This patch
> > > >>> replaces
> > > >>>>> it
> > > >>>>>>> with index-based implementation, where in each buffer is
> > > >>> addressed
> > > >>>>> by
> > > >>>>>>> (pool base address + index) It reduces the amount of
> > > >>> memory/cache
> > > >>>>>>> required for per core cache
> > > >>>>>>>
> > > >>>>>>> L3Fwd performance testing reveals minor improvements in the
> > > >>> cache
> > > >>>>>>> performance (L1 and L2 misses reduced by 0.60%) with no change
> > > >>> in
> > > >>>>>>> throughput
> > > >>>>>>>
> > > >>>>>>> Micro-benchmarking the patch using mempool_perf_test shows
> > > >>>>> significant
> > > >>>>>>> improvement with majority of the test cases
> > > >>>>>>>
> > > >>>>>>
> > > >>>>>> I still think this is very interesting. And your performance
> > > >>> numbers
> > > >>>>> are
> > > >>>>>> looking good.
> > > >>>>>>
> > > >>>>>> However, it limits the size of a mempool to 4 GB. As previously
> > > >>>>>> discussed, the max mempool size can be increased by multiplying
> > > >>> the
> > > >>>>> index
> > > >>>>>> with a constant.
> > > >>>>>>
> > > >>>>>> I would suggest using sizeof(uintptr_t) as the constant
> > > >>> multiplier,
> > > >>>>> so
> > > >>>>>> the mempool can hold objects of any size divisible by
> > > >>>>> sizeof(uintptr_t).
> > > >>>>>> And it would be silly to use a mempool to hold objects smaller
> > > >>> than
> > > >>>>>> sizeof(uintptr_t).
> > > >>>>>>
> > > >>>>>> How does the performance look if you multiply the index by
> > > >>>>>> sizeof(uintptr_t)?
> > > >>>>>>
> > > >>>>>
> > > >>>>> Each mempool entry is cache aligned, so we can use that if we want
> > > >>> a
> > > >>>>> bigger
> > > >>>>> multiplier.
> > > >>>>
> > > >>>> Thanks for chiming in, Bruce.
> > > >>>>
> > > >>>> Please also read this discussion about the multiplier:
> > > >>>>
> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY95z9xjbRuyA@
> mail.gmail.com/
> > > >>>>
> > > >>>
> > > >>> I actually wondered after I had sent the email whether we had indeed
> an
> > > >>> option to disable the cache alignment or not! Thanks for pointing out
> > > >>> that
> > > >>> we do. This brings a couple additional thoughts:
> > > >>>
> > > >>> * Using indexes for the cache should probably be a runtime flag rather
> > > >>> than
> > > >>>  a build-time one.
> > > >>> * It would seem reasonable to me to disallow use of the indexed-cache
> > > >>> flag
> > > >>>  and the non-cache aligned flag simultaneously.
> > > >>> * On the offchance that that restriction is unacceptable, then we can
> > > >>>  make things a little more complicated by doing a runtime computation
> > > >>> of
> > > >>>  the "index-shiftwidth" to use.
> > > >>>
> > > >>> Overall, I think defaulting to cacheline shiftwidth and disallowing
> > > >>> index-based addressing when using unaligned buffers is simplest and
> > > >>> easiest
> > > >>> unless we can come up with a valid usecase for needing more than that.
> > > >>>
> > > >>> /Bruce
> > > >>
> > > >> This feature is a performance optimization.
> > > >>
> > > >> With that in mind, it should not introduce function pointers or similar
> run-time checks or in the fast path, to determine what kind of cache to use
> per mempool. And if an index multiplier is implemented, it should be a compile
> time constant, probably something between sizeof(uintptr_t) or
> RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
> > > >>
> > > >> The patch comes with a tradeoff between better performance and limited
> mempool size, and possibly some limitations regarding very small objects that
> are not cache line aligned to avoid wasting memory
> (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
> > > >>
> > > >> With no multiplier, the only tradeoff is that the mempool size is
> limited to 4 GB.
> > > >>
> > > >> If the multiplier is small (i.e. 8 bytes) the only tradeoff is that the
> mempool size is limited to 32 GB. (And a waste of memory for objects smaller
> than 8 byte; but I don't think anyone would use a mempool to hold objects
> smaller than 8 byte.)
> > > >>
> > > >> If the multiplier is larger (i.e. 64 bytes cache line size), the
> mempool size is instead limited to 256 GB, but
> RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
> > > >>
> > > >> Note: 32 bit platforms have no benefit from this patch: The pointer
> already only uses 4 bytes, so replacing the pointer with a 4 byte index makes
> no difference.
> > > >>
> > > >>
> > > >> Since this feature is a performance optimization only, and doesn't
> provide any new features, I don't mind it being a compile time option.
> > > >>
> > > >> If this feature is a compile time option, and the mempool library is
> compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ
> could be made undefined in the public header file, so compilation of
> applications using the flag will fail. And rte_mempool_create() could
> RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its flags
> parameter, or emit a warning about the flag being ignored. Obviously,
> rte_mempool_create() should also RTE_ASSERT() that the mempool is not larger
> than the library supports, possibly emitting a message that the mempool
> library should be built without this feature to support the larger mempool.
> > > >>
> > > >> Here is another thought: If only exotic applications use mempools
> larger than 32 GB, this would be a generally acceptable limit, and DPDK should
> use index-based cache as default, making the opposite (i.e. pointer-based
> cache) a compile time option instead. A similar decision was recently made for
> limiting the RTE_MAX_LCORE default.
> > > >>
> > > >>
> > > >> Although DPDK is moving away from compile time options in order to
> better support Linux distros, there should be a general exception for
> performance and memory optimizations. Otherwise, network appliance vendors
> will inherit the increasing amount of DPDK bloat, and we (network appliance
> vendors) will eventually be forced to fork DPDK to get rid of the bloat and
> achieve the goals originally intended by DPDK.
> > > >
> > > > Agree with Morten's view on this.
> > > >
> > > >> If anyone disagrees with the principle about a general exception for
> performance and memory optimizations, I would like to pass on the decision to
> the Techboard!
> > > >>
> >
> > NAK
> > Having compile time stuff like this means one side or the other is not
> tested
> > by CI infrastructure.  There never was sufficient justification, and lots of
> objections.
> > Dropping the patch.
> >
> >
> 
> 
> 
>
  
Dharmik Thakkar July 31, 2023, 2:57 p.m. UTC | #25
> -----Original Message-----
> From: Morten Brørup <mb@smartsharesystems.com>
> Sent: Monday, July 31, 2023 7:33 AM
> To: thomas@monjalon.net; Dharmik Jayesh Thakkar
> <DharmikJayesh.Thakkar@arm.com>
> Cc: dev@dpdk.org; Jerin Jacob <jerinjacobk@gmail.com>; Bruce Richardson
> <bruce.richardson@intel.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>; Stephen Hemminger
> <stephen@networkplumber.org>; olivier.matz@6wind.com;
> andrew.rybchenko@oktetlabs.ru
> Subject: RE: [PATCH 0/1] mempool: implement index-based per core cache
>
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Sent: Monday, 31 July 2023 14.24
> >
> > The v2 was not sent, and Stephen dropped the patch from patchwork.
> >
> > Do we abandon this feature?
>
> +1, because I think that the zero-copy mempool cache access functions make
> this patch irrelevant.
>
> > Should I remove it from the roadmap?
>
> +1

V2 was sent (https://patches.dpdk.org/project/dpdk/patch/20220113053630.886638-1-dharmik.thakkar@arm.com/)
However, it is not relevant anymore and can be dropped. Thank you!

>
> >
> >
> > 06/07/2023 19:43, Stephen Hemminger:
> > > On Thu, 13 Jan 2022 05:31:18 +0000
> > > Dharmik Thakkar <Dharmik.Thakkar@arm.com> wrote:
> > >
> > > > Hi,
> > > >
> > > > Thank you for your valuable review comments and suggestions!
> > > >
> > > > I will be sending out a v2 in which I have increased the size of
> > > > the
> > mempool to 32GB by using division by sizeof(uintptr_t).
> > > > However, I am seeing ~5% performance degradation with
> > mempool_perf_autotest (for bulk size of 32) with this change
> > > > when compared to the base performance.
> > > > Earlier, without this change, I was seeing an improvement of ~13%
> > > > compared
> > to base performance. So, this is a significant degradation.
> > > > I would appreciate your review comments on v2.
> > > >
> > > > Thank you!
> > > >
> > > > > On Jan 10, 2022, at 12:38 AM, Jerin Jacob <jerinjacobk@gmail.com>
> wrote:
> > > > >
> > > > > On Sat, Jan 8, 2022 at 3:07 PM Morten Brørup
> > > > > <mb@smartsharesystems.com>
> > wrote:
> > > > >>
> > > > >>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > >>> Sent: Friday, 7 January 2022 14.51
> > > > >>>
> > > > >>> On Fri, Jan 07, 2022 at 12:29:23PM +0100, Morten Brørup wrote:
> > > > >>>>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> > > > >>>>> Sent: Friday, 7 January 2022 12.16
> > > > >>>>>
> > > > >>>>> On Sat, Dec 25, 2021 at 01:16:03AM +0100, Morten Brørup wrote:
> > > > >>>>>>> From: Dharmik Thakkar [mailto:dharmik.thakkar@arm.com]
> Sent:
> > > > >>>>> Friday, 24
> > > > >>>>>>> December 2021 23.59
> > > > >>>>>>>
> > > > >>>>>>> Current mempool per core cache implementation stores
> > > > >>>>>>> pointers
> > > > >>> to
> > > > >>>>> mbufs
> > > > >>>>>>> On 64b architectures, each pointer consumes 8B This patch
> > > > >>> replaces
> > > > >>>>> it
> > > > >>>>>>> with index-based implementation, where in each buffer is
> > > > >>> addressed
> > > > >>>>> by
> > > > >>>>>>> (pool base address + index) It reduces the amount of
> > > > >>> memory/cache
> > > > >>>>>>> required for per core cache
> > > > >>>>>>>
> > > > >>>>>>> L3Fwd performance testing reveals minor improvements in
> > > > >>>>>>> the
> > > > >>> cache
> > > > >>>>>>> performance (L1 and L2 misses reduced by 0.60%) with no
> > > > >>>>>>> change
> > > > >>> in
> > > > >>>>>>> throughput
> > > > >>>>>>>
> > > > >>>>>>> Micro-benchmarking the patch using mempool_perf_test shows
> > > > >>>>> significant
> > > > >>>>>>> improvement with majority of the test cases
> > > > >>>>>>>
> > > > >>>>>>
> > > > >>>>>> I still think this is very interesting. And your
> > > > >>>>>> performance
> > > > >>> numbers
> > > > >>>>> are
> > > > >>>>>> looking good.
> > > > >>>>>>
> > > > >>>>>> However, it limits the size of a mempool to 4 GB. As
> > > > >>>>>> previously discussed, the max mempool size can be increased
> > > > >>>>>> by multiplying
> > > > >>> the
> > > > >>>>> index
> > > > >>>>>> with a constant.
> > > > >>>>>>
> > > > >>>>>> I would suggest using sizeof(uintptr_t) as the constant
> > > > >>> multiplier,
> > > > >>>>> so
> > > > >>>>>> the mempool can hold objects of any size divisible by
> > > > >>>>> sizeof(uintptr_t).
> > > > >>>>>> And it would be silly to use a mempool to hold objects
> > > > >>>>>> smaller
> > > > >>> than
> > > > >>>>>> sizeof(uintptr_t).
> > > > >>>>>>
> > > > >>>>>> How does the performance look if you multiply the index by
> > > > >>>>>> sizeof(uintptr_t)?
> > > > >>>>>>
> > > > >>>>>
> > > > >>>>> Each mempool entry is cache aligned, so we can use that if
> > > > >>>>> we want
> > > > >>> a
> > > > >>>>> bigger
> > > > >>>>> multiplier.
> > > > >>>>
> > > > >>>> Thanks for chiming in, Bruce.
> > > > >>>>
> > > > >>>> Please also read this discussion about the multiplier:
> > > > >>>>
> >
> http://inbox.dpdk.org/dev/CALBAE1PrQYyOG96f6ECeW1vPF3TOh1h7MZZULiY
> 95z9
> > xjbRuyA@
> > mail.gmail.com/
> > > > >>>>
> > > > >>>
> > > > >>> I actually wondered after I had sent the email whether we had
> > > > >>> indeed
> > an
> > > > >>> option to disable the cache alignment or not! Thanks for
> > > > >>> pointing out that we do. This brings a couple additional
> > > > >>> thoughts:
> > > > >>>
> > > > >>> * Using indexes for the cache should probably be a runtime
> > > > >>> flag rather than  a build-time one.
> > > > >>> * It would seem reasonable to me to disallow use of the
> > > > >>> indexed-cache flag  and the non-cache aligned flag
> > > > >>> simultaneously.
> > > > >>> * On the offchance that that restriction is unacceptable, then
> > > > >>> we can  make things a little more complicated by doing a
> > > > >>> runtime computation of  the "index-shiftwidth" to use.
> > > > >>>
> > > > >>> Overall, I think defaulting to cacheline shiftwidth and
> > > > >>> disallowing index-based addressing when using unaligned
> > > > >>> buffers is simplest and easiest unless we can come up with a
> > > > >>> valid usecase for needing more than that.
> > > > >>>
> > > > >>> /Bruce
> > > > >>
> > > > >> This feature is a performance optimization.
> > > > >>
> > > > >> With that in mind, it should not introduce function pointers or
> > > > >> similar
> > run-time checks or in the fast path, to determine what kind of cache
> > to use per mempool. And if an index multiplier is implemented, it
> > should be a compile time constant, probably something between
> > sizeof(uintptr_t) or RTE_MEMPOOL_ALIGN (=RTE_CACHE_LINE_SIZE).
> > > > >>
> > > > >> The patch comes with a tradeoff between better performance and
> > > > >> limited
> > mempool size, and possibly some limitations regarding very small
> > objects that are not cache line aligned to avoid wasting memory
> > (RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ).
> > > > >>
> > > > >> With no multiplier, the only tradeoff is that the mempool size
> > > > >> is
> > limited to 4 GB.
> > > > >>
> > > > >> If the multiplier is small (i.e. 8 bytes), the only tradeoff is that the
> > > > >> mempool size is limited to 32 GB. (And a waste of memory for objects
> > > > >> smaller than 8 bytes; but I don't think anyone would use a mempool to
> > > > >> hold objects smaller than 8 bytes.)
> > > > >>
> > > > >> If the multiplier is larger (i.e. 64 bytes cache line size), the
> > > > >> mempool size is instead limited to 256 GB, but
> > > > >> RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ has no effect.
> > > > >>
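To make the limits above concrete: a 32-bit index can name 2^32 slots, so the maximum pool size is 2^32 times the index granularity. A small stand-alone check of the arithmetic (illustrative only):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t slots = UINT64_C(1) << 32; /* distinct 32-bit index values */

	printf("no multiplier:      %" PRIu64 " GB\n", slots * 1  >> 30); /*   4 GB */
	printf("8-byte multiplier:  %" PRIu64 " GB\n", slots * 8  >> 30); /*  32 GB */
	printf("64-byte multiplier: %" PRIu64 " GB\n", slots * 64 >> 30); /* 256 GB */
	return 0;
}
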
> > > > >> Note: 32 bit platforms have no benefit from this patch: The pointer
> > > > >> already only uses 4 bytes, so replacing the pointer with a 4 byte
> > > > >> index makes no difference.
> > > > >>
> > > > >>
> > > > >> Since this feature is a performance optimization only, and doesn't
> > > > >> provide any new features, I don't mind it being a compile time option.
> > > > >>
> > > > >> If this feature is a compile time option, and the mempool library is
> > > > >> compiled with the large multiplier, then RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ
> > > > >> could be made undefined in the public header file, so compilation of
> > > > >> applications using the flag will fail. And rte_mempool_create() could
> > > > >> RTE_ASSERT() that RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is not set in its
> > > > >> flags parameter, or emit a warning about the flag being ignored.
> > > > >> Obviously, rte_mempool_create() should also RTE_ASSERT() that the mempool
> > > > >> is not larger than the library supports, possibly emitting a message that
> > > > >> the mempool library should be built without this feature to support the
> > > > >> larger mempool.
> > > > >>
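A minimal sketch of the creation-time size check suggested here; mempool_fits_index_cache() and its parameters are hypothetical names used only for illustration, and mempool creation could RTE_ASSERT() on the result or log a warning as proposed above:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper: can every object in a pool of 'n' objects of
 * 'total_elt_sz' bytes be reached by a 32-bit index scaled by
 * 'multiplier' bytes? */
static bool
mempool_fits_index_cache(uint32_t n, uint32_t total_elt_sz, uint32_t multiplier)
{
	return (uint64_t)n * total_elt_sz <= (uint64_t)UINT32_MAX * multiplier;
}
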
> > > > >> Here is another thought: If only exotic applications use mempools
> > > > >> larger than 32 GB, this would be a generally acceptable limit, and DPDK
> > > > >> should use index-based cache as default, making the opposite (i.e.
> > > > >> pointer-based cache) a compile time option instead. A similar decision
> > > > >> was recently made for limiting the RTE_MAX_LCORE default.
> > > > >>
> > > > >>
> > > > >> Although DPDK is moving away from compile time options in order to
> > > > >> better support Linux distros, there should be a general exception for
> > > > >> performance and memory optimizations. Otherwise, network appliance
> > > > >> vendors will inherit the increasing amount of DPDK bloat, and we
> > > > >> (network appliance vendors) will eventually be forced to fork DPDK to
> > > > >> get rid of the bloat and achieve the goals originally intended by DPDK.
> > > > >
> > > > > Agree with Morten's view on this.
> > > > >
> > > > >> If anyone disagrees with the principle about a general exception for
> > > > >> performance and memory optimizations, I would like to pass on the
> > > > >> decision to the Techboard!
> > > > >>
> > >
> > > NAK
> > > Having compile time stuff like this means one side or the other is not
> > > tested by the CI infrastructure. There never was sufficient justification,
> > > and there were lots of objections.
> > > Dropping the patch.
> > >
> > >


Patch

diff --git a/drivers/mempool/ring/rte_mempool_ring.c b/drivers/mempool/ring/rte_mempool_ring.c
index b1f09ff28f4d..e55913e47f21 100644
--- a/drivers/mempool/ring/rte_mempool_ring.c
+++ b/drivers/mempool/ring/rte_mempool_ring.c
@@ -101,7 +101,7 @@  ring_alloc(struct rte_mempool *mp, uint32_t rg_flags)
 		return -rte_errno;
 
 	mp->pool_data = r;
-
+	mp->local_cache_base_addr = &r[1];
 	return 0;
 }
 
diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 59a588425bd6..424bdb19c323 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -480,6 +480,7 @@  rte_mempool_populate_default(struct rte_mempool *mp)
 	int ret;
 	bool need_iova_contig_obj;
 	size_t max_alloc_size = SIZE_MAX;
+	unsigned lcore_id;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -600,6 +601,13 @@  rte_mempool_populate_default(struct rte_mempool *mp)
 		}
 	}
 
+	/* Init all default caches. */
+	if (mp->cache_size != 0) {
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+			mp->local_cache[lcore_id].local_cache_base_value =
+				*(void **)mp->local_cache_base_addr;
+	}
+
 	rte_mempool_trace_populate_default(mp);
 	return mp->size;
 
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 4235d6f0bf2b..545405c0d3ce 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -51,6 +51,8 @@ 
 #include <rte_memcpy.h>
 #include <rte_common.h>
 
+#include <arm_neon.h>
+
 #include "rte_mempool_trace_fp.h"
 
 #ifdef __cplusplus
@@ -91,11 +93,12 @@  struct rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
+	void *local_cache_base_value; /**< Base value to calculate indices */
 	/*
 	 * Cache is allocated to this size to allow it to overflow in certain
 	 * cases to avoid needless emptying of cache.
 	 */
-	void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
+	uint32_t objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
 } __rte_cache_aligned;
 
 /**
@@ -172,7 +175,6 @@  struct rte_mempool_objtlr {
  * A list of memory where objects are stored
  */
 STAILQ_HEAD(rte_mempool_memhdr_list, rte_mempool_memhdr);
-
 /**
  * Callback used to free a memory chunk
  */
@@ -244,6 +246,7 @@  struct rte_mempool {
 	int32_t ops_index;
 
 	struct rte_mempool_cache *local_cache; /**< Per-lcore local cache */
+	void *local_cache_base_addr; /**< Reference to the base value */
 
 	uint32_t populated_size;         /**< Number of populated objects. */
 	struct rte_mempool_objhdr_list elt_list; /**< List of objects in pool */
@@ -1269,7 +1272,15 @@  rte_mempool_cache_flush(struct rte_mempool_cache *cache,
 	if (cache == NULL || cache->len == 0)
 		return;
 	rte_mempool_trace_cache_flush(cache, mp);
-	rte_mempool_ops_enqueue_bulk(mp, cache->objs, cache->len);
+
+	unsigned int i;
+	unsigned int cache_len = cache->len;
+	void *obj_table[RTE_MEMPOOL_CACHE_MAX_SIZE * 3];
+	void *base_value = cache->local_cache_base_value;
+	uint32_t *cache_objs = cache->objs;
+	for (i = 0; i < cache_len; i++)
+		obj_table[i] = (void *) RTE_PTR_ADD(base_value, cache_objs[i]);
+	rte_mempool_ops_enqueue_bulk(mp, obj_table, cache->len);
 	cache->len = 0;
 }
 
@@ -1289,7 +1300,9 @@  static __rte_always_inline void
 __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 		      unsigned int n, struct rte_mempool_cache *cache)
 {
-	void **cache_objs;
+	uint32_t *cache_objs;
+	void *base_value;
+	uint32_t i;
 
 	/* increment stat now, adding in mempool always success */
 	__MEMPOOL_STAT_ADD(mp, put_bulk, 1);
@@ -1301,6 +1314,12 @@  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 
 	cache_objs = &cache->objs[cache->len];
 
+	base_value = cache->local_cache_base_value;
+
+	uint64x2_t v_obj_table;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+	uint32x2_t v_cache_objs;
+
 	/*
 	 * The cache follows the following algorithm
 	 *   1. Add the objects to the cache
@@ -1309,12 +1328,26 @@  __mempool_generic_put(struct rte_mempool *mp, void * const *obj_table,
 	 */
 
 	/* Add elements back into the cache */
-	rte_memcpy(&cache_objs[0], obj_table, sizeof(void *) * n);
+
+#if defined __ARM_NEON
+	for (i = 0; i < (n & ~0x1); i+=2) {
+		v_obj_table = vld1q_u64((const uint64_t *)&obj_table[i]);
+		v_cache_objs = vqmovn_u64(vsubq_u64(v_obj_table, v_base_value));
+		vst1_u32(cache_objs + i, v_cache_objs);
+	}
+	if (n & 0x1) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#else
+	for (i = 0; i < n; i++) {
+		cache_objs[i] = (uint32_t) RTE_PTR_DIFF(obj_table[i], base_value);
+	}
+#endif
 
 	cache->len += n;
 
 	if (cache->len >= cache->flushthresh) {
-		rte_mempool_ops_enqueue_bulk(mp, &cache->objs[cache->size],
+		rte_mempool_ops_enqueue_bulk(mp, obj_table + cache->len - cache->size,
 				cache->len - cache->size);
 		cache->len = cache->size;
 	}
@@ -1415,23 +1448,26 @@  __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
 		      unsigned int n, struct rte_mempool_cache *cache)
 {
 	int ret;
+	uint32_t i;
 	uint32_t index, len;
-	void **cache_objs;
+	uint32_t *cache_objs;
 
 	/* No cache provided or cannot be satisfied from cache */
 	if (unlikely(cache == NULL || n >= cache->size))
 		goto ring_dequeue;
 
+	void *base_value = cache->local_cache_base_value;
 	cache_objs = cache->objs;
 
 	/* Can this be satisfied from the cache? */
 	if (cache->len < n) {
 		/* No. Backfill the cache first, and then fill from it */
 		uint32_t req = n + (cache->size - cache->len);
+		void *temp_objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 3]; /**< Cache objects */
 
 		/* How many do we require i.e. number to fill the cache + the request */
 		ret = rte_mempool_ops_dequeue_bulk(mp,
-			&cache->objs[cache->len], req);
+			temp_objs, req);
 		if (unlikely(ret < 0)) {
 			/*
 			 * In the off chance that we are buffer constrained,
@@ -1442,12 +1478,32 @@  __mempool_generic_get(struct rte_mempool *mp, void **obj_table,
 			goto ring_dequeue;
 		}
 
+		len = cache->len;
+		for (i = 0; i < req; ++i, ++len) {
+			cache_objs[len] = (uint32_t) RTE_PTR_DIFF(temp_objs[i], base_value);
+		}
+
 		cache->len += req;
 	}
 
+	uint64x2_t v_obj_table;
+	uint64x2_t v_cache_objs;
+	uint64x2_t v_base_value = vdupq_n_u64((uint64_t)base_value);
+
 	/* Now fill in the response ... */
+#if defined __ARM_NEON
+	for (index = 0, len = cache->len - 1; index < (n & ~0x1); index+=2,
+						len-=2, obj_table+=2) {
+		v_cache_objs = vmovl_u32(vld1_u32(cache_objs + len - 1));
+		v_obj_table = vaddq_u64(v_cache_objs, v_base_value);
+		vst1q_u64((uint64_t *)obj_table, v_obj_table);
+	}
+	if (n & 0x1)
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#else
 	for (index = 0, len = cache->len - 1; index < n; ++index, len--, obj_table++)
-		*obj_table = cache_objs[len];
+		*obj_table = (void *) RTE_PTR_ADD(base_value, cache_objs[len]);
+#endif
 
 	cache->len -= n;