[v2,2/3] mempool: include non-DPDK threads in statistics

Message ID 20221031112634.18329-2-mb@smartsharesystems.com (mailing list archive)
State Superseded, archived
Series mempool: split statistics from debug

Checks

Context               Check     Description
ci/checkpatch         success   coding style OK
ci/Intel-compilation  warning   apply issues

Commit Message

Morten Brørup Oct. 31, 2022, 11:26 a.m. UTC
Offset the stats array index by one, and count non-DPDK threads at index
zero.

This patch provides two benefits:
* Non-DPDK threads are also included in the statistics.
* A conditional in the fast path is removed. Static branch prediction was
  correct, so the performance improvement is negligible.

v2:
* New. No v1 of this patch in the series.

Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/mempool/rte_mempool.c |  2 +-
 lib/mempool/rte_mempool.h | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)
  

Comments

Mattias Rönnblom Nov. 2, 2022, 7:52 a.m. UTC | #1
On 2022-10-31 12:26, Morten Brørup wrote:
> Offset the stats array index by one, and count non-DPDK threads at index
> zero.
> 
> This patch provides two benefits:
> * Non-DPDK threads are also included in the statistics.
> * A conditional in the fast path is removed. Static branch prediction was
>    correct, so the performance improvement is negligible.
> 
> v2:
> * New. No v1 of this patch in the series.
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>   lib/mempool/rte_mempool.c |  2 +-
>   lib/mempool/rte_mempool.h | 12 ++++++------
>   2 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> index 62d1ce764e..e6208125e0 100644
> --- a/lib/mempool/rte_mempool.c
> +++ b/lib/mempool/rte_mempool.c
> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool *mp)
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
>   	rte_mempool_ops_get_info(mp, &info);
>   	memset(&sum, 0, sizeof(sum));
> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
>   		sum.put_bulk += mp->stats[lcore_id].put_bulk;
>   		sum.put_objs += mp->stats[lcore_id].put_objs;
>   		sum.put_common_pool_bulk += mp->stats[lcore_id].put_common_pool_bulk;
> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> index 9c4bf5549f..16e7e62e3c 100644
> --- a/lib/mempool/rte_mempool.h
> +++ b/lib/mempool/rte_mempool.h
> @@ -238,8 +238,11 @@ struct rte_mempool {
>   	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
>   
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> -	/** Per-lcore statistics. */
> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> +	/** Per-lcore statistics.
> +	 *
> +	 * Offset by one, to include non-DPDK threads.
> +	 */
> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
>   #endif
>   }  __rte_cache_aligned;
>   
> @@ -304,10 +307,7 @@ struct rte_mempool {
>    */
>   #ifdef RTE_LIBRTE_MEMPOOL_STATS
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> -		unsigned __lcore_id = rte_lcore_id();           \
> -		if (__lcore_id < RTE_MAX_LCORE) {               \
> -			mp->stats[__lcore_id].name += n;        \
> -		}                                               \
> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \

This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for an 
unregistered non-EAL thread? Might be worth a comment, or better a 
rewrite with an explicit LCORE_ID_ANY comparison.

You need a conditional anyway. An atomic add must be used in the 
unregistered non-EAL thread case.

>   	} while (0)
>   #else
>   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
  
Morten Brørup Nov. 2, 2022, 9:09 a.m. UTC | #2
> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Wednesday, 2 November 2022 08.53
> 
> On 2022-10-31 12:26, Morten Brørup wrote:
> > Offset the stats array index by one, and count non-DPDK threads at
> index
> > zero.
> >
> > This patch provides two benefits:
> > * Non-DPDK threads are also included in the statistics.
> > * A conditional in the fast path is removed. Static branch prediction
> was
> >    correct, so the performance improvement is negligible.
> >
> > v2:
> > * New. No v1 of this patch in the series.
> >
> > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > ---
> >   lib/mempool/rte_mempool.c |  2 +-
> >   lib/mempool/rte_mempool.h | 12 ++++++------
> >   2 files changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > index 62d1ce764e..e6208125e0 100644
> > --- a/lib/mempool/rte_mempool.c
> > +++ b/lib/mempool/rte_mempool.c
> > @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool
> *mp)
> >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >   	rte_mempool_ops_get_info(mp, &info);
> >   	memset(&sum, 0, sizeof(sum));
> > -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
> >   		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> >   		sum.put_objs += mp->stats[lcore_id].put_objs;
> >   		sum.put_common_pool_bulk += mp-
> >stats[lcore_id].put_common_pool_bulk;
> > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > index 9c4bf5549f..16e7e62e3c 100644
> > --- a/lib/mempool/rte_mempool.h
> > +++ b/lib/mempool/rte_mempool.h
> > @@ -238,8 +238,11 @@ struct rte_mempool {
> >   	struct rte_mempool_memhdr_list mem_list; /**< List of memory
> chunks */
> >
> >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > -	/** Per-lcore statistics. */
> > -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> > +	/** Per-lcore statistics.
> > +	 *
> > +	 * Offset by one, to include non-DPDK threads.
> > +	 */
> > +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> >   #endif
> >   }  __rte_cache_aligned;
> >
> > @@ -304,10 +307,7 @@ struct rte_mempool {
> >    */
> >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> > -		unsigned __lcore_id = rte_lcore_id();           \
> > -		if (__lcore_id < RTE_MAX_LCORE) {               \
> > -			mp->stats[__lcore_id].name += n;        \
> > -		}                                               \
> > +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> 
> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for an
> unregistered non-EAL thread? Might be worth a comment, or better a
> rewrite with an explicit LCORE_ID_ANY comparison.

The purpose of this patch is to avoid the comparison here.

Yes, it relies on the wrap to zero, and these conditions:
1. LCORE_ID_ANY being UINT32_MAX, and
2. the return type of rte_lcore_id() being unsigned int, and
3. unsigned int being uint32_t.

When I wrote this, I considered it safe to assume that LCORE_ID_ANY will remain the unsigned equivalent of -1 using the return type of rte_lcore_id(). In other words: If the return type of rte_lcore_id() should change from 32 to 64 bit, LCORE_ID_ANY would be updated accordingly to UINT64_MAX.

Because of this assumption, I didn't use [(rte_lcore_id() + 1) & UINT32_MAX], but just [rte_lcore_id() + 1].

I struggled writing an appropriate comment without making it unacceptably long, but eventually gave up, and settled for the one-line comment in the structure only.
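
For illustration only, a minimal sketch of the resulting index
computation, assuming the three conditions above hold:

/* LCORE_ID_ANY == UINT32_MAX in an unregistered non-EAL thread */
unsigned int lcore_id = rte_lcore_id();
/* wraps to 0 for LCORE_ID_ANY, is 1..RTE_MAX_LCORE otherwise */
unsigned int idx = lcore_id + 1;
/* so idx is always a valid index into stats[RTE_MAX_LCORE + 1] */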

> 
> You anyways need a conditional. An atomic add must be used in the
> unregistered EAL thread case.

Good point: The "+= n" must be atomic for non-isolated threads.

I just took a look at how software-maintained stats are handled elsewhere, and the first thing I found is the IOAT DMA driver, which also seems to use a non-atomic increment [1], regardless of whether it is used by a DPDK thread or not.

[1]: https://elixir.bootlin.com/dpdk/v22.11-rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228

However, doing it wrong elsewhere doesn't make it correct to do it wrong here too. :-)

Atomic increments are costly, so I would rather follow your suggestion and reintroduce the comparison. How about:

#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
    unsigned int __lcore_id = rte_lcore_id(); \
    if (likely(__lcore_id < RTE_MAX_LCORE)) { \
        (mp)->stats[__lcore_id].name += n; \
    } else { \
        rte_atomic64_add( \
                (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name), n);\
    } \
} while (0)

And the structure comment could be:
 * Plus one, for non-DPDK threads.
  
Stephen Hemminger Nov. 2, 2022, 3:19 p.m. UTC | #3
On Wed, 2 Nov 2022 10:09:26 +0100
Morten Brørup <mb@smartsharesystems.com> wrote:

> > From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> > Sent: Wednesday, 2 November 2022 08.53
> > 
> > On 2022-10-31 12:26, Morten Brørup wrote:  
> > > Offset the stats array index by one, and count non-DPDK threads at  
> > index  
> > > zero.
> > >
> > > This patch provides two benefits:
> > > * Non-DPDK threads are also included in the statistics.
> > > * A conditional in the fast path is removed. Static branch prediction  
> > was  
> > >    correct, so the performance improvement is negligible.
> > >
> > > v2:
> > > * New. No v1 of this patch in the series.
> > >
> > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > ---
> > >   lib/mempool/rte_mempool.c |  2 +-
> > >   lib/mempool/rte_mempool.h | 12 ++++++------
> > >   2 files changed, 7 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> > > index 62d1ce764e..e6208125e0 100644
> > > --- a/lib/mempool/rte_mempool.c
> > > +++ b/lib/mempool/rte_mempool.c
> > > @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool  
> > *mp)  
> > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > >   	rte_mempool_ops_get_info(mp, &info);
> > >   	memset(&sum, 0, sizeof(sum));
> > > -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > > +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
> > >   		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> > >   		sum.put_objs += mp->stats[lcore_id].put_objs;
> > >   		sum.put_common_pool_bulk += mp-
> > >stats[lcore_id].put_common_pool_bulk;
> > > diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> > > index 9c4bf5549f..16e7e62e3c 100644
> > > --- a/lib/mempool/rte_mempool.h
> > > +++ b/lib/mempool/rte_mempool.h
> > > @@ -238,8 +238,11 @@ struct rte_mempool {
> > >   	struct rte_mempool_memhdr_list mem_list; /**< List of memory  
> > chunks */  
> > >
> > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > > -	/** Per-lcore statistics. */
> > > -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> > > +	/** Per-lcore statistics.
> > > +	 *
> > > +	 * Offset by one, to include non-DPDK threads.
> > > +	 */
> > > +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> > >   #endif
> > >   }  __rte_cache_aligned;
> > >
> > > @@ -304,10 +307,7 @@ struct rte_mempool {
> > >    */
> > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > >   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
> > > -		unsigned __lcore_id = rte_lcore_id();           \
> > > -		if (__lcore_id < RTE_MAX_LCORE) {               \
> > > -			mp->stats[__lcore_id].name += n;        \
> > > -		}                                               \
> > > +		(mp)->stats[rte_lcore_id() + 1].name += n;      \  
> > 
> > This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for an
> > unregistered non-EAL thread? Might be worth a comment, or better a
> > rewrite with an explicit LCORE_ID_ANY comparison.  
> 
> The purpose of this patch is to avoid the comparison here.
> 
> Yes, it relies on the wrap to zero, and these conditions:
> 1. LCORE_ID_ANY being UINT32_MAX, and
> 2. the return type of rte_lcore_id() being unsigned int, and
> 3. unsigned int being uint32_t.
> 
> When I wrote this, I considered it safe to assume that LCORE_ID_ANY will remain the unsigned equivalent of -1 using the return type of rte_lcore_id(). In other words: If the return type of rte_lcore_id() should change from 32 to 64 bit, LCORE_ID_ANY would be updated accordingly to UINT64_MAX.
> 
> Because of this assumption, I didn't use [(rte_lcore_id() + 1) & UINT32_MAX], but just [rte_lcore_id() + 1].
> 
> I struggled writing an appropriate comment without making it unacceptably long, but eventually gave up, and settled for the one-line comment in the structure only.
> 
> > 
> > You anyways need a conditional. An atomic add must be used in the
> > unregistered EAL thread case.  
> 
> Good point: The "+= n" must be atomic for non-isolated threads.
> 
> I just took a look at how software maintained stats are handled elsewhere, and the first thing I found, is the IOAT DMA driver, which also seems to be using non-atomic increment [1] regardless if used by a DPDK thread or not.
> 
> [1]: https://elixir.bootlin.com/dpdk/v22.11-rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> 
> However, doing it wrong elsewhere doesn't make it correct doing it wrong here too. :-)
> 
> Atomic increments are costly, so I would rather follow your suggestion and reintroduce the comparison. How about:
> 
> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
>     unsigned int __lcore_id = rte_lcore_id(); \
>     if (likely(__lcore_id < RTE_MAX_LCORE)) { \
>         (mp)->stats[__lcore_id].name += n; \
>     } else { \
>         rte_atomic64_add( \
>                 (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name), n);\
>     } \
> } while (0)
> 
> And the structure comment could be:
>  * Plus one, for non-DPDK threads.
> 

Or use rte_lcore_index() instead of rte_lcore_id()
  
Morten Brørup Nov. 2, 2022, 3:37 p.m. UTC | #4


> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, 2 November 2022 16.19
> To: Morten Brørup
> Cc: Mattias Rönnblom; olivier.matz@6wind.com;
> andrew.rybchenko@oktetlabs.ru; jerinj@marvell.com;
> bruce.richardson@intel.com; thomas@monjalon.net; dev@dpdk.org
> Subject: Re: [PATCH v2 2/3] mempool: include non-DPDK threads in
> statistics
> 
> On Wed, 2 Nov 2022 10:09:26 +0100
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > > From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> > > Sent: Wednesday, 2 November 2022 08.53
> > >
> > > On 2022-10-31 12:26, Morten Brørup wrote:
> > > > Offset the stats array index by one, and count non-DPDK threads
> at
> > > index
> > > > zero.
> > > >
> > > > This patch provides two benefits:
> > > > * Non-DPDK threads are also included in the statistics.
> > > > * A conditional in the fast path is removed. Static branch
> prediction
> > > was
> > > >    correct, so the performance improvement is negligible.
> > > >
> > > > v2:
> > > > * New. No v1 of this patch in the series.
> > > >
> > > > Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> > > > Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> > > > ---
> > > >   lib/mempool/rte_mempool.c |  2 +-
> > > >   lib/mempool/rte_mempool.h | 12 ++++++------
> > > >   2 files changed, 7 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/lib/mempool/rte_mempool.c
> b/lib/mempool/rte_mempool.c
> > > > index 62d1ce764e..e6208125e0 100644
> > > > --- a/lib/mempool/rte_mempool.c
> > > > +++ b/lib/mempool/rte_mempool.c
> > > > @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct
> rte_mempool
> > > *mp)
> > > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > > >   	rte_mempool_ops_get_info(mp, &info);
> > > >   	memset(&sum, 0, sizeof(sum));
> > > > -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> > > > +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1;
> lcore_id++) {
> > > >   		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> > > >   		sum.put_objs += mp->stats[lcore_id].put_objs;
> > > >   		sum.put_common_pool_bulk += mp-
> > > >stats[lcore_id].put_common_pool_bulk;
> > > > diff --git a/lib/mempool/rte_mempool.h
> b/lib/mempool/rte_mempool.h
> > > > index 9c4bf5549f..16e7e62e3c 100644
> > > > --- a/lib/mempool/rte_mempool.h
> > > > +++ b/lib/mempool/rte_mempool.h
> > > > @@ -238,8 +238,11 @@ struct rte_mempool {
> > > >   	struct rte_mempool_memhdr_list mem_list; /**< List of
> memory
> > > chunks */
> > > >
> > > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > > > -	/** Per-lcore statistics. */
> > > > -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> > > > +	/** Per-lcore statistics.
> > > > +	 *
> > > > +	 * Offset by one, to include non-DPDK threads.
> > > > +	 */
> > > > +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> > > >   #endif
> > > >   }  __rte_cache_aligned;
> > > >
> > > > @@ -304,10 +307,7 @@ struct rte_mempool {
> > > >    */
> > > >   #ifdef RTE_LIBRTE_MEMPOOL_STATS
> > > >   #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
> \
> > > > -		unsigned __lcore_id = rte_lcore_id();           \
> > > > -		if (__lcore_id < RTE_MAX_LCORE) {               \
> > > > -			mp->stats[__lcore_id].name += n;        \
> > > > -		}                                               \
> > > > +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> > >
> > > This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
> an
> > > unregistered non-EAL thread? Might be worth a comment, or better a
> > > rewrite with an explicit LCORE_ID_ANY comparison.
> >
> > The purpose of this patch is to avoid the comparison here.
> >
> > Yes, it relies on the wrap to zero, and these conditions:
> > 1. LCORE_ID_ANY being UINT32_MAX, and
> > 2. the return type of rte_lcore_id() being unsigned int, and
> > 3. unsigned int being uint32_t.
> >
> > When I wrote this, I considered it safe to assume that LCORE_ID_ANY
> will remain the unsigned equivalent of -1 using the return type of
> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
> accordingly to UINT64_MAX.
> >
> > Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
> UINT32_MAX], but just [rte_lcore_id() + 1].
> >
> > I struggled writing an appropriate comment without making it
> unacceptably long, but eventually gave up, and settled for the one-line
> comment in the structure only.
> >
> > >
> > > You anyways need a conditional. An atomic add must be used in the
> > > unregistered EAL thread case.
> >
> > Good point: The "+= n" must be atomic for non-isolated threads.
> >
> > I just took a look at how software maintained stats are handled
> elsewhere, and the first thing I found, is the IOAT DMA driver, which
> also seems to be using non-atomic increment [1] regardless if used by a
> DPDK thread or not.
> >
> > [1]: https://elixir.bootlin.com/dpdk/v22.11-
> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> >
> > However, doing it wrong elsewhere doesn't make it correct doing it
> wrong here too. :-)
> >
> > Atomic increments are costly, so I would rather follow your
> suggestion and reintroduce the comparison. How about:
> >
> > #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
> >     unsigned int __lcore_id = rte_lcore_id(); \
> >     if (likely(__lcore_id < RTE_MAX_LCORE)) { \
> >         (mp)->stats[__lcore_id].name += n; \
> >     } else { \
> >         rte_atomic64_add( \
> >                 (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name),
> n);\
> >     } \
> > } while (0)
> >
> > And the structure comment could be:
> >  * Plus one, for non-DPDK threads.
> >
> 
> Or use rte_lcore_index() instead of rte_lcore_id()
> 

rte_lcore_index() is not inline, so it would impact performance. And looking at its source code [2], it compares and branches anyway. It would effectively just be a long-winded way of converting the rte_lcore_id() return type from unsigned to signed int.

[2]: https://elixir.bootlin.com/dpdk/v22.11-rc2/source/lib/eal/common/eal_common_lcore.c#L27
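
For reference, the implementation in [2] is roughly the following
(paraphrased here, not copied verbatim):

int
rte_lcore_index(int lcore_id)
{
	/* reject out-of-range ids */
	if (unlikely(lcore_id >= RTE_MAX_LCORE))
		return -1;

	/* a negative id means "the calling thread" */
	if (lcore_id < 0) {
		if (rte_lcore_id() == LCORE_ID_ANY)
			return -1;
		lcore_id = (int)rte_lcore_id();
	}

	return lcore_config[lcore_id].core_index;
}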
  
Mattias Rönnblom Nov. 2, 2022, 5:53 p.m. UTC | #5
On 2022-11-02 10:09, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Wednesday, 2 November 2022 08.53
>>
>> On 2022-10-31 12:26, Morten Brørup wrote:
>>> Offset the stats array index by one, and count non-DPDK threads at
>> index
>>> zero.
>>>
>>> This patch provides two benefits:
>>> * Non-DPDK threads are also included in the statistics.
>>> * A conditional in the fast path is removed. Static branch prediction
>> was
>>>     correct, so the performance improvement is negligible.
>>>
>>> v2:
>>> * New. No v1 of this patch in the series.
>>>
>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
>>> ---
>>>    lib/mempool/rte_mempool.c |  2 +-
>>>    lib/mempool/rte_mempool.h | 12 ++++++------
>>>    2 files changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
>>> index 62d1ce764e..e6208125e0 100644
>>> --- a/lib/mempool/rte_mempool.c
>>> +++ b/lib/mempool/rte_mempool.c
>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool
>> *mp)
>>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>    	rte_mempool_ops_get_info(mp, &info);
>>>    	memset(&sum, 0, sizeof(sum));
>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
>>>    		sum.put_bulk += mp->stats[lcore_id].put_bulk;
>>>    		sum.put_objs += mp->stats[lcore_id].put_objs;
>>>    		sum.put_common_pool_bulk += mp-
>>> stats[lcore_id].put_common_pool_bulk;
>>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
>>> index 9c4bf5549f..16e7e62e3c 100644
>>> --- a/lib/mempool/rte_mempool.h
>>> +++ b/lib/mempool/rte_mempool.h
>>> @@ -238,8 +238,11 @@ struct rte_mempool {
>>>    	struct rte_mempool_memhdr_list mem_list; /**< List of memory
>> chunks */
>>>
>>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>> -	/** Per-lcore statistics. */
>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
>>> +	/** Per-lcore statistics.
>>> +	 *
>>> +	 * Offset by one, to include non-DPDK threads.
>>> +	 */
>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
>>>    #endif
>>>    }  __rte_cache_aligned;
>>>
>>> @@ -304,10 +307,7 @@ struct rte_mempool {
>>>     */
>>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>    #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
>>> -		unsigned __lcore_id = rte_lcore_id();           \
>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
>>> -			mp->stats[__lcore_id].name += n;        \
>>> -		}                                               \
>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
>>
>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for an
>> unregistered non-EAL thread? Might be worth a comment, or better a
>> rewrite with an explicit LCORE_ID_ANY comparison.
> 
> The purpose of this patch is to avoid the comparison here.
> 
> Yes, it relies on the wrap to zero, and these conditions:
> 1. LCORE_ID_ANY being UINT32_MAX, and
> 2. the return type of rte_lcore_id() being unsigned int, and
> 3. unsigned int being uint32_t.
> 
> When I wrote this, I considered it safe to assume that LCORE_ID_ANY will remain the unsigned equivalent of -1 using the return type of rte_lcore_id(). In other words: If the return type of rte_lcore_id() should change from 32 to 64 bit, LCORE_ID_ANY would be updated accordingly to UINT64_MAX.
> 
> Because of this assumption, I didn't use [(rte_lcore_id() + 1) & UINT32_MAX], but just [rte_lcore_id() + 1].
> 
> I struggled writing an appropriate comment without making it unacceptably long, but eventually gave up, and settled for the one-line comment in the structure only.
> 
>>
>> You anyways need a conditional. An atomic add must be used in the
>> unregistered EAL thread case.
> 
> Good point: The "+= n" must be atomic for non-isolated threads.
> 

Whether the various unregistered non-EAL threads are run on isolated 
cores or not does not matter.

> I just took a look at how software maintained stats are handled elsewhere, and the first thing I found, is the IOAT DMA driver, which also seems to be using non-atomic increment [1] regardless if used by a DPDK thread or not.
> 
> [1]: https://elixir.bootlin.com/dpdk/v22.11-rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> 
> However, doing it wrong elsewhere doesn't make it correct doing it wrong here too. :-)
> 
> Atomic increments are costly, so I would rather follow your suggestion and reintroduce the comparison. How about:
> 
> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
>      unsigned int __lcore_id = rte_lcore_id(); \
>      if (likely(__lcore_id < RTE_MAX_LCORE)) { \
>          (mp)->stats[__lcore_id].name += n; \
>      } else { \
>          rte_atomic64_add( \
>                  (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name), n);\
>      } \
> } while (0)
You are supposed to use GCC C11 intrinsics (e.g., __atomic_fetch_add()).

In addition: technically, you must use an atomic store for the EAL 
thread case (and an atomic load on the reader side), although there are 
tons of examples in DPDK where tearing is ignored. (The generated code 
will likely look the same.)
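
For illustration, a sketch of that store/load pairing for a
single-writer 64-bit counter ("ctr" and "n" are placeholders):

/* writer (the owning EAL thread); atomic store prevents store tearing */
__atomic_store_n(&ctr, ctr + n, __ATOMIC_RELAXED);

/* reader (e.g. a stats dump); atomic load prevents load tearing */
uint64_t val = __atomic_load_n(&ctr, __ATOMIC_RELAXED);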

DPDK coding conventions require there be no braces for a single statement.

Other than that, it looks good.

> 
> And the structure comment could be:
>   * Plus one, for non-DPDK threads.
> 

"Unregistered non-EAL threads". This is the term the EAL documentation uses.
  
Morten Brørup Nov. 3, 2022, 8:59 a.m. UTC | #6
> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Wednesday, 2 November 2022 18.53
> 
> On 2022-11-02 10:09, Morten Brørup wrote:
> >> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >> Sent: Wednesday, 2 November 2022 08.53
> >>
> >> On 2022-10-31 12:26, Morten Brørup wrote:
> >>> Offset the stats array index by one, and count non-DPDK threads at
> >> index
> >>> zero.
> >>>
> >>> This patch provides two benefits:
> >>> * Non-DPDK threads are also included in the statistics.
> >>> * A conditional in the fast path is removed. Static branch
> prediction
> >> was
> >>>     correct, so the performance improvement is negligible.
> >>>
> >>> v2:
> >>> * New. No v1 of this patch in the series.
> >>>
> >>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> >>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> >>> ---
> >>>    lib/mempool/rte_mempool.c |  2 +-
> >>>    lib/mempool/rte_mempool.h | 12 ++++++------
> >>>    2 files changed, 7 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
> >>> index 62d1ce764e..e6208125e0 100644
> >>> --- a/lib/mempool/rte_mempool.c
> >>> +++ b/lib/mempool/rte_mempool.c
> >>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool
> >> *mp)
> >>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>    	rte_mempool_ops_get_info(mp, &info);
> >>>    	memset(&sum, 0, sizeof(sum));
> >>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> >>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
> >>>    		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> >>>    		sum.put_objs += mp->stats[lcore_id].put_objs;
> >>>    		sum.put_common_pool_bulk += mp-
> >>> stats[lcore_id].put_common_pool_bulk;
> >>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
> >>> index 9c4bf5549f..16e7e62e3c 100644
> >>> --- a/lib/mempool/rte_mempool.h
> >>> +++ b/lib/mempool/rte_mempool.h
> >>> @@ -238,8 +238,11 @@ struct rte_mempool {
> >>>    	struct rte_mempool_memhdr_list mem_list; /**< List of
> memory
> >> chunks */
> >>>
> >>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>> -	/** Per-lcore statistics. */
> >>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> >>> +	/** Per-lcore statistics.
> >>> +	 *
> >>> +	 * Offset by one, to include non-DPDK threads.
> >>> +	 */
> >>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> >>>    #endif
> >>>    }  __rte_cache_aligned;
> >>>
> >>> @@ -304,10 +307,7 @@ struct rte_mempool {
> >>>     */
> >>>    #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>    #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
> \
> >>> -		unsigned __lcore_id = rte_lcore_id();           \
> >>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
> >>> -			mp->stats[__lcore_id].name += n;        \
> >>> -		}                                               \
> >>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> >>
> >> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
> an
> >> unregistered non-EAL thread? Might be worth a comment, or better a
> >> rewrite with an explicit LCORE_ID_ANY comparison.
> >
> > The purpose of this patch is to avoid the comparison here.
> >
> > Yes, it relies on the wrap to zero, and these conditions:
> > 1. LCORE_ID_ANY being UINT32_MAX, and
> > 2. the return type of rte_lcore_id() being unsigned int, and
> > 3. unsigned int being uint32_t.
> >
> > When I wrote this, I considered it safe to assume that LCORE_ID_ANY
> will remain the unsigned equivalent of -1 using the return type of
> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
> accordingly to UINT64_MAX.
> >
> > Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
> UINT32_MAX], but just [rte_lcore_id() + 1].
> >
> > I struggled writing an appropriate comment without making it
> unacceptably long, but eventually gave up, and settled for the one-line
> comment in the structure only.
> >
> >>
> >> You anyways need a conditional. An atomic add must be used in the
> >> unregistered EAL thread case.
> >
> > Good point: The "+= n" must be atomic for non-isolated threads.
> >
> 
> If the various unregistered non-EAL threads are run on isolated cores
> or not does not matter.

Agree: They all share the same index, and thus may race, regardless of which cores they are using.

Rephrasing: The "+= n" must be atomic for the unregistered non-EAL threads.

> 
> > I just took a look at how software maintained stats are handled
> elsewhere, and the first thing I found, is the IOAT DMA driver, which
> also seems to be using non-atomic increment [1] regardless if used by a
> DPDK thread or not.
> >
> > [1]: https://elixir.bootlin.com/dpdk/v22.11-
> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> >
> > However, doing it wrong elsewhere doesn't make it correct doing it
> wrong here too. :-)
> >
> > Atomic increments are costly, so I would rather follow your
> suggestion and reintroduce the comparison. How about:
> >
> > #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
> >      unsigned int __lcore_id = rte_lcore_id(); \
> >      if (likely(__lcore_id < RTE_MAX_LCORE)) { \
> >          (mp)->stats[__lcore_id].name += n; \
> >      } else { \
> >          rte_atomic64_add( \
> >                  (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name),
> n);\
> >      } \
> > } while (0)
> You are supposed to use GCC C11 intrinsics (e.g.,
> __atomic_fetch_add()).

Oops. I forgot!

This should be emphasized everywhere in the rte_atomic library, to prevent such mistakes.

> 
> In addition: technically, you must use an atomic store for the EAL
> thread case (and an atomic load on the reader side), although there are
> tons of examples in DPDK where tearing is ignored. (The generated code
> will likely look the same.)

The counters are 64 bit aligned, but tearing could happen on 32 bit architectures.

My initial reaction to your comment was to do it correctly on the EAL threads too, to avoid tearing there as well. However, there might be a performance cost on 32 bit architectures, so considering that these are only debug counters, I will accept the risk of tearing.

> 
> DPDK coding conventions require there be no braces for a single
> statement.

Will fix.

> 
> Other than that, it looks good.
> 
> >
> > And the structure comment could be:
> >   * Plus one, for non-DPDK threads.
> >
> 
> "Unregistered non-EAL threads". This is the term the EAL documentation
> uses.

Thank you for clarifying. I didn't follow that discussion, so the new terminology for threads hasn't stuck with me yet. :-)
  
Mattias Rönnblom Nov. 4, 2022, 8:58 a.m. UTC | #7
On 2022-11-03 09:59, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Wednesday, 2 November 2022 18.53
>>
>> On 2022-11-02 10:09, Morten Brørup wrote:
>>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>>>> Sent: Wednesday, 2 November 2022 08.53
>>>>
>>>> On 2022-10-31 12:26, Morten Brørup wrote:
>>>>> Offset the stats array index by one, and count non-DPDK threads at
>>>> index
>>>>> zero.
>>>>>
>>>>> This patch provides two benefits:
>>>>> * Non-DPDK threads are also included in the statistics.
>>>>> * A conditional in the fast path is removed. Static branch
>> prediction
>>>> was
>>>>>      correct, so the performance improvement is negligible.
>>>>>
>>>>> v2:
>>>>> * New. No v1 of this patch in the series.
>>>>>
>>>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
>>>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
>>>>> ---
>>>>>     lib/mempool/rte_mempool.c |  2 +-
>>>>>     lib/mempool/rte_mempool.h | 12 ++++++------
>>>>>     2 files changed, 7 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
>>>>> index 62d1ce764e..e6208125e0 100644
>>>>> --- a/lib/mempool/rte_mempool.c
>>>>> +++ b/lib/mempool/rte_mempool.c
>>>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct rte_mempool
>>>> *mp)
>>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>     	rte_mempool_ops_get_info(mp, &info);
>>>>>     	memset(&sum, 0, sizeof(sum));
>>>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
>>>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
>>>>>     		sum.put_bulk += mp->stats[lcore_id].put_bulk;
>>>>>     		sum.put_objs += mp->stats[lcore_id].put_objs;
>>>>>     		sum.put_common_pool_bulk += mp-
>>>>> stats[lcore_id].put_common_pool_bulk;
>>>>> diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
>>>>> index 9c4bf5549f..16e7e62e3c 100644
>>>>> --- a/lib/mempool/rte_mempool.h
>>>>> +++ b/lib/mempool/rte_mempool.h
>>>>> @@ -238,8 +238,11 @@ struct rte_mempool {
>>>>>     	struct rte_mempool_memhdr_list mem_list; /**< List of
>> memory
>>>> chunks */
>>>>>
>>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>> -	/** Per-lcore statistics. */
>>>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
>>>>> +	/** Per-lcore statistics.
>>>>> +	 *
>>>>> +	 * Offset by one, to include non-DPDK threads.
>>>>> +	 */
>>>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
>>>>>     #endif
>>>>>     }  __rte_cache_aligned;
>>>>>
>>>>> @@ -304,10 +307,7 @@ struct rte_mempool {
>>>>>      */
>>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>     #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
>> \
>>>>> -		unsigned __lcore_id = rte_lcore_id();           \
>>>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
>>>>> -			mp->stats[__lcore_id].name += n;        \
>>>>> -		}                                               \
>>>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
>>>>
>>>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
>> an
>>>> unregistered non-EAL thread? Might be worth a comment, or better a
>>>> rewrite with an explicit LCORE_ID_ANY comparison.
>>>
>>> The purpose of this patch is to avoid the comparison here.
>>>
>>> Yes, it relies on the wrap to zero, and these conditions:
>>> 1. LCORE_ID_ANY being UINT32_MAX, and
>>> 2. the return type of rte_lcore_id() being unsigned int, and
>>> 3. unsigned int being uint32_t.
>>>
>>> When I wrote this, I considered it safe to assume that LCORE_ID_ANY
>> will remain the unsigned equivalent of -1 using the return type of
>> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
>> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
>> accordingly to UINT64_MAX.
>>>
>>> Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
>> UINT32_MAX], but just [rte_lcore_id() + 1].
>>>
>>> I struggled writing an appropriate comment without making it
>> unacceptably long, but eventually gave up, and settled for the one-line
>> comment in the structure only.
>>>
>>>>
>>>> You anyways need a conditional. An atomic add must be used in the
>>>> unregistered EAL thread case.
>>>
>>> Good point: The "+= n" must be atomic for non-isolated threads.
>>>
>>
>> If the various unregistered non-EAL threads are run on isolated cores
>> or not does not matter.
> 
> Agree: They all share the same index, and thus may race, regardless which cores they are using.
> 
> Rephrasing: The "+= n" must be atomic for the unregistered non-EAL threads.
> 
>>
>>> I just took a look at how software maintained stats are handled
>> elsewhere, and the first thing I found, is the IOAT DMA driver, which
>> also seems to be using non-atomic increment [1] regardless if used by a
>> DPDK thread or not.
>>>
>>> [1]: https://elixir.bootlin.com/dpdk/v22.11-
>> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
>>>
>>> However, doing it wrong elsewhere doesn't make it correct doing it
>> wrong here too. :-)
>>>
>>> Atomic increments are costly, so I would rather follow your
>> suggestion and reintroduce the comparison. How about:
>>>
>>> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
>>>       unsigned int __lcore_id = rte_lcore_id(); \
>>>       if (likely(__lcore_id < RTE_MAX_LCORE)) { \
>>>           (mp)->stats[__lcore_id].name += n; \
>>>       } else { \
>>>           rte_atomic64_add( \
>>>                   (rte_atomic64_t*)&((mp)->stats[RTE_MAX_LCORE].name),
>> n);\
>>>       } \
>>> } while (0)
>> You are supposed to use GCC C11 intrinsics (e.g.,
>> __atomic_fetch_add()).
> 
> Ups. I forgot!
> 
> This should be emphasized everywhere in the rte_atomic library, to prevent such mistakes.
> 
>>
>> In addition: technically, you must use an atomic store for the EAL
>> thread case (and an atomic load on the reader side), although there are
>> tons of examples in DPDK where tearing is ignored. (The generated code
>> will likely look the same.)
> 
> The counters are 64 bit aligned, but tearing could happen on 32 bit architectures.
> 

The compiler is free to do it on any architecture, but I'm not sure if 
it happens much in practice.

> My initial reaction to your comment was to do it correctly on the EAL threads too, to avoid tearing there too. However, there might be a performance cost for 32 bit architectures, so I will consider that these are only debug counters, and accept the risk of tearing.
> 

What would that cost consist of?

In theory C11 atomics could be implemented "in software" (i.e., without 
the proper ISA-level guarantees, with locks), but does any of the 
DPDK-supported compiler/32-bit ISAs actually do so?

It's also not obvious that if there, for a certain 
compiler/ISA-combination, is a performance impact to pay for 
correctness, correctness would have to give way.

>>
>> DPDK coding conventions require there be no braces for a single
>> statement.
> 
> Will fix.
> 
>>
>> Other than that, it looks good.
>>
>>>
>>> And the structure comment could be:
>>>    * Plus one, for non-DPDK threads.
>>>
>>
>> "Unregistered non-EAL threads". This is the term the EAL documentation
>> uses.
> 
> Thank you for clarifying. I didn't follow that discussion, so the new terminology for threads hasn't stuck with me yet. :-)
>
  
Morten Brørup Nov. 4, 2022, 10:01 a.m. UTC | #8
> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Friday, 4 November 2022 09.59
> 
> On 2022-11-03 09:59, Morten Brørup wrote:
> >> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >> Sent: Wednesday, 2 November 2022 18.53
> >>
> >> On 2022-11-02 10:09, Morten Brørup wrote:
> >>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> >>>> Sent: Wednesday, 2 November 2022 08.53
> >>>>
> >>>> On 2022-10-31 12:26, Morten Brørup wrote:
> >>>>> Offset the stats array index by one, and count non-DPDK threads
> at
> >>>> index
> >>>>> zero.
> >>>>>
> >>>>> This patch provides two benefits:
> >>>>> * Non-DPDK threads are also included in the statistics.
> >>>>> * A conditional in the fast path is removed. Static branch
> >> prediction
> >>>> was
> >>>>>      correct, so the performance improvement is negligible.
> >>>>>
> >>>>> v2:
> >>>>> * New. No v1 of this patch in the series.
> >>>>>
> >>>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> >>>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
> >>>>> ---
> >>>>>     lib/mempool/rte_mempool.c |  2 +-
> >>>>>     lib/mempool/rte_mempool.h | 12 ++++++------
> >>>>>     2 files changed, 7 insertions(+), 7 deletions(-)
> >>>>>
> >>>>> diff --git a/lib/mempool/rte_mempool.c
> b/lib/mempool/rte_mempool.c
> >>>>> index 62d1ce764e..e6208125e0 100644
> >>>>> --- a/lib/mempool/rte_mempool.c
> >>>>> +++ b/lib/mempool/rte_mempool.c
> >>>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct
> rte_mempool
> >>>> *mp)
> >>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>>     	rte_mempool_ops_get_info(mp, &info);
> >>>>>     	memset(&sum, 0, sizeof(sum));
> >>>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> >>>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1;
> lcore_id++) {
> >>>>>     		sum.put_bulk += mp->stats[lcore_id].put_bulk;
> >>>>>     		sum.put_objs += mp->stats[lcore_id].put_objs;
> >>>>>     		sum.put_common_pool_bulk += mp-
> >>>>> stats[lcore_id].put_common_pool_bulk;
> >>>>> diff --git a/lib/mempool/rte_mempool.h
> b/lib/mempool/rte_mempool.h
> >>>>> index 9c4bf5549f..16e7e62e3c 100644
> >>>>> --- a/lib/mempool/rte_mempool.h
> >>>>> +++ b/lib/mempool/rte_mempool.h
> >>>>> @@ -238,8 +238,11 @@ struct rte_mempool {
> >>>>>     	struct rte_mempool_memhdr_list mem_list; /**< List of
> >> memory
> >>>> chunks */
> >>>>>
> >>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>> -	/** Per-lcore statistics. */
> >>>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
> >>>>> +	/** Per-lcore statistics.
> >>>>> +	 *
> >>>>> +	 * Offset by one, to include non-DPDK threads.
> >>>>> +	 */
> >>>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
> >>>>>     #endif
> >>>>>     }  __rte_cache_aligned;
> >>>>>
> >>>>> @@ -304,10 +307,7 @@ struct rte_mempool {
> >>>>>      */
> >>>>>     #ifdef RTE_LIBRTE_MEMPOOL_STATS
> >>>>>     #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
> >> \
> >>>>> -		unsigned __lcore_id = rte_lcore_id();           \
> >>>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
> >>>>> -			mp->stats[__lcore_id].name += n;        \
> >>>>> -		}                                               \
> >>>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
> >>>>
> >>>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
> >> an
> >>>> unregistered non-EAL thread? Might be worth a comment, or better a
> >>>> rewrite with an explicit LCORE_ID_ANY comparison.
> >>>
> >>> The purpose of this patch is to avoid the comparison here.
> >>>
> >>> Yes, it relies on the wrap to zero, and these conditions:
> >>> 1. LCORE_ID_ANY being UINT32_MAX, and
> >>> 2. the return type of rte_lcore_id() being unsigned int, and
> >>> 3. unsigned int being uint32_t.
> >>>
> >>> When I wrote this, I considered it safe to assume that LCORE_ID_ANY
> >> will remain the unsigned equivalent of -1 using the return type of
> >> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
> >> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
> >> accordingly to UINT64_MAX.
> >>>
> >>> Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
> >> UINT32_MAX], but just [rte_lcore_id() + 1].
> >>>
> >>> I struggled writing an appropriate comment without making it
> >> unacceptably long, but eventually gave up, and settled for the one-
> line
> >> comment in the structure only.
> >>>
> >>>>
> >>>> You anyways need a conditional. An atomic add must be used in the
> >>>> unregistered EAL thread case.
> >>>
> >>> Good point: The "+= n" must be atomic for non-isolated threads.
> >>>
> >>
> >> If the various unregistered non-EAL threads are run on isolated
> cores
> >> or not does not matter.
> >
> > Agree: They all share the same index, and thus may race, regardless
> which cores they are using.
> >
> > Rephrasing: The "+= n" must be atomic for the unregistered non-EAL
> threads.
> >
> >>
> >>> I just took a look at how software maintained stats are handled
> >> elsewhere, and the first thing I found, is the IOAT DMA driver,
> which
> >> also seems to be using non-atomic increment [1] regardless if used
> by a
> >> DPDK thread or not.
> >>>
> >>> [1]: https://elixir.bootlin.com/dpdk/v22.11-
> >> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
> >>>
> >>> However, doing it wrong elsewhere doesn't make it correct doing it
> >> wrong here too. :-)
> >>>
> >>> Atomic increments are costly, so I would rather follow your
> >> suggestion and reintroduce the comparison. How about:
> >>>
> >>> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
> >>>       unsigned int __lcore_id = rte_lcore_id(); \
> >>>       if (likely(__lcore_id < RTE_MAX_LCORE)) { \
> >>>           (mp)->stats[__lcore_id].name += n; \
> >>>       } else { \
> >>>           rte_atomic64_add( \
> >>>                   (rte_atomic64_t*)&((mp)-
> >stats[RTE_MAX_LCORE].name),
> >> n);\
> >>>       } \
> >>> } while (0)
> >> You are supposed to use GCC C11 intrinsics (e.g.,
> >> __atomic_fetch_add()).
> >
> > Ups. I forgot!
> >
> > This should be emphasized everywhere in the rte_atomic library, to
> prevent such mistakes.
> >
> >>
> >> In addition: technically, you must use an atomic store for the EAL
> >> thread case (and an atomic load on the reader side), although there
> are
> >> tons of examples in DPDK where tearing is ignored. (The generated
> code
> >> will likely look the same.)
> >
> > The counters are 64 bit aligned, but tearing could happen on 32 bit
> architectures.
> >
> 
> The compiler is free to do it on any architecture, but I'm not sure if
> it happens much in practice.
> 
> > My initial reaction to your comment was to do it correctly on the EAL
> threads too, to avoid tearing there too. However, there might be a
> performance cost for 32 bit architectures, so I will consider that
> these are only debug counters, and accept the risk of tearing.
> >
> 
> What would that cost consist of?

I tested it this morning on Godbolt: https://godbolt.org/z/fz7f3cv8b

ctr += n becomes:

        add     QWORD PTR ctr[rip], rdi

Whereas __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
 
        lock add        QWORD PTR ctr[rip], rdi
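
For reproducibility, the test program behind the two snippets above
would be something along these lines (a reconstruction, not the exact
Godbolt input):

#include <stdint.h>

uint64_t ctr;

void add_plain(uint64_t n)
{
	ctr += n; /* compiles to the plain add shown above */
}

void add_atomic(uint64_t n)
{
	__atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED); /* the lock add */
}

With -O2 on x86-64, the argument n arrives in rdi, which matches the
operands in the generated code shown.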

> 
> In theory C11 atomics could be implemented "in software" (i.e., without
> the proper ISA-level guarantees, with locks), but does any of the
> DPDK-supported compiler/32-bit ISAs actually do so?

Adding -m32 to the compiler options, ctr += n becomes:

        xor     edx, edx
        mov     eax, DWORD PTR [esp+4]
        add     DWORD PTR ctr, eax
        adc     DWORD PTR ctr+4, edx

And __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:

        push    edi
        xor     edi, edi
        push    esi
        push    ebx
        sub     esp, 8
        mov     esi, DWORD PTR [esp+24]
        mov     eax, DWORD PTR ctr
        mov     edx, DWORD PTR ctr+4
.L4:
        mov     ecx, eax
        mov     ebx, edx
        add     ecx, esi
        adc     ebx, edi
        mov     DWORD PTR [esp], ecx
        mov     DWORD PTR [esp+4], ebx
        mov     ebx, DWORD PTR [esp]
        mov     ecx, DWORD PTR [esp+4]
        lock cmpxchg8b  QWORD PTR ctr
        jne     .L4
        add     esp, 8
        pop     ebx
        pop     esi
        pop     edi

> 
> It's also not obvious that if there, for a certain
> compiler/ISA-combination, is a performance impact to pay for
> correctness, correctness would have to give way.

In this particular case, and because they are only debug counters, I will argue that non-atomic access is acceptable: DPDK generally uses non-atomic access to 64 bit statistics counters, and this was also the case for these counters before this patch.

I am planning to finish v3 of this patch series today, so I hope you can agree to close the atomicity discussion with another argument: Making statistics counters atomic should be elevated to a general patch across DPDK, and not part of this mempool patch series.

The v3 patch (which I am working on right now) counts atomically for the unregistered non-EAL threads:

#define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
		unsigned int __lcore_id = rte_lcore_id();                       \
		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
			(mp)->stats[__lcore_id].name += n;                      \
		else                                                            \
			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
					   n, __ATOMIC_RELAXED);                \
	} while (0)
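
For context, the macro is invoked from the mempool fast path roughly
like this (the counter names are examples):

	RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
	RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);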

PS: Excellent discussion - thank you for getting involved, Mattias.
  
Mattias Rönnblom Nov. 7, 2022, 7:26 a.m. UTC | #9
On 2022-11-04 11:01, Morten Brørup wrote:
>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>> Sent: Friday, 4 November 2022 09.59
>>
>> On 2022-11-03 09:59, Morten Brørup wrote:
>>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>>>> Sent: Wednesday, 2 November 2022 18.53
>>>>
>>>> On 2022-11-02 10:09, Morten Brørup wrote:
>>>>>> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
>>>>>> Sent: Wednesday, 2 November 2022 08.53
>>>>>>
>>>>>> On 2022-10-31 12:26, Morten Brørup wrote:
>>>>>>> Offset the stats array index by one, and count non-DPDK threads
>> at
>>>>>> index
>>>>>>> zero.
>>>>>>>
>>>>>>> This patch provides two benefits:
>>>>>>> * Non-DPDK threads are also included in the statistics.
>>>>>>> * A conditional in the fast path is removed. Static branch
>>>> prediction
>>>>>> was
>>>>>>>       correct, so the performance improvement is negligible.
>>>>>>>
>>>>>>> v2:
>>>>>>> * New. No v1 of this patch in the series.
>>>>>>>
>>>>>>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
>>>>>>> Signed-off-by: Morten Brørup <mb@smartsharesystems.com>
>>>>>>> ---
>>>>>>>      lib/mempool/rte_mempool.c |  2 +-
>>>>>>>      lib/mempool/rte_mempool.h | 12 ++++++------
>>>>>>>      2 files changed, 7 insertions(+), 7 deletions(-)
>>>>>>>
>>>>>>> diff --git a/lib/mempool/rte_mempool.c
>> b/lib/mempool/rte_mempool.c
>>>>>>> index 62d1ce764e..e6208125e0 100644
>>>>>>> --- a/lib/mempool/rte_mempool.c
>>>>>>> +++ b/lib/mempool/rte_mempool.c
>>>>>>> @@ -1272,7 +1272,7 @@ rte_mempool_dump(FILE *f, struct
>> rte_mempool
>>>>>> *mp)
>>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>>>      	rte_mempool_ops_get_info(mp, &info);
>>>>>>>      	memset(&sum, 0, sizeof(sum));
>>>>>>> -	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
>>>>>>> +	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1;
>> lcore_id++) {
>>>>>>>      		sum.put_bulk += mp->stats[lcore_id].put_bulk;
>>>>>>>      		sum.put_objs += mp->stats[lcore_id].put_objs;
>>>>>>>      		sum.put_common_pool_bulk += mp-
>>>>>>> stats[lcore_id].put_common_pool_bulk;
>>>>>>> diff --git a/lib/mempool/rte_mempool.h
>> b/lib/mempool/rte_mempool.h
>>>>>>> index 9c4bf5549f..16e7e62e3c 100644
>>>>>>> --- a/lib/mempool/rte_mempool.h
>>>>>>> +++ b/lib/mempool/rte_mempool.h
>>>>>>> @@ -238,8 +238,11 @@ struct rte_mempool {
>>>>>>>      	struct rte_mempool_memhdr_list mem_list; /**< List of
>>>> memory
>>>>>> chunks */
>>>>>>>
>>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>>> -	/** Per-lcore statistics. */
>>>>>>> -	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
>>>>>>> +	/** Per-lcore statistics.
>>>>>>> +	 *
>>>>>>> +	 * Offset by one, to include non-DPDK threads.
>>>>>>> +	 */
>>>>>>> +	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
>>>>>>>      #endif
>>>>>>>      }  __rte_cache_aligned;
>>>>>>>
>>>>>>> @@ -304,10 +307,7 @@ struct rte_mempool {
>>>>>>>       */
>>>>>>>      #ifdef RTE_LIBRTE_MEMPOOL_STATS
>>>>>>>      #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
>>>> \
>>>>>>> -		unsigned __lcore_id = rte_lcore_id();           \
>>>>>>> -		if (__lcore_id < RTE_MAX_LCORE) {               \
>>>>>>> -			mp->stats[__lcore_id].name += n;        \
>>>>>>> -		}                                               \
>>>>>>> +		(mp)->stats[rte_lcore_id() + 1].name += n;      \
>>>>>>
>>>>>> This relies on LCORE_ID_ANY being UINT32_MAX, and a wrap to 0, for
>>>> an
>>>>>> unregistered non-EAL thread? Might be worth a comment, or better a
>>>>>> rewrite with an explicit LCORE_ID_ANY comparison.
>>>>>
>>>>> The purpose of this patch is to avoid the comparison here.
>>>>>
>>>>> Yes, it relies on the wrap to zero, and these conditions:
>>>>> 1. LCORE_ID_ANY being UINT32_MAX, and
>>>>> 2. the return type of rte_lcore_id() being unsigned int, and
>>>>> 3. unsigned int being uint32_t.
>>>>>
>>>>> When I wrote this, I considered it safe to assume that LCORE_ID_ANY
>>>> will remain the unsigned equivalent of -1 using the return type of
>>>> rte_lcore_id(). In other words: If the return type of rte_lcore_id()
>>>> should change from 32 to 64 bit, LCORE_ID_ANY would be updated
>>>> accordingly to UINT64_MAX.
>>>>>
>>>>> Because of this assumption, I didn't use [(rte_lcore_id() + 1) &
>>>> UINT32_MAX], but just [rte_lcore_id() + 1].
>>>>>
>>>>> I struggled writing an appropriate comment without making it
>>>> unacceptably long, but eventually gave up, and settled for the one-
>> line
>>>> comment in the structure only.
>>>>>
>>>>>>
>>>>>> You anyways need a conditional. An atomic add must be used in the
>>>>>> unregistered EAL thread case.
>>>>>
>>>>> Good point: The "+= n" must be atomic for non-isolated threads.
>>>>>
>>>>
>>>> If the various unregistered non-EAL threads are run on isolated
>> cores
>>>> or not does not matter.
>>>
>>> Agree: They all share the same index, and thus may race, regardless
>> which cores they are using.
>>>
>>> Rephrasing: The "+= n" must be atomic for the unregistered non-EAL
>> threads.
>>>
>>>>
>>>>> I just took a look at how software maintained stats are handled
>>>> elsewhere, and the first thing I found, is the IOAT DMA driver,
>> which
>>>> also seems to be using non-atomic increment [1] regardless if used
>> by a
>>>> DPDK thread or not.
>>>>>
>>>>> [1]: https://elixir.bootlin.com/dpdk/v22.11-
>>>> rc2/source/drivers/dma/ioat/ioat_dmadev.c#L228
>>>>>
>>>>> However, doing it wrong elsewhere doesn't make it correct doing it
>>>> wrong here too. :-)
>>>>>
>>>>> Atomic increments are costly, so I would rather follow your
>>>> suggestion and reintroduce the comparison. How about:
>>>>>
>>>>> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do { \
>>>>>        unsigned int __lcore_id = rte_lcore_id(); \
>>>>>        if (likely(__lcore_id < RTE_MAX_LCORE)) { \
>>>>>            (mp)->stats[__lcore_id].name += n; \
>>>>>        } else { \
>>>>>            rte_atomic64_add( \
>>>>>                    (rte_atomic64_t*)&((mp)-
>>> stats[RTE_MAX_LCORE].name),
>>>> n);\
>>>>>        } \
>>>>> } while (0)
>>>> You are supposed to use GCC C11 intrinsics (e.g.,
>>>> __atomic_fetch_add()).
>>>
>>> Ups. I forgot!
>>>
>>> This should be emphasized everywhere in the rte_atomic library, to
>> prevent such mistakes.
>>>
>>>>
>>>> In addition: technically, you must use an atomic store for the EAL
>>>> thread case (and an atomic load on the reader side), although there
>> are
>>>> tons of examples in DPDK where tearing is ignored. (The generated
>> code
>>>> will likely look the same.)
>>>
>>> The counters are 64 bit aligned, but tearing could happen on 32 bit
>> architectures.
>>>
>>
>> The compiler is free to do it on any architecture, but I'm not sure if
>> it happens much in practice.
>>
>>> My initial reaction to your comment was to do it correctly on the EAL
>> threads too, to avoid tearing there too. However, there might be a
>> performance cost for 32 bit architectures, so I will consider that
>> these are only debug counters, and accept the risk of tearing.
>>>
>>
>> What would that cost consist of?
> 
> I have tested it this morning on Godbolt: https://godbolt.org/z/fz7f3cv8b
> 
> ctr += n becomes:
> 
>          add     QWORD PTR ctr[rip], rdi
> 
> Whereas __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
>   
>          lock add        QWORD PTR ctr[rip], rdi
> 

Since there is a single writer/producer, only the store need be atomic, 
not the complete load+add+store sequence.

__atomic_store_n(&ctr, ctr + 1, __ATOMIC_RELAXED);

Such a construct will not result in a lock add instruction.
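
Applied to the EAL-thread branch of the macro discussed above, that
would be roughly (a sketch, not tested):

__atomic_store_n(&(mp)->stats[__lcore_id].name,
		(mp)->stats[__lcore_id].name + n, __ATOMIC_RELAXED);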

I happen to have finished a draft version of a chapter on this topic, in 
the data plane book I'm writing:
https://ericsson.github.io/dataplanebook/stats/stats.html#per-core-counters

I will add a section on the "add an extra instance to deal with 
unregistered non-EAL threads" approach taken in your patch.

>>
>> In theory C11 atomics could be implemented "in software" (i.e., without
>> the proper ISA-level guarantees, with locks), but does any of the
>> DPDK-supported compiler/32-bit ISAs actually do so?
> 
> Adding -m32 to the compiler options, ctr += n becomes:
> 
>          xor     edx, edx
>          mov     eax, DWORD PTR [esp+4]
>          add     DWORD PTR ctr, eax
>          adc     DWORD PTR ctr+4, edx
> 
> And __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
> 
>          push    edi
>          xor     edi, edi
>          push    esi
>          push    ebx
>          sub     esp, 8
>          mov     esi, DWORD PTR [esp+24]
>          mov     eax, DWORD PTR ctr
>          mov     edx, DWORD PTR ctr+4
> .L4:
>          mov     ecx, eax
>          mov     ebx, edx
>          add     ecx, esi
>          adc     ebx, edi
>          mov     DWORD PTR [esp], ecx
>          mov     DWORD PTR [esp+4], ebx
>          mov     ebx, DWORD PTR [esp]
>          mov     ecx, DWORD PTR [esp+4]
>          lock cmpxchg8b  QWORD PTR ctr
>          jne     .L4
>          add     esp, 8
>          pop     ebx
>          pop     esi
>          pop     edi
> 
>>
>> It's also not obvious that, if there is a performance impact to pay
>> for correctness for a certain compiler/ISA combination, correctness
>> would have to give way.
> 
> In this particular case, and because they are only debug counters, I will argue that DPDK generally uses non-atomic access to 64-bit statistics counters. This was also the case for these counters before this patch.
> 
> I am planning to finish v3 of this patch series today, so I hope you can agree to close the atomicity discussion with another argument: Making statistics counters atomic should be elevated to a general patch across DPDK, and not part of this mempool patch series.
> 

As I think I indicated, I think this is a minor issue. I'm OK with the 
code as-written.

> The v3 patch (which I am working on right now) counts atomically for the unregistered non-EAL threads:
> 
> #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                                  \
> 		unsigned int __lcore_id = rte_lcore_id();                       \
> 		if (likely(__lcore_id < RTE_MAX_LCORE))                         \
> 			(mp)->stats[__lcore_id].name += n;                      \
> 		else                                                            \
> 			__atomic_fetch_add(&((mp)->stats[RTE_MAX_LCORE].name),  \
> 					   n, __ATOMIC_RELAXED);                \
> 	} while (0)
> 
> PS: Excellent discussion - thank you for getting involved, Mattias.
>
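
For context, typical call sites of this macro in rte_mempool.h bump a
bulk counter and an object counter together, along the lines of:

RTE_MEMPOOL_STAT_ADD(mp, put_bulk, 1);
RTE_MEMPOOL_STAT_ADD(mp, put_objs, n);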
  
Morten Brørup Nov. 7, 2022, 8:56 a.m. UTC | #10
> From: Mattias Rönnblom [mailto:hofors@lysator.liu.se]
> Sent: Monday, 7 November 2022 08.27
> 
> On 2022-11-04 11:01, Morten Brørup wrote:
> >> [snip]
> >> What would that cost consist of?
> >
> > I have tested it this morning on Godbolt:
> > https://godbolt.org/z/fz7f3cv8b
> >
> > ctr += n becomes:
> >
> >          add     QWORD PTR ctr[rip], rdi
> >
> > Whereas __atomic_fetch_add(&ctr, n, __ATOMIC_RELAXED) becomes:
> >
> >          lock add        QWORD PTR ctr[rip], rdi
> >
> 
> Since there is a single writer/producer, only the store need be atomic,
> not the complete load+add+store sequence.
> 
> __atomic_store_n(&ctr, ctr + 1, __ATOMIC_RELAXED);
> 
> Such a construct will not result in a lock add instruction.

I was trying to get rid of the conditional by handling both kinds of threads the same way. However, without the conditional, the __atomic_fetch_add needed by the unregistered non-EAL threads would spill over to the EAL threads. So I concluded that the conditional should remain.
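
In other words, the only branchless variant that stays correct would use the atomic add for every thread, along the lines of this sketch:

	__atomic_fetch_add(&(mp)->stats[rte_lcore_id() + 1].name, n,
			__ATOMIC_RELAXED);

That would put a lock-prefixed add in the EAL-thread fast path as well, which is exactly what the conditional avoids.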

> 
> I happen to have just finished a draft version of a chapter on this
> topic, in the data plane book I'm writing:
> https://ericsson.github.io/dataplanebook/stats/stats.html#per-core-counters

Thanks for the link. I just took a look at it, and it looks great!

> 
> I will add a section with this "add an extra instance to deal with
> unregistered non-EAL threads" approach taken in your patch.
> 
> > [snip]
> 
> As I think I indicated, I think this is a minor issue. I'm OK with the
> code as-written.

Thanks. We can always micro-optimize the unregistered non-EAL thread counters later. :-)

> 
> > The v3 patch (which I am working on right now) counts atomically for
> the unregistered non-EAL threads:
> >
> > #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {
> \
> > 		unsigned int __lcore_id = rte_lcore_id();
> \
> > 		if (likely(__lcore_id < RTE_MAX_LCORE))
> \
> > 			(mp)->stats[__lcore_id].name += n;
> \
> > 		else
> \
> > 			__atomic_fetch_add(&((mp)-
> >stats[RTE_MAX_LCORE].name),  \
> > 					   n, __ATOMIC_RELAXED);                \
> > 	} while (0)
> >
> > PS: Excellent discussion - thank you for getting involved, Mattias.
> >
  

Patch

diff --git a/lib/mempool/rte_mempool.c b/lib/mempool/rte_mempool.c
index 62d1ce764e..e6208125e0 100644
--- a/lib/mempool/rte_mempool.c
+++ b/lib/mempool/rte_mempool.c
@@ -1272,7 +1272,7 @@  rte_mempool_dump(FILE *f, struct rte_mempool *mp)
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 	rte_mempool_ops_get_info(mp, &info);
 	memset(&sum, 0, sizeof(sum));
-	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE + 1; lcore_id++) {
 		sum.put_bulk += mp->stats[lcore_id].put_bulk;
 		sum.put_objs += mp->stats[lcore_id].put_objs;
 		sum.put_common_pool_bulk += mp->stats[lcore_id].put_common_pool_bulk;
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 9c4bf5549f..16e7e62e3c 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -238,8 +238,11 @@  struct rte_mempool {
 	struct rte_mempool_memhdr_list mem_list; /**< List of memory chunks */
 
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
-	/** Per-lcore statistics. */
-	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
+	/** Per-lcore statistics.
+	 *
+	 * Offset by one, to include non-DPDK threads.
+	 */
+	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
 #endif
 }  __rte_cache_aligned;
 
@@ -304,10 +307,7 @@  struct rte_mempool {
  */
 #ifdef RTE_LIBRTE_MEMPOOL_STATS
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {                  \
-		unsigned __lcore_id = rte_lcore_id();           \
-		if (__lcore_id < RTE_MAX_LCORE) {               \
-			mp->stats[__lcore_id].name += n;        \
-		}                                               \
+		(mp)->stats[rte_lcore_id() + 1].name += n;      \
 	} while (0)
 #else
 #define RTE_MEMPOOL_STAT_ADD(mp, name, n) do {} while (0)
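
For illustration, here is a hypothetical reader-side helper (not part of
the patch) showing how the new indexing is read back, plus a
compile-time check of the wrap assumption, assuming C11 and
RTE_LIBRTE_MEMPOOL_STATS being defined:

#include <stdint.h>
#include <rte_lcore.h>
#include <rte_mempool.h>

/* [rte_lcore_id() + 1] relies on LCORE_ID_ANY + 1 wrapping to 0. */
_Static_assert(LCORE_ID_ANY == UINT32_MAX,
		"stats index wrap requires LCORE_ID_ANY == UINT32_MAX");

/* Slot 0 aggregates all unregistered non-EAL threads; slot i + 1
 * belongs to lcore i. Passing LCORE_ID_ANY therefore also works. */
static uint64_t
mempool_put_objs(const struct rte_mempool *mp, unsigned int lcore_id)
{
	return mp->stats[lcore_id + 1].put_objs;
}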