[RFC] rte_ring: don't use always inline

Message ID 20220505224547.394253-1-stephen@networkplumber.org (mailing list archive)
State Rejected, archived
Delegated to: Thomas Monjalon
Series: [RFC] rte_ring: don't use always inline

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS

Commit Message

Stephen Hemminger May 5, 2022, 10:45 p.m. UTC
  Forcing the compiler to inline with always inline can lead to worse
and sometimes broken code. Better to use the standard inline
keyword and let the compiler have some flexibility.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---

This is an RFC because the use of large-scale inlining is debatable.
This change may slow things down on some Gcc versions and
architectures.

If you follow the Linux kernel list, this has been a debated topic
over the years, with opinions for and against inlining, combined
with bad inlining in various Gcc versions.
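
For context, a minimal sketch of the difference (assuming the definition
of __rte_always_inline in rte_common.h, paraphrased below, which expands
to the GCC/Clang always_inline attribute):

    /* rte_common.h (paraphrased): force inlining regardless of cost */
    #define __rte_always_inline inline __attribute__((always_inline))

    /* Forced: GCC inlines this at every call site, even at -O0, and
     * fails the build if inlining is impossible. */
    static __rte_always_inline unsigned int
    forced_add(unsigned int a, unsigned int b) { return a + b; }

    /* Hinted: the inlining cost heuristics decide; the compiler is
     * free to emit an out-of-line call instead. */
    static inline unsigned int
    hinted_add(unsigned int a, unsigned int b) { return a + b; }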

 lib/ring/rte_ring.h         | 36 ++++++++++++++++++------------------
 lib/ring/rte_ring_c11_pvt.h |  6 +++---
 lib/ring/rte_ring_elem.h    | 36 ++++++++++++++++++------------------
 3 files changed, 39 insertions(+), 39 deletions(-)
  

Comments

Honnappa Nagarahalli May 5, 2022, 10:59 p.m. UTC | #1
Thanks Stephen. Do you see any performance difference with this change?

> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Thursday, May 5, 2022 5:46 PM
> To: dev@dpdk.org
> Cc: Stephen Hemminger <stephen@networkplumber.org>
> Subject: [RFC] rte_ring: don't use always inline
> 
> Forcing the compiler to inline with always inline can lead to worse and sometimes
> broken code. Better to use the standard inline keyword and let the compiler have
> some flexibility.
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> 
> This is an RFC because the use of large-scale inlining is debatable.
> This change may slow things down on some Gcc versions and architectures.
> 
> If you follow the Linux kernel list, this has been a debated topic over the years,
> with opinions for and against inlining, combined with bad inlining in various
> Gcc versions.
> 
>  lib/ring/rte_ring.h         | 36 ++++++++++++++++++------------------
>  lib/ring/rte_ring_c11_pvt.h |  6 +++---
>  lib/ring/rte_ring_elem.h    | 36 ++++++++++++++++++------------------
>  3 files changed, 39 insertions(+), 39 deletions(-)
> 
> diff --git a/lib/ring/rte_ring.h b/lib/ring/rte_ring.h index
> 980e92e59493..4a2588beed9e 100644
> --- a/lib/ring/rte_ring.h
> +++ b/lib/ring/rte_ring.h
> @@ -226,7 +226,7 @@ void rte_ring_dump(FILE *f, const struct rte_ring *r);
>   * @return
>   *   The number of objects enqueued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_mp_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
>  			 unsigned int n, unsigned int *free_space)  { @@ -
> 249,7 +249,7 @@ rte_ring_mp_enqueue_bulk(struct rte_ring *r, void * const
> *obj_table,
>   * @return
>   *   The number of objects enqueued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_sp_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
>  			 unsigned int n, unsigned int *free_space)  { @@ -
> 276,7 +276,7 @@ rte_ring_sp_enqueue_bulk(struct rte_ring *r, void * const
> *obj_table,
>   * @return
>   *   The number of objects enqueued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
>  		      unsigned int n, unsigned int *free_space)  { @@ -298,7
> +298,7 @@ rte_ring_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
>   *   - 0: Success; objects enqueued.
>   *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is
> enqueued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_mp_enqueue(struct rte_ring *r, void *obj)  {
>  	return rte_ring_mp_enqueue_elem(r, &obj, sizeof(void *)); @@ -315,7
> +315,7 @@ rte_ring_mp_enqueue(struct rte_ring *r, void *obj)
>   *   - 0: Success; objects enqueued.
>   *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is
> enqueued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_sp_enqueue(struct rte_ring *r, void *obj)  {
>  	return rte_ring_sp_enqueue_elem(r, &obj, sizeof(void *)); @@ -336,7
> +336,7 @@ rte_ring_sp_enqueue(struct rte_ring *r, void *obj)
>   *   - 0: Success; objects enqueued.
>   *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is
> enqueued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_enqueue(struct rte_ring *r, void *obj)  {
>  	return rte_ring_enqueue_elem(r, &obj, sizeof(void *)); @@ -360,7
> +360,7 @@ rte_ring_enqueue(struct rte_ring *r, void *obj)
>   * @return
>   *   The number of objects dequeued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_mc_dequeue_bulk(struct rte_ring *r, void **obj_table,
>  		unsigned int n, unsigned int *available)  { @@ -384,7 +384,7
> @@ rte_ring_mc_dequeue_bulk(struct rte_ring *r, void **obj_table,
>   * @return
>   *   The number of objects dequeued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_sc_dequeue_bulk(struct rte_ring *r, void **obj_table,
>  		unsigned int n, unsigned int *available)  { @@ -411,7 +411,7
> @@ rte_ring_sc_dequeue_bulk(struct rte_ring *r, void **obj_table,
>   * @return
>   *   The number of objects dequeued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_dequeue_bulk(struct rte_ring *r, void **obj_table, unsigned int n,
>  		unsigned int *available)
>  {
> @@ -434,7 +434,7 @@ rte_ring_dequeue_bulk(struct rte_ring *r, void
> **obj_table, unsigned int n,
>   *   - -ENOENT: Not enough entries in the ring to dequeue; no object is
>   *     dequeued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_mc_dequeue(struct rte_ring *r, void **obj_p)  {
>  	return rte_ring_mc_dequeue_elem(r, obj_p, sizeof(void *)); @@ -452,7
> +452,7 @@ rte_ring_mc_dequeue(struct rte_ring *r, void **obj_p)
>   *   - -ENOENT: Not enough entries in the ring to dequeue, no object is
>   *     dequeued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_sc_dequeue(struct rte_ring *r, void **obj_p)  {
>  	return rte_ring_sc_dequeue_elem(r, obj_p, sizeof(void *)); @@ -474,7
> +474,7 @@ rte_ring_sc_dequeue(struct rte_ring *r, void **obj_p)
>   *   - -ENOENT: Not enough entries in the ring to dequeue, no object is
>   *     dequeued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_dequeue(struct rte_ring *r, void **obj_p)  {
>  	return rte_ring_dequeue_elem(r, obj_p, sizeof(void *)); @@ -681,7
> +681,7 @@ struct rte_ring *rte_ring_lookup(const char *name);
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>  			 unsigned int n, unsigned int *free_space)  { @@ -
> 704,7 +704,7 @@ rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const
> *obj_table,
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>  			 unsigned int n, unsigned int *free_space)  { @@ -
> 731,7 +731,7 @@ rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const
> *obj_table,
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>  		      unsigned int n, unsigned int *free_space)  { @@ -759,7
> +759,7 @@ rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
>   * @return
>   *   - n: Actual number of objects dequeued, 0 if ring is empty
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
>  		unsigned int n, unsigned int *available)  { @@ -784,7 +784,7
> @@ rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
>   * @return
>   *   - n: Actual number of objects dequeued, 0 if ring is empty
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
>  		unsigned int n, unsigned int *available)  { @@ -811,7 +811,7
> @@ rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
>   * @return
>   *   - Number of objects dequeued
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
>  		unsigned int n, unsigned int *available)  { diff --git
> a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h index
> f895950df487..6972a9825cb7 100644
> --- a/lib/ring/rte_ring_c11_pvt.h
> +++ b/lib/ring/rte_ring_c11_pvt.h
> @@ -11,7 +11,7 @@
>  #ifndef _RTE_RING_C11_PVT_H_
>  #define _RTE_RING_C11_PVT_H_
> 
> -static __rte_always_inline void
> +static inline void
>  __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
>  		uint32_t new_val, uint32_t single, uint32_t enqueue)  { @@ -
> 50,7 +50,7 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t
> old_val,
>   *   Actual number of objects enqueued.
>   *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
>  		unsigned int n, enum rte_ring_queue_behavior behavior,
>  		uint32_t *old_head, uint32_t *new_head, @@ -126,7 +126,7
> @@ __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
>   *   - Actual number of objects dequeued.
>   *     If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  __rte_ring_move_cons_head(struct rte_ring *r, int is_sc,
>  		unsigned int n, enum rte_ring_queue_behavior behavior,
>  		uint32_t *old_head, uint32_t *new_head, diff --git
> a/lib/ring/rte_ring_elem.h b/lib/ring/rte_ring_elem.h index
> fb1edc9aad1f..35e110fc5b4b 100644
> --- a/lib/ring/rte_ring_elem.h
> +++ b/lib/ring/rte_ring_elem.h
> @@ -128,7 +128,7 @@ struct rte_ring *rte_ring_create_elem(const char
> *name, unsigned int esize,
>   * @return
>   *   The number of objects enqueued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_mp_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *free_space)  {
> @@ -157,7 +157,7 @@ rte_ring_mp_enqueue_bulk_elem(struct rte_ring *r,
> const void *obj_table,
>   * @return
>   *   The number of objects enqueued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_sp_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *free_space)  {
> @@ -191,7 +191,7 @@ rte_ring_sp_enqueue_bulk_elem(struct rte_ring *r,
> const void *obj_table,
>   * @return
>   *   The number of objects enqueued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *free_space)  {
> @@ -235,7 +235,7 @@ rte_ring_enqueue_bulk_elem(struct rte_ring *r, const
> void *obj_table,
>   *   - 0: Success; objects enqueued.
>   *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is
> enqueued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_mp_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)  {
>  	return rte_ring_mp_enqueue_bulk_elem(r, obj, esize, 1, NULL) ? 0 :
> @@ -259,7 +259,7 @@ rte_ring_mp_enqueue_elem(struct rte_ring *r, void
> *obj, unsigned int esize)
>   *   - 0: Success; objects enqueued.
>   *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is
> enqueued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_sp_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)  {
>  	return rte_ring_sp_enqueue_bulk_elem(r, obj, esize, 1, NULL) ? 0 :
> @@ -285,7 +285,7 @@ rte_ring_sp_enqueue_elem(struct rte_ring *r, void
> *obj, unsigned int esize)
>   *   - 0: Success; objects enqueued.
>   *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is
> enqueued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)  {
>  	return rte_ring_enqueue_bulk_elem(r, obj, esize, 1, NULL) ? 0 :
> @@ -314,7 +314,7 @@ rte_ring_enqueue_elem(struct rte_ring *r, void *obj,
> unsigned int esize)
>   * @return
>   *   The number of objects dequeued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_mc_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *available)  {
> @@ -342,7 +342,7 @@ rte_ring_mc_dequeue_bulk_elem(struct rte_ring *r,
> void *obj_table,
>   * @return
>   *   The number of objects dequeued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_sc_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *available)  {
> @@ -373,7 +373,7 @@ rte_ring_sc_dequeue_bulk_elem(struct rte_ring *r,
> void *obj_table,
>   * @return
>   *   The number of objects dequeued, either 0 or n
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *available)  {
> @@ -418,7 +418,7 @@ rte_ring_dequeue_bulk_elem(struct rte_ring *r, void
> *obj_table,
>   *   - -ENOENT: Not enough entries in the ring to dequeue; no object is
>   *     dequeued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_mc_dequeue_elem(struct rte_ring *r, void *obj_p,
>  				unsigned int esize)
>  {
> @@ -442,7 +442,7 @@ rte_ring_mc_dequeue_elem(struct rte_ring *r, void
> *obj_p,
>   *   - -ENOENT: Not enough entries in the ring to dequeue, no object is
>   *     dequeued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_sc_dequeue_elem(struct rte_ring *r, void *obj_p,
>  				unsigned int esize)
>  {
> @@ -470,7 +470,7 @@ rte_ring_sc_dequeue_elem(struct rte_ring *r, void
> *obj_p,
>   *   - -ENOENT: Not enough entries in the ring to dequeue, no object is
>   *     dequeued.
>   */
> -static __rte_always_inline int
> +static inline int
>  rte_ring_dequeue_elem(struct rte_ring *r, void *obj_p, unsigned int esize)  {
>  	return rte_ring_dequeue_bulk_elem(r, obj_p, esize, 1, NULL) ? 0 :
> @@ -499,7 +499,7 @@ rte_ring_dequeue_elem(struct rte_ring *r, void *obj_p,
> unsigned int esize)
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_mp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *free_space)  {
> @@ -528,7 +528,7 @@ rte_ring_mp_enqueue_burst_elem(struct rte_ring *r,
> const void *obj_table,
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_sp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *free_space)  {
> @@ -559,7 +559,7 @@ rte_ring_sp_enqueue_burst_elem(struct rte_ring *r,
> const void *obj_table,
>   * @return
>   *   - n: Actual number of objects enqueued.
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *free_space)  {
> @@ -609,7 +609,7 @@ rte_ring_enqueue_burst_elem(struct rte_ring *r, const
> void *obj_table,
>   * @return
>   *   - n: Actual number of objects dequeued, 0 if ring is empty
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_mc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *available)  {
> @@ -638,7 +638,7 @@ rte_ring_mc_dequeue_burst_elem(struct rte_ring *r,
> void *obj_table,
>   * @return
>   *   - n: Actual number of objects dequeued, 0 if ring is empty
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_sc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *available)  {
> @@ -669,7 +669,7 @@ rte_ring_sc_dequeue_burst_elem(struct rte_ring *r,
> void *obj_table,
>   * @return
>   *   - Number of objects dequeued
>   */
> -static __rte_always_inline unsigned int
> +static inline unsigned int
>  rte_ring_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
>  		unsigned int esize, unsigned int n, unsigned int *available)  {
> --
> 2.35.1
  
Stephen Hemminger May 5, 2022, 11:10 p.m. UTC | #2
On Thu, 5 May 2022 22:59:32 +0000
Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:

> Thanks Stephen. Do you see any performance difference with this change?
> 
> > -----Original Message-----
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Sent: Thursday, May 5, 2022 5:46 PM
> > To: dev@dpdk.org
> > Cc: Stephen Hemminger <stephen@networkplumber.org>
> > Subject: [RFC] rte_ring: don't use always inline
> > 
> > Forcing the compiler to inline with always inline can lead to worse and sometimes
> > broken code. Better to use the standard inline keyword and let the compiler have
> > some flexibility.
> > 
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > ---
> > 
> > This is an RFC because the use of large-scale inlining is debatable.
> > This change may slow things down on some Gcc versions and architectures.
> > 
> > If you follow the Linux kernel list, this has been a debated topic over the years,
> > with opinions for and against inlining, combined with bad inlining in various
> > Gcc versions.
> > 
> >  lib/ring/rte_ring.h         | 36 ++++++++++++++++++------------------
> >  lib/ring/rte_ring_c11_pvt.h |  6 +++---
> >  lib/ring/rte_ring_elem.h    | 36 ++++++++++++++++++------------------
> >  3 files changed, 39 insertions(+), 39 deletions(-)
> > 
> > <snip>
> 

I have not run performance tests and don't have the infrastructure available
to give a meaningful answer.  The application we use doesn't make much use
of rings.

This was more in response to the RISC-V issues.
  
Stephen Hemminger May 5, 2022, 11:16 p.m. UTC | #3
On Thu, 5 May 2022 16:10:27 -0700
Stephen Hemminger <stephen@networkplumber.org> wrote:

> On Thu, 5 May 2022 22:59:32 +0000
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com> wrote:
> 
> > Thanks Stephen. Do you see any performance difference with this change?
> >   
> > > -----Original Message-----
> > > From: Stephen Hemminger <stephen@networkplumber.org>
> > > Sent: Thursday, May 5, 2022 5:46 PM
> > > To: dev@dpdk.org
> > > Cc: Stephen Hemminger <stephen@networkplumber.org>
> > > Subject: [RFC] rte_ring: don't use always inline
> > > 
> > > Forcing the compiler to inline with always inline can lead to worse and sometimes
> > > broken code. Better to use the standard inline keyword and let the compiler have
> > > some flexibility.
> > > 
> > > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > > ---
> > > 
> > > This is an RFC because the use of large-scale inlining is debatable.
> > > This change may slow things down on some Gcc versions and architectures.
> > > 
> > > If you follow the Linux kernel list, this has been a debated topic over the years,
> > > with opinions for and against inlining, combined with bad inlining in various
> > > Gcc versions.
> > > 
> > >  lib/ring/rte_ring.h         | 36 ++++++++++++++++++------------------
> > >  lib/ring/rte_ring_c11_pvt.h |  6 +++---
> > >  lib/ring/rte_ring_elem.h    | 36 ++++++++++++++++++------------------
> > >  3 files changed, 39 insertions(+), 39 deletions(-)
> > > 
> > > <snip>
> >   
> 
> I have not run performance tests and don't have the infrastructure available
> to give a meaningful answer.  The application we use doesn't make much use
> of rings.
> 
> This was more in response to the RISC-V issues.

With GCC there is control over the level of inlining with compiler flags,
but it is definitely in the nerd-knob zone.

       -finline-limit=n
           By default, GCC limits the size of functions that can be inlined.  This flag
           allows coarse control of this limit.  n is the size of functions that can be
           inlined in number of pseudo instructions.

           Inlining is actually controlled by a number of parameters, which may be specified
           individually by using --param name=value.  The -finline-limit=n option sets some
           of these parameters as follows:

           max-inline-insns-single
               is set to n/2.

           max-inline-insns-auto
               is set to n/2.

           See below for a documentation of the individual parameters controlling inlining
           and for the defaults of these parameters.

           Note: there may be no value to -finline-limit that results in default behavior.

           Note: pseudo instruction represents, in this particular context, an abstract
           measurement of function's size.  In no way does it represent a count of assembly
           instructions and as such its exact meaning might change from one release to
           another.
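
As a hedged illustration of these knobs (the file name and flag value below
are made up for the example, not a recommendation): GCC's -Winline warns
when a plain "inline" hint is not honored, which can make the effect of
these parameters observable:

    /* probe.c: check whether the hint below is honored:
     *   gcc -O2 -Winline -c probe.c
     * and whether raising the limits changes the decision:
     *   gcc -O2 -Winline -finline-limit=1200 -c probe.c
     */
    static inline unsigned int
    big_hinted(unsigned int v)
    {
            unsigned int i, acc = v;

            /* bulky enough that the cost heuristics may balk */
            for (i = 0; i < 64; i++)
                    acc = acc * 2654435761u + i;
            return acc;
    }

    unsigned int caller(unsigned int v) { return big_hinted(v); }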
  
Honnappa Nagarahalli May 6, 2022, 1:37 a.m. UTC | #4
<snip>

> 
> > Thanks Stephen. Do you see any performance difference with this change?
> >
> > > -----Original Message-----
> > > From: Stephen Hemminger <stephen@networkplumber.org>
> > > Sent: Thursday, May 5, 2022 5:46 PM
> > > To: dev@dpdk.org
> > > Cc: Stephen Hemminger <stephen@networkplumber.org>
> > > Subject: [RFC] rte_ring: don't use always inline
> > >
> > > Forcing the compiler to inline with always inline can lead to worse and
> > > sometimes broken code. Better to use the standard inline keyword and
> > > let the compiler have some flexibility.
> > >
> > > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > > ---
> > >
> > > This is an RFC because the use of large-scale inlining is debatable.
> > > This change may slow things down on some Gcc versions and
> > > architectures.
> > >
> > > If you follow the Linux kernel list, this has been a debated topic over
> > > the years, with opinions for and against inlining, combined with bad
> > > inlining in various Gcc versions.
> > >
> > >  lib/ring/rte_ring.h         | 36 ++++++++++++++++++------------------
> > >  lib/ring/rte_ring_c11_pvt.h |  6 +++---
> > >  lib/ring/rte_ring_elem.h    | 36 ++++++++++++++++++------------------
> > >  3 files changed, 39 insertions(+), 39 deletions(-)
> > >
<snip>

> >
> 
> I have not run performance tests and don't have the infrastructure available to
> give a meaningful answer.  The application we use doesn't make much use of
> rings.
> 
> This was more in response to the RISC-V issues.
The RISC-V issue is due to a compiler bug. Should we be making these changes due to a compiler bug in one architecture?
  
Tyler Retzlaff May 6, 2022, 7:24 a.m. UTC | #5
On Thu, May 05, 2022 at 10:59:32PM +0000, Honnappa Nagarahalli wrote:
> Thanks Stephen. Do you see any performance difference with this change?

as a matter of due diligence i think a comparison should be made just
to be confident nothing is regressing.

i support this change in principle since it is generally accepted best
practice not to force inlining: it can defeat more valuable
optimizations that the compiler may make that the human can't see.
the optimizations may vary depending on compiler implementation.

force inlining should be used as a targeted measure rather than blanket
on every function and when in use probably needs to be periodically
reviewed and potentially removed as the code / compiler evolves.
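
as a sketch of what targeted forcing could look like (hypothetical
helper names here, not the actual rte_ring code): force only the small
hot-path helper and leave the wrapper as a plain hint.

#include <stdint.h>

/* tiny helper hit on every enqueue: cheap to force, and forcing it
 * keeps the hot path free of a call */
static inline __attribute__((always_inline)) uint32_t
ring_free_entries(uint32_t prod, uint32_t cons, uint32_t capacity)
{
        return capacity + cons - prod; /* unsigned wrap handles head > tail */
}

/* outer wrapper: let the compiler's heuristics decide */
static inline int
ring_has_room(uint32_t prod, uint32_t cons, uint32_t capacity, uint32_t n)
{
        return ring_free_entries(prod, cons, capacity) >= n;
}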

also, one other consideration: a particular compiler's force-inlining
intrinsic/builtin may permit inlining of functions even when they are
not declared in a header, i.e. a function from one library may be
inlined into another binary as a link-time optimization. although
everything here is in a header, so it's a bit moot.

i'd like to see this change go in if possible.

thanks

> 
> > -----Original Message-----
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Sent: Thursday, May 5, 2022 5:46 PM
> > To: dev@dpdk.org
> > Cc: Stephen Hemminger <stephen@networkplumber.org>
> > Subject: [RFC] rte_ring: don't use always inline
> > 
> > Forcing compiler to inline with always inline can lead to worse and sometimes
> > broken code. Better to use the standard inline keyword and let compiler have
> > some flexibilty.
> > 
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > ---
> > 
> > This is RFC because the use of large scale inlining is debatable.
> > This change may slow things down on some versions of Gcc and architectures.
> > 
> > If you follow Linux kernel list, this has been a debated topic over the years, with
> > opinions for and against inlining.
> > Combined with bad inlining in various Gcc versions.
> > 
> >  lib/ring/rte_ring.h         | 36 ++++++++++++++++++------------------
> >  lib/ring/rte_ring_c11_pvt.h |  6 +++---
> >  lib/ring/rte_ring_elem.h    | 36 ++++++++++++++++++------------------
> >  3 files changed, 39 insertions(+), 39 deletions(-)
> > 
> > <snip>
  
Honnappa Nagarahalli May 6, 2022, 3:12 p.m. UTC | #6
<snip>
> 
> On Thu, May 05, 2022 at 10:59:32PM +0000, Honnappa Nagarahalli wrote:
> > Thanks Stephen. Do you see any performance difference with this change?
> 
> as a matter of due diligence i think a comparison should be made just to be
> confident nothing is regressing.
> 
> i support this change in principle since it is generally accepted best practice
> not to force inlining, since doing so can defeat more valuable optimizations
> that the compiler could make but a human can't see.
> the optimizations may vary depending on compiler implementation.
> 
> force inlining should be used as a targeted measure rather than blanket on
> every function and when in use probably needs to be periodically reviewed and
> potentially removed as the code / compiler evolves.
> 
> also, one other consideration: a particular compiler's force-inlining
> intrinsic/builtin may permit inlining of functions even when they are not
> declared in a header, i.e. a function from one library may be inlined into
> another binary as a link-time optimization. although everything here is in
> a header, so it's a bit moot.
> 
> i'd like to see this change go in if possible.
Like Stephen mentions below, I am sure we will have a for and against discussion here.
As a DPDK community we have put performance front and center, so I would prefer to go down that route first.

> 
> thanks
> 
> >
> > > -----Original Message-----
> > > From: Stephen Hemminger <stephen@networkplumber.org>
> > > Sent: Thursday, May 5, 2022 5:46 PM
> > > To: dev@dpdk.org
> > > Cc: Stephen Hemminger <stephen@networkplumber.org>
> > > Subject: [RFC] rte_ring: don't use always inline
> > >
> > > Forcing compiler to inline with always inline can lead to worse and
> > > sometimes broken code. Better to use the standard inline keyword and
> > > let compiler have some flexibilty.
> > >
> > > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > > ---
> > >
> > > This is RFC because the use of large scale inlining is debatable.
> > > This change may slow things down on some versions of Gcc and
> architectures.
> > >
> > > If you follow Linux kernel list, this has been a debated topic over
> > > the years, with opinions for and against inlining.
> > > Combined with bad inlining in various Gcc versions.
> > >
> > >  lib/ring/rte_ring.h         | 36 ++++++++++++++++++------------------
> > >  lib/ring/rte_ring_c11_pvt.h |  6 +++---
> > >  lib/ring/rte_ring_elem.h    | 36 ++++++++++++++++++------------------
> > >  3 files changed, 39 insertions(+), 39 deletions(-)
> > >
> > > <snip>
  
Bruce Richardson May 6, 2022, 3:28 p.m. UTC | #7
On Fri, May 06, 2022 at 03:12:32PM +0000, Honnappa Nagarahalli wrote:
> <snip>
> > 
> > On Thu, May 05, 2022 at 10:59:32PM +0000, Honnappa Nagarahalli wrote:
> > > Thanks Stephen. Do you see any performance difference with this change?
> > 
> > as a matter of due diligence i think a comparison should be made just to be
> > confident nothing is regressing.
> > 
> > <snip>
> > 
> > i'd like to see this change go in if possible.
> Like Stephen mentions below, I am sure we will have a for and against discussion here.
> As a DPDK community we have put performance front and center, so I would prefer to go down that route first.
>

I ran some initial numbers with this patch, and the very quick summary of
what I've seen so far:

* Unit tests show no major differences, and while it depends on what
  specific number you are interested in, most seem within margin of error.
* Within unit tests, the one number I mostly look at when considering
  inlining is the "empty poll" cost, since I believe we should look to keep
  that as close to zero as possible. In the past I've seen that number jump
  from 3 cycles to 12 cycles due to missed inlining. In this case, it seems
  fine (a measurement sketch follows after this message).
* Ran a quick test with the eventdev_pipeline example app using SW eventdev,
  as a test of an actual app which is fairly ring-heavy [used 8 workers
  with 1000 cycles per packet hop]. (Thanks to Harry vH for this suggestion
  of a workload)
  * GCC 8 build - no difference observed
  * GCC 11 build - approx 2% perf reduction observed

As I said, these are just some quick rough numbers, and I'll try and get
some more numbers on a couple of different platforms, see if the small
reduction seen is consistent or not. I may also test a few different
combinations/options in the eventdev test.  It would be good if others also
tested on a few platforms available to them.

/Bruce
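
A sketch of how the "empty poll" number above can be measured; this is
a simplified stand-in for what ring_perf_autotest reports, not the
actual test code, and it assumes EAL is initialised and the ring r was
created and left empty:

#include <stdio.h>
#include <stdint.h>
#include <rte_cycles.h>
#include <rte_ring.h>

#define ITERATIONS (1 << 20)

static void
measure_empty_poll(struct rte_ring *r)
{
        void *objs[8];
        uint64_t start, end;
        unsigned int i;

        start = rte_rdtsc();
        for (i = 0; i < ITERATIONS; i++)
                /* the ring is empty, so every call returns 0 objects */
                rte_ring_dequeue_burst(r, objs, 8, NULL);
        end = rte_rdtsc();

        printf("empty poll: %.2f cycles\n",
               (double)(end - start) / ITERATIONS);
}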
  
Stephen Hemminger May 6, 2022, 3:41 p.m. UTC | #8
On Fri, 6 May 2022 00:24:34 -0700
Tyler Retzlaff <roretzla@linux.microsoft.com> wrote:

> On Thu, May 05, 2022 at 10:59:32PM +0000, Honnappa Nagarahalli wrote:
> > Thanks Stephen. Do you see any performance difference with this change?  
> 
> as a matter of due diligence i think a comparison should be made just
> to be confident nothing is regressing.
> 
> <snip>
> 


Some quick numbers from GCC 10.3 on a 2.7 GHz AMD CPU, running ring_perf_autotest:

Looks like always_inline is faster on the second run but plain inline is
slightly faster on the first run. Maybe the icache gets loaded for the
second run, but on the first pass the smaller code size helps.

With always_inline:
### Testing single element enq/deq ###
legacy APIs: SP/SC: single: 6.36
legacy APIs: MP/MC: single: 15.38

### Testing burst enq/deq ###
legacy APIs: SP/SC: burst (size: 8): 14.21
legacy APIs: SP/SC: burst (size: 32): 34.97
legacy APIs: MP/MC: burst (size: 8): 19.33
legacy APIs: MP/MC: burst (size: 32): 40.01

### Testing bulk enq/deq ###
legacy APIs: SP/SC: bulk (size: 8): 13.48
legacy APIs: SP/SC: bulk (size: 32): 34.07
legacy APIs: MP/MC: bulk (size: 8): 18.18
legacy APIs: MP/MC: bulk (size: 32): 37.32

### Testing empty bulk deq ###
legacy APIs: SP/SC: bulk (size: 8): 1.73
legacy APIs: MP/MC: bulk (size: 8): 1.74

### Testing using two hyperthreads ###
legacy APIs: SP/SC: bulk (size: 8): 4.57
legacy APIs: MP/MC: bulk (size: 8): 7.14
legacy APIs: SP/SC: bulk (size: 32): 2.14
legacy APIs: MP/MC: bulk (size: 32): 2.14

### Testing using two physical cores ###
legacy APIs: SP/SC: bulk (size: 8): 13.50
legacy APIs: MP/MC: bulk (size: 8): 41.68
legacy APIs: SP/SC: bulk (size: 32): 6.63
legacy APIs: MP/MC: bulk (size: 32): 8.75

### Testing using all worker nodes ###

Bulk enq/dequeue count on size 8
Core [0] count = 22792
Core [1] count = 22830
Core [2] count = 22896
Core [3] count = 22850
Core [4] count = 22688
Core [5] count = 22457
Core [6] count = 22815
Core [7] count = 22837
Core [8] count = 23045
Core [9] count = 23087
Core [10] count = 23066
Core [11] count = 23018
Core [12] count = 23132
Core [13] count = 23084
Core [14] count = 23216
Core [15] count = 23183
Total count (size: 8): 366996

Bulk enq/dequeue count on size 32
Core [0] count = 24069
Core [1] count = 24171
Core [2] count = 24101
Core [3] count = 24062
Core [4] count = 24078
Core [5] count = 24092
Core [6] count = 23837
Core [7] count = 24114
Core [8] count = 24189
Core [9] count = 24182
Core [10] count = 24118
Core [11] count = 24086
Core [12] count = 24182
Core [13] count = 24177
Core [14] count = 24224
Core [15] count = 24205
Total count (size: 32): 385887

### Testing single element enq/deq ###
elem APIs: element size 16B: SP/SC: single: 7.78
elem APIs: element size 16B: MP/MC: single: 18.00

### Testing burst enq/deq ###
elem APIs: element size 16B: SP/SC: burst (size: 8): 15.16
elem APIs: element size 16B: SP/SC: burst (size: 32): 46.38
elem APIs: element size 16B: MP/MC: burst (size: 8): 23.59
elem APIs: element size 16B: MP/MC: burst (size: 32): 41.65

### Testing bulk enq/deq ###
elem APIs: element size 16B: SP/SC: bulk (size: 8): 13.48
elem APIs: element size 16B: SP/SC: bulk (size: 32): 35.57
elem APIs: element size 16B: MP/MC: bulk (size: 8): 23.61
elem APIs: element size 16B: MP/MC: bulk (size: 32): 41.10

### Testing empty bulk deq ###
elem APIs: element size 16B: SP/SC: bulk (size: 8): 1.72
elem APIs: element size 16B: MP/MC: bulk (size: 8): 1.72

### Testing using two hyperthreads ###
elem APIs: element size 16B: SP/SC: bulk (size: 8): 4.51
elem APIs: element size 16B: MP/MC: bulk (size: 8): 7.16
elem APIs: element size 16B: SP/SC: bulk (size: 32): 2.91
elem APIs: element size 16B: MP/MC: bulk (size: 32): 2.98

### Testing using two physical cores ###
elem APIs: element size 16B: SP/SC: bulk (size: 8): 26.27
elem APIs: element size 16B: MP/MC: bulk (size: 8): 43.94
elem APIs: element size 16B: SP/SC: bulk (size: 32): 7.09
elem APIs: element size 16B: MP/MC: bulk (size: 32): 8.31

### Testing using all worker nodes ###

Bulk enq/dequeue count on size 8
Core [0] count = 22970
Core [1] count = 23068
Core [2] count = 22807
Core [3] count = 22823
Core [4] count = 22361
Core [5] count = 22732
Core [6] count = 22788
Core [7] count = 23005
Core [8] count = 22826
Core [9] count = 22882
Core [10] count = 22936
Core [11] count = 22971
Core [12] count = 23095
Core [13] count = 23087
Core [14] count = 23160
Core [15] count = 23155
Total count (size: 8): 366666

Bulk enq/dequeue count on size 32
Core [0] count = 22940
Core [1] count = 22964
Core [2] count = 22957
Core [3] count = 22934
Core [4] count = 22938
Core [5] count = 22826
Core [6] count = 22922
Core [7] count = 22927
Core [8] count = 23090
Core [9] count = 23042
Core [10] count = 23093
Core [11] count = 23004
Core [12] count = 22973
Core [13] count = 22947
Core [14] count = 23075
Core [15] count = 23021
Total count (size: 32): 367653
Test OK

With just inline:
### Testing single element enq/deq ###
legacy APIs: SP/SC: single: 4.61
legacy APIs: MP/MC: single: 15.15

### Testing burst enq/deq ###
legacy APIs: SP/SC: burst (size: 8): 13.20
legacy APIs: SP/SC: burst (size: 32): 33.10
legacy APIs: MP/MC: burst (size: 8): 18.06
legacy APIs: MP/MC: burst (size: 32): 37.53

### Testing bulk enq/deq ###
legacy APIs: SP/SC: bulk (size: 8): 11.62
legacy APIs: SP/SC: bulk (size: 32): 32.36
legacy APIs: MP/MC: bulk (size: 8): 18.07
legacy APIs: MP/MC: bulk (size: 32): 37.10

### Testing empty bulk deq ###
legacy APIs: SP/SC: bulk (size: 8): 1.69
legacy APIs: MP/MC: bulk (size: 8): 1.28

### Testing using two hyperthreads ###
legacy APIs: SP/SC: bulk (size: 8): 4.42
legacy APIs: MP/MC: bulk (size: 8): 7.15
legacy APIs: SP/SC: bulk (size: 32): 2.13
legacy APIs: MP/MC: bulk (size: 32): 2.12

### Testing using two physical cores ###
legacy APIs: SP/SC: bulk (size: 8): 13.59
legacy APIs: MP/MC: bulk (size: 8): 40.95
legacy APIs: SP/SC: bulk (size: 32): 6.53
legacy APIs: MP/MC: bulk (size: 32): 8.67

### Testing using all worker nodes ###

Bulk enq/dequeue count on size 8
Core [0] count = 21666
Core [1] count = 21693
Core [2] count = 21790
Core [3] count = 21706
Core [4] count = 21600
Core [5] count = 21575
Core [6] count = 21583
Core [7] count = 21600
Core [8] count = 21862
Core [9] count = 21872
Core [10] count = 21906
Core [11] count = 21938
Core [12] count = 22036
Core [13] count = 21965
Core [14] count = 21992
Core [15] count = 22052
Total count (size: 8): 348836

Bulk enq/dequeue count on size 32
Core [0] count = 23307
Core [1] count = 23352
Core [2] count = 23314
Core [3] count = 23304
Core [4] count = 23232
Core [5] count = 23244
Core [6] count = 23398
Core [7] count = 23308
Core [8] count = 23245
Core [9] count = 23278
Core [10] count = 22568
Core [11] count = 23308
Core [12] count = 23288
Core [13] count = 23262
Core [14] count = 23357
Core [15] count = 23366
Total count (size: 32): 372131

### Testing single element enq/deq ###
elem APIs: element size 16B: SP/SC: single: 9.93
elem APIs: element size 16B: MP/MC: single: 20.81

### Testing burst enq/deq ###
elem APIs: element size 16B: SP/SC: burst (size: 8): 15.50
elem APIs: element size 16B: SP/SC: burst (size: 32): 37.11
elem APIs: element size 16B: MP/MC: burst (size: 8): 25.13
elem APIs: element size 16B: MP/MC: burst (size: 32): 43.84

### Testing bulk enq/deq ###
elem APIs: element size 16B: SP/SC: bulk (size: 8): 17.50
elem APIs: element size 16B: SP/SC: bulk (size: 32): 38.51
elem APIs: element size 16B: MP/MC: bulk (size: 8): 24.98
elem APIs: element size 16B: MP/MC: bulk (size: 32): 45.97

### Testing empty bulk deq ###
elem APIs: element size 16B: SP/SC: bulk (size: 8): 2.55
elem APIs: element size 16B: MP/MC: bulk (size: 8): 2.55

### Testing using two hyperthreads ###
elem APIs: element size 16B: SP/SC: bulk (size: 8): 4.43
elem APIs: element size 16B: MP/MC: bulk (size: 8): 6.92
elem APIs: element size 16B: SP/SC: bulk (size: 32): 2.82
elem APIs: element size 16B: MP/MC: bulk (size: 32): 2.93

### Testing using two physical cores ###
elem APIs: element size 16B: SP/SC: bulk (size: 8): 26.51
elem APIs: element size 16B: MP/MC: bulk (size: 8): 42.32
elem APIs: element size 16B: SP/SC: bulk (size: 32): 6.94
elem APIs: element size 16B: MP/MC: bulk (size: 32): 8.15

### Testing using all worker nodes ###

Bulk enq/dequeue count on size 8
Core [0] count = 22850
Core [1] count = 22907
Core [2] count = 22799
Core [3] count = 22843
Core [4] count = 22293
Core [5] count = 22671
Core [6] count = 22294
Core [7] count = 22753
Core [8] count = 22878
Core [9] count = 22894
Core [10] count = 22886
Core [11] count = 22939
Core [12] count = 23076
Core [13] count = 22999
Core [14] count = 22910
Core [15] count = 22904
Total count (size: 8): 364896

Bulk enq/dequeue count on size 32
Core [0] count = 22279
Core [1] count = 22564
Core [2] count = 22659
Core [3] count = 22645
Core [4] count = 22629
Core [5] count = 22671
Core [6] count = 22721
Core [7] count = 22732
Core [8] count = 22668
Core [9] count = 22710
Core [10] count = 22691
Core [11] count = 22606
Core [12] count = 22699
Core [13] count = 22776
Core [14] count = 22792
Core [15] count = 22756
Total count (size: 32): 362598
Test OK
  
Stephen Hemminger May 6, 2022, 4:33 p.m. UTC | #9
On Fri, 6 May 2022 16:28:41 +0100
Bruce Richardson <bruce.richardson@intel.com> wrote:

> On Fri, May 06, 2022 at 03:12:32PM +0000, Honnappa Nagarahalli wrote:
> > <snip>  
> > > 
> > > <snip>
> > Like Stephen mentions below, I am sure we will have a for and against discussion here.
> > As a DPDK community we have put performance front and center, so I would prefer to go down that route first.
> >  
> 
> I ran some initial numbers with this patch, and the very quick summary of
> what I've seen so far:
> 
> * Unit tests show no major differences, and while it depends on what
>   specific number you are interested in, most seem within margin of error.
> * Within unit tests, the one number I mostly look at when considering
>   inlining is the "empty poll" cost, since I believe we should look to keep
>   that as close to zero as possible. In the past I've seen that number jump
> >   from 3 cycles to 12 cycles due to missed inlining. In this case, it seems
>   fine.
> * Ran a quick test with the eventdev_pipeline example app using SW eventdev,
>   as a test of an actual app which is fairly ring-heavy [used 8 workers
>   with 1000 cycles per packet hop]. (Thanks to Harry vH for this suggestion
>   of a workload)
>   * GCC 8 build - no difference observed
>   * GCC 11 build - approx 2% perf reduction observed
> 
> As I said, these are just some quick rough numbers, and I'll try and get
> some more numbers on a couple of different platforms, see if the small
> > reduction seen is consistent or not. I may also test a few different
> combinations/options in the eventdev test.  It would be good if others also
> tested on a few platforms available to them.
> 
> /Bruce

I wonder if a mixed approach might help where some key bits were marked
as more important to inline? Or setting compiler flags in build infra?
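
One possible shape for that mixed approach is to make the forcing a
build-time decision; RTE_RING_FORCE_INLINE below is a hypothetical
knob, not an existing DPDK option:

/* set (or not) by the build system, e.g. via meson c_args */
#ifdef RTE_RING_FORCE_INLINE
#define __rte_ring_inline inline __attribute__((always_inline))
#else
#define __rte_ring_inline inline
#endif

/* the fast-path functions would then all use the macro */
static __rte_ring_inline unsigned int
ring_demo_fn(unsigned int n) /* placeholder for the real functions */
{
        return n;
}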
  
Bruce Richardson May 6, 2022, 4:38 p.m. UTC | #10
On Fri, May 06, 2022 at 08:41:12AM -0700, Stephen Hemminger wrote:
> On Fri, 6 May 2022 00:24:34 -0700
> Tyler Retzlaff <roretzla@linux.microsoft.com> wrote:
> 
> > On Thu, May 05, 2022 at 10:59:32PM +0000, Honnappa Nagarahalli wrote:
> > > Thanks Stephen. Do you see any performance difference with this change?  
> > 
> > as a matter of due diligence i think a comparison should be made just
> > to be confident nothing is regressing.
> > 
> > <snip>
> 
> 
> Some quick numbers from GCC 10.3 on a 2.7 GHz AMD CPU, running ring_perf_autotest:
> 
> Looks like always_inline is faster on the second run but plain inline is
> slightly faster on the first run. Maybe the icache gets loaded for the
> second run, but on the first pass the smaller code size helps.
>
Interesting, I haven't observed that effect.

The main trouble with the unit tests is that there are so many possible
numbers to compare. We probably need to focus on a few to make sense of it
all. Running on an "Intel(R) Xeon(R) Gold 6330N CPU @ 2.20GHz" with Ubuntu
20.04 (GCC 9.4), I scanned through the numbers looking for significant
percentage differences. This one caught my eye due to the percentage difference:

Baseline value:
sudo ./build-baseline/app/test/dpdk-test -- ring_perf_autotest | grep "SP/SC: single:"
...
legacy APIs: SP/SC: single: 9.08
elem APIs: element size 16B: SP/SC: single: 11.13

With patch:
sudo ./build/app/test/dpdk-test -- ring_perf_autotest | grep "SP/SC: single:"
...
legacy APIs: SP/SC: single: 15.81
elem APIs: element size 16B: SP/SC: single: 21.14

So the SP/SC element enqueue cost went from 9-11 cycles to 15-21 cycles.
Percentage-wise, this seems like a lot, though in absolute terms it may not be.
Therefore, I think we'll need a decent amount of varied testing before
taking this patch.
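
For scale, assuming the 2.20 GHz clock quoted above, the absolute
difference is a few nanoseconds per enqueue:

#include <stdio.h>

int main(void)
{
        /* back-of-envelope: ns = cycles / GHz, at 2.20 GHz */
        printf("before: %.1f ns, after: %.1f ns\n",
               9.08 / 2.2, 15.81 / 2.2); /* ~4.1 ns vs ~7.2 ns */
        return 0;
}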

/Bruce
  
Bruce Richardson May 6, 2022, 4:39 p.m. UTC | #11
On Fri, May 06, 2022 at 09:33:41AM -0700, Stephen Hemminger wrote:
> On Fri, 6 May 2022 16:28:41 +0100
> Bruce Richardson <bruce.richardson@intel.com> wrote:
> 
> > On Fri, May 06, 2022 at 03:12:32PM +0000, Honnappa Nagarahalli wrote:
> > > <snip>  
> > > > 
> > > > On Thu, May 05, 2022 at 10:59:32PM +0000, Honnappa Nagarahalli wrote:  
> > > > > Thanks Stephen. Do you see any performance difference with this change?  
> > > > 
> > > > as a matter of due diligence i think a comparison should be made just to be
> > > > confident nothing is regressing.
> > > > 
> > > > i support this change in principle since it is generally accepted best practice
> > > > not to force inlining, as it can defeat more valuable optimizations that the
> > > > compiler may make but the human can't see.
> > > > the optimizations may vary depending on compiler implementation.
> > > > 
> > > > force inlining should be used as a targeted measure rather than a blanket applied
> > > > to every function, and when in use it probably needs to be periodically reviewed
> > > > and potentially removed as the code / compiler evolves.
> > > > 
> > > > one other consideration is that a particular compiler's force-inlining
> > > > intrinsic/builtin may permit inlining of functions even when not declared in a
> > > > header, i.e. a function from one library may be inlined into another binary as a
> > > > link-time optimization. although everything here is in a header, so it's a bit moot.
> > > > 
> > > > i'd like to see this change go in if possible.  
> > > Like Stephen mentions below, I am sure we will have a for and against discussion here.
> > > As a DPDK community we have put performance front and center, so I would prefer to go down that route first.
> > >  
> > 
> > I ran some initial numbers with this patch, and the very quick summary of
> > what I've seen so far:
> > 
> > * Unit tests show no major differences, and while it depends on what
> >   specific number you are interested in, most seem within margin of error.
> > * Within unit tests, the one number I mostly look at when considering
> >   inlining is the "empty poll" cost, since I believe we should look to keep
> >   that as close to zero as possible. In the past I've seen that number jump
> >   from 3 cycles to 12 cycles due to missed inlining. In this case, it seems
> >   fine.
> > * Ran a quick test with the eventdev_pipeline example app using SW eventdev,
> >   as a test of an actual app which is fairly ring-heavy [used 8 workers
> >   with 1000 cycles per packet hop]. (Thanks to Harry vH for this suggestion
> >   of a workload)
> >   * GCC 8 build - no difference observed
> >   * GCC 11 build - approx 2% perf reduction observed
> > 
> > As I said, these are just some quick rough numbers, and I'll try and get
> > some more numbers on a couple of different platforms, see if the small
> > reduction seen is consistent or not. I may also test a few different
> > combinations/options in the eventdev test.  It would be good if others also
> > tested on a few platforms available to them.
> > 
> > /Bruce
> 
> I wonder if a mixed approach might help where some key bits were marked
> as more important to inline? Or setting compiler flags in build infra?

Yep, could be a number of options.
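
One possible shape for the "compiler flags in build infra" option is a
dedicated inline macro for the ring fast path whose strength is chosen at
build time. A minimal sketch, assuming a hypothetical RTE_RING_FORCE_INLINE
build option (no such option exists today):

/* Hypothetical rte_ring_inline.h sketch; RTE_RING_FORCE_INLINE is an
 * assumed build option, not an existing one. */
#ifndef RTE_RING_INLINE_H
#define RTE_RING_INLINE_H

#include <rte_common.h>	/* __rte_always_inline */

#ifdef RTE_RING_FORCE_INLINE
#define __rte_ring_inline static __rte_always_inline
#else
#define __rte_ring_inline static inline
#endif

#endif /* RTE_RING_INLINE_H */

The fast-path definitions would then read "__rte_ring_inline unsigned int
rte_ring_sp_enqueue_bulk(...)", and users who can demonstrate a regression
could build with -DRTE_RING_FORCE_INLINE to restore today's behaviour.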
  
Konstantin Ananyev May 6, 2022, 5:48 p.m. UTC | #12
06/05/2022 17:39, Bruce Richardson wrote:
> On Fri, May 06, 2022 at 09:33:41AM -0700, Stephen Hemminger wrote:
>> On Fri, 6 May 2022 16:28:41 +0100
>> Bruce Richardson <bruce.richardson@intel.com> wrote:
>>
>>> On Fri, May 06, 2022 at 03:12:32PM +0000, Honnappa Nagarahalli wrote:
>>>> <snip>
>>>>>
>>>>> On Thu, May 05, 2022 at 10:59:32PM +0000, Honnappa Nagarahalli wrote:
>>>>>> Thanks Stephen. Do you see any performance difference with this change?
>>>>>
>>>>> as a matter of due diligence i think a comparison should be made just to be
>>>>> confident nothing is regressing.
>>>>>
>>>>> i support this change in principle since it is generally accepted best practice
>>>>> not to force inlining, as it can defeat more valuable optimizations that the
>>>>> compiler may make but the human can't see.
>>>>> the optimizations may vary depending on compiler implementation.
>>>>>
>>>>> force inlining should be used as a targeted measure rather than a blanket applied
>>>>> to every function, and when in use it probably needs to be periodically reviewed
>>>>> and potentially removed as the code / compiler evolves.
>>>>>
>>>>> one other consideration is that a particular compiler's force-inlining
>>>>> intrinsic/builtin may permit inlining of functions even when not declared in a
>>>>> header, i.e. a function from one library may be inlined into another binary as a
>>>>> link-time optimization. although everything here is in a header, so it's a bit moot.
>>>>>
>>>>> i'd like to see this change go in if possible.
>>>> Like Stephen mentions below, I am sure we will have a for and against discussion here.
>>>> As a DPDK community we have put performance front and center, so I would prefer to go down that route first.
>>>>   
>>>
>>> I ran some initial numbers with this patch, and the very quick summary of
>>> what I've seen so far:
>>>
>>> * Unit tests show no major differences, and while it depends on what
>>>    specific number you are interested in, most seem within margin of error.
>>> * Within unit tests, the one number I mostly look at when considering
>>>    inlining is the "empty poll" cost, since I believe we should look to keep
>>>    that as close to zero as possible. In the past I've seen that number jump
>>>    from 3 cycles to 12 cycles due to missed inlining. In this case, it seems
>>>    fine.
>>> * Ran a quick test with the eventdev_pipeline example app using SW eventdev,
>>>    as a test of an actual app which is fairly ring-heavy [used 8 workers
>>>    with 1000 cycles per packet hop]. (Thanks to Harry vH for this suggestion
>>>    of a workload)
>>>    * GCC 8 build - no difference observed
>>>    * GCC 11 build - approx 2% perf reduction observed

Just to note that apart from ring_perf_autotest, there also exists
ring_stress_autotest, which tries to do some stress-testing in hopefully
more realistic usage scenarios.
Might be worth considering when benchmarking.

>>>
>>> As I said, these are just some quick rough numbers, and I'll try and get
>>> some more numbers on a couple of different platforms, see if the small
>>> reduction seen is consistent or not. I may also test a few different
>>> combinations/options in the eventdev test.  It would be good if others also
>>> tested on a few platforms available to them.
>>>
>>> /Bruce
>>
>> I wonder if a mixed approach might help where some key bits were marked
>> as more important to inline? Or setting compiler flags in build infra?
> 
> Yep, could be a number of options.
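
The "targeted measure" raised earlier in the thread is the other obvious
option: demote the public wrappers to plain inline but keep the hint on the
small private helpers on the critical path. A self-contained sketch with
made-up names (the real candidates would be helpers like
__rte_ring_update_tail in rte_ring_c11_pvt.h):

#include <stdint.h>
#include <rte_common.h>

struct demo_headtail {
	volatile uint32_t head;
	volatile uint32_t tail;
};

/* Tiny critical-path helper: forcing the inline here is cheap and targeted. */
static __rte_always_inline void
demo_update_tail(struct demo_headtail *ht, uint32_t new_val)
{
	ht->tail = new_val;
}

/* Public-style wrapper: plain inline, let the compiler decide. */
static inline void
demo_commit(struct demo_headtail *ht, uint32_t new_val)
{
	demo_update_tail(ht, new_val);
}

Either way, as noted above, ring_perf_autotest and ring_stress_autotest runs
across several compilers would be needed before narrowing down the choice.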
  

Patch

diff --git a/lib/ring/rte_ring.h b/lib/ring/rte_ring.h
index 980e92e59493..4a2588beed9e 100644
--- a/lib/ring/rte_ring.h
+++ b/lib/ring/rte_ring.h
@@ -226,7 +226,7 @@  void rte_ring_dump(FILE *f, const struct rte_ring *r);
  * @return
  *   The number of objects enqueued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_mp_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -249,7 +249,7 @@  rte_ring_mp_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
  * @return
  *   The number of objects enqueued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_sp_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -276,7 +276,7 @@  rte_ring_sp_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
  * @return
  *   The number of objects enqueued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
 		      unsigned int n, unsigned int *free_space)
 {
@@ -298,7 +298,7 @@  rte_ring_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
  *   - 0: Success; objects enqueued.
  *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_mp_enqueue(struct rte_ring *r, void *obj)
 {
 	return rte_ring_mp_enqueue_elem(r, &obj, sizeof(void *));
@@ -315,7 +315,7 @@  rte_ring_mp_enqueue(struct rte_ring *r, void *obj)
  *   - 0: Success; objects enqueued.
  *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_sp_enqueue(struct rte_ring *r, void *obj)
 {
 	return rte_ring_sp_enqueue_elem(r, &obj, sizeof(void *));
@@ -336,7 +336,7 @@  rte_ring_sp_enqueue(struct rte_ring *r, void *obj)
  *   - 0: Success; objects enqueued.
  *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_enqueue(struct rte_ring *r, void *obj)
 {
 	return rte_ring_enqueue_elem(r, &obj, sizeof(void *));
@@ -360,7 +360,7 @@  rte_ring_enqueue(struct rte_ring *r, void *obj)
  * @return
  *   The number of objects dequeued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_mc_dequeue_bulk(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -384,7 +384,7 @@  rte_ring_mc_dequeue_bulk(struct rte_ring *r, void **obj_table,
  * @return
  *   The number of objects dequeued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_sc_dequeue_bulk(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -411,7 +411,7 @@  rte_ring_sc_dequeue_bulk(struct rte_ring *r, void **obj_table,
  * @return
  *   The number of objects dequeued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_dequeue_bulk(struct rte_ring *r, void **obj_table, unsigned int n,
 		unsigned int *available)
 {
@@ -434,7 +434,7 @@  rte_ring_dequeue_bulk(struct rte_ring *r, void **obj_table, unsigned int n,
  *   - -ENOENT: Not enough entries in the ring to dequeue; no object is
  *     dequeued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_mc_dequeue(struct rte_ring *r, void **obj_p)
 {
 	return rte_ring_mc_dequeue_elem(r, obj_p, sizeof(void *));
@@ -452,7 +452,7 @@  rte_ring_mc_dequeue(struct rte_ring *r, void **obj_p)
  *   - -ENOENT: Not enough entries in the ring to dequeue, no object is
  *     dequeued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_sc_dequeue(struct rte_ring *r, void **obj_p)
 {
 	return rte_ring_sc_dequeue_elem(r, obj_p, sizeof(void *));
@@ -474,7 +474,7 @@  rte_ring_sc_dequeue(struct rte_ring *r, void **obj_p)
  *   - -ENOENT: Not enough entries in the ring to dequeue, no object is
  *     dequeued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_dequeue(struct rte_ring *r, void **obj_p)
 {
 	return rte_ring_dequeue_elem(r, obj_p, sizeof(void *));
@@ -681,7 +681,7 @@  struct rte_ring *rte_ring_lookup(const char *name);
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -704,7 +704,7 @@  rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
@@ -731,7 +731,7 @@  rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 		      unsigned int n, unsigned int *free_space)
 {
@@ -759,7 +759,7 @@  rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -784,7 +784,7 @@  rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
@@ -811,7 +811,7 @@  rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
  * @return
  *   - Number of objects dequeued
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
index f895950df487..6972a9825cb7 100644
--- a/lib/ring/rte_ring_c11_pvt.h
+++ b/lib/ring/rte_ring_c11_pvt.h
@@ -11,7 +11,7 @@ 
 #ifndef _RTE_RING_C11_PVT_H_
 #define _RTE_RING_C11_PVT_H_
 
-static __rte_always_inline void
+static inline void
 __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
 		uint32_t new_val, uint32_t single, uint32_t enqueue)
 {
@@ -50,7 +50,7 @@  __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
  *   Actual number of objects enqueued.
  *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
 		unsigned int n, enum rte_ring_queue_behavior behavior,
 		uint32_t *old_head, uint32_t *new_head,
@@ -126,7 +126,7 @@  __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
  *   - Actual number of objects dequeued.
  *     If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 __rte_ring_move_cons_head(struct rte_ring *r, int is_sc,
 		unsigned int n, enum rte_ring_queue_behavior behavior,
 		uint32_t *old_head, uint32_t *new_head,
diff --git a/lib/ring/rte_ring_elem.h b/lib/ring/rte_ring_elem.h
index fb1edc9aad1f..35e110fc5b4b 100644
--- a/lib/ring/rte_ring_elem.h
+++ b/lib/ring/rte_ring_elem.h
@@ -128,7 +128,7 @@  struct rte_ring *rte_ring_create_elem(const char *name, unsigned int esize,
  * @return
  *   The number of objects enqueued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_mp_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -157,7 +157,7 @@  rte_ring_mp_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
  * @return
  *   The number of objects enqueued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_sp_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -191,7 +191,7 @@  rte_ring_sp_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
  * @return
  *   The number of objects enqueued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -235,7 +235,7 @@  rte_ring_enqueue_bulk_elem(struct rte_ring *r, const void *obj_table,
  *   - 0: Success; objects enqueued.
  *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_mp_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)
 {
 	return rte_ring_mp_enqueue_bulk_elem(r, obj, esize, 1, NULL) ? 0 :
@@ -259,7 +259,7 @@  rte_ring_mp_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)
  *   - 0: Success; objects enqueued.
  *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_sp_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)
 {
 	return rte_ring_sp_enqueue_bulk_elem(r, obj, esize, 1, NULL) ? 0 :
@@ -285,7 +285,7 @@  rte_ring_sp_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)
  *   - 0: Success; objects enqueued.
  *   - -ENOBUFS: Not enough room in the ring to enqueue; no object is enqueued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)
 {
 	return rte_ring_enqueue_bulk_elem(r, obj, esize, 1, NULL) ? 0 :
@@ -314,7 +314,7 @@  rte_ring_enqueue_elem(struct rte_ring *r, void *obj, unsigned int esize)
  * @return
  *   The number of objects dequeued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_mc_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *available)
 {
@@ -342,7 +342,7 @@  rte_ring_mc_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
  * @return
  *   The number of objects dequeued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_sc_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *available)
 {
@@ -373,7 +373,7 @@  rte_ring_sc_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
  * @return
  *   The number of objects dequeued, either 0 or n
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *available)
 {
@@ -418,7 +418,7 @@  rte_ring_dequeue_bulk_elem(struct rte_ring *r, void *obj_table,
  *   - -ENOENT: Not enough entries in the ring to dequeue; no object is
  *     dequeued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_mc_dequeue_elem(struct rte_ring *r, void *obj_p,
 				unsigned int esize)
 {
@@ -442,7 +442,7 @@  rte_ring_mc_dequeue_elem(struct rte_ring *r, void *obj_p,
  *   - -ENOENT: Not enough entries in the ring to dequeue, no object is
  *     dequeued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_sc_dequeue_elem(struct rte_ring *r, void *obj_p,
 				unsigned int esize)
 {
@@ -470,7 +470,7 @@  rte_ring_sc_dequeue_elem(struct rte_ring *r, void *obj_p,
  *   - -ENOENT: Not enough entries in the ring to dequeue, no object is
  *     dequeued.
  */
-static __rte_always_inline int
+static inline int
 rte_ring_dequeue_elem(struct rte_ring *r, void *obj_p, unsigned int esize)
 {
 	return rte_ring_dequeue_bulk_elem(r, obj_p, esize, 1, NULL) ? 0 :
@@ -499,7 +499,7 @@  rte_ring_dequeue_elem(struct rte_ring *r, void *obj_p, unsigned int esize)
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_mp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -528,7 +528,7 @@  rte_ring_mp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_sp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -559,7 +559,7 @@  rte_ring_sp_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
  * @return
  *   - n: Actual number of objects enqueued.
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *free_space)
 {
@@ -609,7 +609,7 @@  rte_ring_enqueue_burst_elem(struct rte_ring *r, const void *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_mc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *available)
 {
@@ -638,7 +638,7 @@  rte_ring_mc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
  * @return
  *   - n: Actual number of objects dequeued, 0 if ring is empty
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_sc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *available)
 {
@@ -669,7 +669,7 @@  rte_ring_sc_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
  * @return
  *   - Number of objects dequeued
  */
-static __rte_always_inline unsigned int
+static inline unsigned int
 rte_ring_dequeue_burst_elem(struct rte_ring *r, void *obj_table,
 		unsigned int esize, unsigned int n, unsigned int *available)
 {