[v3,12/12] service: relax barriers with C11 atomic operations

Message ID 1584407863-774-13-git-send-email-phil.yang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: generic rte atomic APIs deprecate proposal

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/travis-robot success Travis build: passed
ci/Intel-compilation fail Compilation issues

Commit Message

Phil Yang March 17, 2020, 1:17 a.m. UTC
  The service library uses a lot of rte_smp_r/wmb barriers to guarantee
inter-thread visibility of shared state. This patch relaxes these barriers
by using C11 atomic one-way barrier operations.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
 lib/librte_eal/common/rte_service.c | 45 ++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 20 deletions(-)
  

Comments

Van Haaren, Harry April 3, 2020, 11:58 a.m. UTC | #1
> From: Phil Yang <phil.yang@arm.com>
> Sent: Tuesday, March 17, 2020 1:18 AM
> To: thomas@monjalon.net; Van Haaren, Harry <harry.van.haaren@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>;
> stephen@networkplumber.org; maxime.coquelin@redhat.com; dev@dpdk.org
> Cc: david.marchand@redhat.com; jerinj@marvell.com; hemant.agrawal@nxp.com;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com; ruifeng.wang@arm.com;
> joyce.kong@arm.com; nd@arm.com
> Subject: [PATCH v3 12/12] service: relax barriers with C11 atomic operations
> 
> The service library uses a lot of rte_smp_r/wmb barriers to guarantee
> inter-thread visibility of shared state. This patch relaxes these barriers
> by using C11 atomic one-way barrier operations.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
>  lib/librte_eal/common/rte_service.c | 45 ++++++++++++++++++++-----------------
>  1 file changed, 25 insertions(+), 20 deletions(-)
> 
> diff --git a/lib/librte_eal/common/rte_service.c
> b/lib/librte_eal/common/rte_service.c
> index c033224..d31663e 100644
> --- a/lib/librte_eal/common/rte_service.c
> +++ b/lib/librte_eal/common/rte_service.c
> @@ -179,9 +179,11 @@ rte_service_set_stats_enable(uint32_t id, int32_t
> enabled)
>  	SERVICE_VALID_GET_OR_ERR_RET(id, s, 0);
> 
>  	if (enabled)
> -		s->internal_flags |= SERVICE_F_STATS_ENABLED;
> +		__atomic_or_fetch(&s->internal_flags, SERVICE_F_STATS_ENABLED,
> +			__ATOMIC_RELEASE);
>  	else
> -		s->internal_flags &= ~(SERVICE_F_STATS_ENABLED);
> +		__atomic_and_fetch(&s->internal_flags,
> +			~(SERVICE_F_STATS_ENABLED), __ATOMIC_RELEASE);

Not sure why these have to become stores with RELEASE memory ordering?
(More occurrences of the same question below, just answer here?)

>  	return 0;
>  }
> @@ -193,9 +195,11 @@ rte_service_set_runstate_mapped_check(uint32_t id,
> int32_t enabled)
>  	SERVICE_VALID_GET_OR_ERR_RET(id, s, 0);
> 
>  	if (enabled)
> -		s->internal_flags |= SERVICE_F_START_CHECK;
> +		__atomic_or_fetch(&s->internal_flags, SERVICE_F_START_CHECK,
> +			__ATOMIC_RELEASE);
>  	else
> -		s->internal_flags &= ~(SERVICE_F_START_CHECK);
> +		__atomic_and_fetch(&s->internal_flags, ~(SERVICE_F_START_CHECK),
> +			__ATOMIC_RELEASE);

Same as above, why do these require RELEASE?


The remainder of the patch below seems to make sense - there's a wmb() involved, hence the RELEASE memory order.

>  	return 0;
>  }
> @@ -264,8 +268,8 @@ rte_service_component_register(const struct
> rte_service_spec *spec,
>  	s->spec = *spec;
>  	s->internal_flags |= SERVICE_F_REGISTERED | SERVICE_F_START_CHECK;
> 
> -	rte_smp_wmb();
> -	rte_service_count++;
> +	/* make sure the counter updates after the state change. */
> +	__atomic_add_fetch(&rte_service_count, 1, __ATOMIC_RELEASE);

This makes sense to me - the RELEASE ensures that the previous stores to
s->internal_flags are visible to other cores before rte_service_count
increments atomically.
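
(For completeness, a reader relying on this would pair it with an ACQUIRE
load; a hypothetical sketch, not code from the patch:)

    /* hypothetical reader: if this ACQUIRE load observes the new count,
     * the earlier stores to s->spec and s->internal_flags are visible */
    uint32_t count = __atomic_load_n(&rte_service_count, __ATOMIC_ACQUIRE);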


>  	if (id_ptr)
>  		*id_ptr = free_slot;
> @@ -281,9 +285,10 @@ rte_service_component_unregister(uint32_t id)
>  	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
> 
>  	rte_service_count--;
> -	rte_smp_wmb();
> 
> -	s->internal_flags &= ~(SERVICE_F_REGISTERED);
> +	/* make sure the counter updates before the state change. */
> +	__atomic_and_fetch(&s->internal_flags, ~(SERVICE_F_REGISTERED),
> +			   __ATOMIC_RELEASE);
> 
>  	/* clear the run-bit in all cores */
>  	for (i = 0; i < RTE_MAX_LCORE; i++)
> @@ -301,11 +306,12 @@ rte_service_component_runstate_set(uint32_t id, uint32_t
> runstate)
>  	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
> 
>  	if (runstate)
> -		s->comp_runstate = RUNSTATE_RUNNING;
> +		__atomic_store_n(&s->comp_runstate, RUNSTATE_RUNNING,
> +				__ATOMIC_RELEASE);
>  	else
> -		s->comp_runstate = RUNSTATE_STOPPED;
> +		__atomic_store_n(&s->comp_runstate, RUNSTATE_STOPPED,
> +				__ATOMIC_RELEASE);
> 
> -	rte_smp_wmb();
>  	return 0;
>  }
>
> 
> @@ -316,11 +322,12 @@ rte_service_runstate_set(uint32_t id, uint32_t runstate)
>  	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
> 
>  	if (runstate)
> -		s->app_runstate = RUNSTATE_RUNNING;
> +		__atomic_store_n(&s->app_runstate, RUNSTATE_RUNNING,
> +				__ATOMIC_RELEASE);
>  	else
> -		s->app_runstate = RUNSTATE_STOPPED;
> +		__atomic_store_n(&s->app_runstate, RUNSTATE_STOPPED,
> +				__ATOMIC_RELEASE);
> 
> -	rte_smp_wmb();
>  	return 0;
>  }
> 
> @@ -442,7 +449,8 @@ service_runner_func(void *arg)
>  	const int lcore = rte_lcore_id();
>  	struct core_state *cs = &lcore_states[lcore];
> 
> -	while (lcore_states[lcore].runstate == RUNSTATE_RUNNING) {
> +	while (__atomic_load_n(&cs->runstate,
> +		    __ATOMIC_ACQUIRE) == RUNSTATE_RUNNING) {
>  		const uint64_t service_mask = cs->service_mask;
> 
>  		for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
> @@ -453,8 +461,6 @@ service_runner_func(void *arg)
>  		}
> 
>  		cs->loops++;
> -
> -		rte_smp_rmb();
>  	}
> 
>  	lcore_config[lcore].state = WAIT;
> @@ -663,9 +669,8 @@ rte_service_lcore_add(uint32_t lcore)
> 
>  	/* ensure that after adding a core the mask and state are defaults */
>  	lcore_states[lcore].service_mask = 0;
> -	lcore_states[lcore].runstate = RUNSTATE_STOPPED;
> -
> -	rte_smp_wmb();
> +	__atomic_store_n(&lcore_states[lcore].runstate, RUNSTATE_STOPPED,
> +			__ATOMIC_RELEASE);
> 
>  	return rte_eal_wait_lcore(lcore);
>  }
> --
> 2.7.4
  
Honnappa Nagarahalli April 6, 2020, 5:06 p.m. UTC | #2
<snip>
Just to get us on the same page on 'techniques to communicate data from writer to reader' (apologies if it is too trivial)

Let us say that the writer has 512B of data (the key point being that it cannot be written atomically) that needs to be communicated to the reader.

Since the data cannot be written atomically, we need a guard variable (which can be written atomically; it can be a flag or a pointer to the data). So, the writer will store the 512B in a non-atomic way and then write the guard variable with release memory order. So, if the guard variable is valid (set in the case of a flag, non-NULL in the case of a pointer), it guarantees that the 512B has been written.

The reader will read the guard variable with acquire memory order and read the 512B data only if the guard variable is valid. So, the acquire memory order on the guard variable guarantees that the load of 512B does not happen before the guard variable is read. The validity check on the guard variable guarantees that 512B was written before it was read.

The store(guard_variable, RELEASE) on the writer side and the load(guard_variable, ACQUIRE) on the reader side can be said to synchronize with each other.

(the guard variable technique applies even if we are not using C11 atomics)

Let us say that the writer has 4B of data (the key point being that it can be written atomically) that needs to be communicated to the reader. The writer is free to write this atomically with no constraints on memory ordering, as long as this data is not acting as a guard variable for any other data.
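
A minimal sketch of the above in C, using the same __atomic builtins as the
patch (all names here are made up for illustration):

    #include <stdint.h>

    struct payload { char data[512]; };
    static struct payload shared;  /* 512B: cannot be written atomically */
    static uint32_t guard;         /* can be written atomically */

    /* writer */
    static void publish(const struct payload *p)
    {
        shared = *p; /* plain, non-atomic stores */
        /* RELEASE: the 512B above is written before guard reads as set */
        __atomic_store_n(&guard, 1, __ATOMIC_RELEASE);
    }

    /* reader */
    static int consume(struct payload *out)
    {
        /* ACQUIRE: the 512B load below cannot be hoisted above this load */
        if (__atomic_load_n(&guard, __ATOMIC_ACQUIRE) == 0)
            return 0; /* not published yet */
        *out = shared;
        return 1;
    }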

In my understanding, the sequence of APIs to call to start a service (writer) is as follows:
1) rte_service_init
2) rte_service_component_register
3) <possible configuration of the service>
4) rte_service_component_runstate_set (the reader is allowed at this point to read the information about the service - written by rte_service_component_register API. This API should not be called before rte_service_component_register)
5) <possible configuration of the service>
6) rte_service_runstate_set (the reader is allowed at this point to read the information about the service - written by rte_service_component_register API and run the service. This API can be called anytime. But, the reader should not attempt to run the service before this API is called)
7) rte_service_lcore_add (possibly multiple of these; can be called before this point, but not later)
8) rte_service_map_lcore_set (this can be called anytime. Can be called even if the service is not registered)
9) rte_service_lcore_start (again, this can be called anytime, even before the service is registered)

So, there are two guard variables: 'comp_runstate' and 'app_runstate'. Only these two need RELEASE ordering in the writer and ACQUIRE ordering in the reader.
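
As a concrete illustration, the writer-side sequence above would look roughly
like the sketch below (spec setup, error handling and the choice of lcore_id
are omitted or assumed):

    #include <rte_service.h>
    #include <rte_service_component.h>

    static int
    start_my_service(struct rte_service_spec *spec, uint32_t lcore_id)
    {
        uint32_t id;

        rte_service_init();
        rte_service_component_register(spec, &id);
        rte_service_component_runstate_set(id, 1); /* guard #1: comp_runstate */
        rte_service_runstate_set(id, 1);           /* guard #2: app_runstate */
        rte_service_lcore_add(lcore_id);
        rte_service_map_lcore_set(id, lcore_id, 1);
        return rte_service_lcore_start(lcore_id);
    }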

We can write test cases with different orders of these API calls to prove that the memory orders we use are sufficient.

Few comments are inline based on this assessment.

> Subject: RE: [PATCH v3 12/12] service: relax barriers with C11 atomic
> operations
> 
> > From: Phil Yang <phil.yang@arm.com>
> > Sent: Tuesday, March 17, 2020 1:18 AM
> > To: thomas@monjalon.net; Van Haaren, Harry
> > <harry.van.haaren@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; stephen@networkplumber.org;
> > maxime.coquelin@redhat.com; dev@dpdk.org
> > Cc: david.marchand@redhat.com; jerinj@marvell.com;
> > hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> > gavin.hu@arm.com; ruifeng.wang@arm.com; joyce.kong@arm.com;
> nd@arm.com
> > Subject: [PATCH v3 12/12] service: relax barriers with C11 atomic
> > operations
> >
> > The service library uses a lot of rte_smp_r/wmb barriers to guarantee
> > inter-thread visibility of shared state. This patch relaxes these barriers
> > by using C11 atomic one-way barrier operations.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > ---
> >  lib/librte_eal/common/rte_service.c | 45 ++++++++++++++++++++-----------------
> >  1 file changed, 25 insertions(+), 20 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/rte_service.c
> > b/lib/librte_eal/common/rte_service.c
> > index c033224..d31663e 100644
> > --- a/lib/librte_eal/common/rte_service.c
> > +++ b/lib/librte_eal/common/rte_service.c
> > @@ -179,9 +179,11 @@ rte_service_set_stats_enable(uint32_t id, int32_t
> > enabled)
> >  	SERVICE_VALID_GET_OR_ERR_RET(id, s, 0);
> >
> >  	if (enabled)
> > -		s->internal_flags |= SERVICE_F_STATS_ENABLED;
> > +		__atomic_or_fetch(&s->internal_flags,
> SERVICE_F_STATS_ENABLED,
> > +			__ATOMIC_RELEASE);
> >  	else
> > -		s->internal_flags &= ~(SERVICE_F_STATS_ENABLED);
> > +		__atomic_and_fetch(&s->internal_flags,
> > +			~(SERVICE_F_STATS_ENABLED), __ATOMIC_RELEASE);
> 
> Not sure why these have to become stores with RELEASE memory ordering?
> (More occurrences of the same question below, just answer here?)
Agree, 'internal_flags' is not acting as a guard variable, so this should be RELAXED (similarly for the others below). Though I suggest keeping it atomic.
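
For reference, the relaxed form being suggested would be (a sketch of the
suggestion, not the final patch):

    if (enabled)
        __atomic_or_fetch(&s->internal_flags, SERVICE_F_STATS_ENABLED,
            __ATOMIC_RELAXED);
    else
        __atomic_and_fetch(&s->internal_flags,
            ~(SERVICE_F_STATS_ENABLED), __ATOMIC_RELAXED);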

> 
> >  	return 0;
> >  }
> > @@ -193,9 +195,11 @@ rte_service_set_runstate_mapped_check(uint32_t
> > id, int32_t enabled)
> >  	SERVICE_VALID_GET_OR_ERR_RET(id, s, 0);
> >
> >  	if (enabled)
> > -		s->internal_flags |= SERVICE_F_START_CHECK;
> > +		__atomic_or_fetch(&s->internal_flags,
> SERVICE_F_START_CHECK,
> > +			__ATOMIC_RELEASE);
> >  	else
> > -		s->internal_flags &= ~(SERVICE_F_START_CHECK);
> > +		__atomic_and_fetch(&s->internal_flags,
> ~(SERVICE_F_START_CHECK),
> > +			__ATOMIC_RELEASE);
> 
> Same as above, why do these require RELEASE?
Agree

> 
> 
> The remainder of the patch below seems to make sense - there's a wmb()
> involved, hence the RELEASE memory order.
> 
> >  	return 0;
> >  }
> > @@ -264,8 +268,8 @@ rte_service_component_register(const struct
> > rte_service_spec *spec,
> >  	s->spec = *spec;
> >  	s->internal_flags |= SERVICE_F_REGISTERED |
> SERVICE_F_START_CHECK;
> >
> > -	rte_smp_wmb();
> > -	rte_service_count++;
> > +	/* make sure the counter updates after the state change. */
> > +	__atomic_add_fetch(&rte_service_count, 1, __ATOMIC_RELEASE);
> 
> This makes sense to me - the RELEASE ensures that previous stores to the
> s->internal_flags are visible to other cores before rte_service_count
> increments atomically.
rte_service_count is not a guard variable and does not need RELEASE order. It is also not used by the reader; it looks like it is just a statistic being maintained.
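
i.e. a relaxed atomic increment would suffice (a sketch of the suggestion):

    /* statistics only: atomicity is enough, no ordering required */
    __atomic_add_fetch(&rte_service_count, 1, __ATOMIC_RELAXED);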

> 
> 
> >  	if (id_ptr)
> >  		*id_ptr = free_slot;
> > @@ -281,9 +285,10 @@ rte_service_component_unregister(uint32_t id)
> >  	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
> >
> >  	rte_service_count--;
> > -	rte_smp_wmb();
> >
> > -	s->internal_flags &= ~(SERVICE_F_REGISTERED);
> > +	/* make sure the counter updates before the state change. */
> > +	__atomic_and_fetch(&s->internal_flags, ~(SERVICE_F_REGISTERED),
> > +			   __ATOMIC_RELEASE);
RELAXED is enough.

> >
> >  	/* clear the run-bit in all cores */
> >  	for (i = 0; i < RTE_MAX_LCORE; i++)
> > @@ -301,11 +306,12 @@ rte_service_component_runstate_set(uint32_t id,
> > uint32_t
> > runstate)
> >  	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
> >
> >  	if (runstate)
> > -		s->comp_runstate = RUNSTATE_RUNNING;
> > +		__atomic_store_n(&s->comp_runstate, RUNSTATE_RUNNING,
> > +				__ATOMIC_RELEASE);
> >  	else
> > -		s->comp_runstate = RUNSTATE_STOPPED;
> > +		__atomic_store_n(&s->comp_runstate, RUNSTATE_STOPPED,
> > +				__ATOMIC_RELEASE);
Here we need a thread fence to prevent the memory operations of a subsequent call to 'rte_service_component_unregister' from being hoisted above this store. The user should be forced to call rte_service_component_runstate_set (with RUNSTATE_STOPPED) before calling rte_service_component_unregister. I suggest adding a check in rte_service_component_unregister to ensure that the state is set to RUNSTATE_STOPPED; in fact, the user needs to make sure that the service is stopped for sure before calling rte_service_component_unregister.
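
For illustration, the fence being suggested might look like this (the
placement and the SEQ_CST strength here are assumptions on my part, not part
of the patch):

    if (runstate)
        __atomic_store_n(&s->comp_runstate, RUNSTATE_RUNNING,
                __ATOMIC_RELEASE);
    else
        __atomic_store_n(&s->comp_runstate, RUNSTATE_STOPPED,
                __ATOMIC_RELEASE);
    /* keep the stores of a later rte_service_component_unregister()
     * call from being reordered ahead of the runstate store above */
    __atomic_thread_fence(__ATOMIC_SEQ_CST);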

> >
> > -	rte_smp_wmb();
> >  	return 0;
> >  }
> >
> >
> > @@ -316,11 +322,12 @@ rte_service_runstate_set(uint32_t id, uint32_t
> runstate)
> >  	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
> >
> >  	if (runstate)
> > -		s->app_runstate = RUNSTATE_RUNNING;
> > +		__atomic_store_n(&s->app_runstate, RUNSTATE_RUNNING,
> > +				__ATOMIC_RELEASE);
> >  	else
> > -		s->app_runstate = RUNSTATE_STOPPED;
> > +		__atomic_store_n(&s->app_runstate, RUNSTATE_STOPPED,
> > +				__ATOMIC_RELEASE);
> >
> > -	rte_smp_wmb();
> >  	return 0;
> >  }
> >
> > @@ -442,7 +449,8 @@ service_runner_func(void *arg)
> >  	const int lcore = rte_lcore_id();
> >  	struct core_state *cs = &lcore_states[lcore];
> >
> > -	while (lcore_states[lcore].runstate == RUNSTATE_RUNNING) {
> > +	while (__atomic_load_n(&cs->runstate,
> > +		    __ATOMIC_ACQUIRE) == RUNSTATE_RUNNING) {
This can be RELAXED, as the lcore's runstate is not acting as a guard variable.
However, note that the writer thread wants to communicate the 'runstate' (4B) to the reader thread. This ordering needs to be handled in the 'rte_eal_remote_launch' and 'eal_thread_loop' functions. We also have to note that in some other use cases the writer wants to communicate more than 4B to the reader. Currently, the 'write' and 'read' system calls may have enough barriers to make things work fine, but I suggest using 'lcore_config[lcore].f' as the guard variable to make this explicit and not depend on 'write' and 'read'. We can take up the EAL changes in a later patch as they do not cause any issues right now.
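
For illustration, a sketch of the relaxed load suggested for the polling loop
(an assumption of the resulting code, not the patch itself):

    while (__atomic_load_n(&cs->runstate,
                __ATOMIC_RELAXED) == RUNSTATE_RUNNING) {
        const uint64_t service_mask = cs->service_mask;
        /* ... run the mapped services ... */
        cs->loops++;
    }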

> >  		const uint64_t service_mask = cs->service_mask;
> >
> >  		for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { @@ -453,8
> +461,6 @@
> > service_runner_func(void *arg)
> >  		}
> >
> >  		cs->loops++;
> > -
> > -		rte_smp_rmb();
> >  	}
> >
> >  	lcore_config[lcore].state = WAIT;
> > @@ -663,9 +669,8 @@ rte_service_lcore_add(uint32_t lcore)
> >
> >  	/* ensure that after adding a core the mask and state are defaults */
> >  	lcore_states[lcore].service_mask = 0;
> > -	lcore_states[lcore].runstate = RUNSTATE_STOPPED;
> > -
> > -	rte_smp_wmb();
Agree.

> > +	__atomic_store_n(&lcore_states[lcore].runstate, RUNSTATE_STOPPED,
> > +			__ATOMIC_RELEASE);
This can be relaxed.

> >
> >  	return rte_eal_wait_lcore(lcore);
> >  }
> > --
> > 2.7.4
  
Van Haaren, Harry April 8, 2020, 7:42 p.m. UTC | #3
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Monday, April 6, 2020 6:06 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>; Phil Yang
> <Phil.Yang@arm.com>; thomas@monjalon.net; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; stephen@networkplumber.org;
> maxime.coquelin@redhat.com; dev@dpdk.org
> Cc: david.marchand@redhat.com; jerinj@marvell.com; hemant.agrawal@nxp.com;
> Gavin Hu <Gavin.Hu@arm.com>; Ruifeng Wang <Ruifeng.Wang@arm.com>; Joyce Kong
> <Joyce.Kong@arm.com>; nd <nd@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v3 12/12] service: relax barriers with C11 atomic
> operations
> 
> <snip>
> Just to get us on the same page on 'techniques to communicate data from writer
> to reader' (apologies if it is too trivial)
> 
> Let us say that the writer has 512B of data (the key point being that it
> cannot be written atomically) that needs to be communicated to the reader.
> 
> Since the data cannot be written atomically, we need a guard variable (which
> can be written atomically; it can be a flag or a pointer to the data). So,
> the writer will store the 512B in a non-atomic way and then write the guard
> variable with release memory order. So, if the guard variable is valid (set
> in the case of a flag, non-NULL in the case of a pointer), it guarantees that
> the 512B has been written.
> 
> The reader will read the guard variable with acquire memory order and read the
> 512B data only if the guard variable is valid. So, the acquire memory order on
> the guard variable guarantees that the load of 512B does not happen before the
> guard variable is read. The validity check on the guard variable guarantees
> that 512B was written before it was read.
> 
> The store(guard_variable, RELEASE) on the writer side and the
> load(guard_variable, ACQUIRE) on the reader side can be said to synchronize
> with each other.
> 
> (the guard variable technique applies even if we are not using C11 atomics)

Yep agreed on the above.


> Let us say that the writer has 4B of data (the key point being that it can be
> written atomically) that needs to be communicated to the reader. The writer
> is free to write this atomically with no constraints on memory ordering, as
> long as this data is not acting as a guard variable for any other data.
> 
> In my understanding, the sequence of APIs to call to start a service (writer)
> is as follows:
> 1) rte_service_init
> 2) rte_service_component_register
> 3) <possible configuration of the service>
> 4) rte_service_component_runstate_set (the reader is allowed at this point to
> read the information about the service - written by
> rte_service_component_register API. This API should not be called before
> rte_service_component_register)
> 5) <possible configuration of the service>
> 6) rte_service_runstate_set (the reader is allowed at this point to read the
> information about the service - written by rte_service_component_register API
> and run the service. This API can be called anytime. But, the reader should
> not attempt to run the service before this API is called)
> 7) rte_service_lcore_add (possibly multiple of these; can be called before
> this point, but not later)
> 8) rte_service_map_lcore_set (this can be called anytime. Can be called even
> if the service is not registered)
> 9) rte_service_lcore_start (again, this can be called anytime, even before the
> service is registered)

I think this can be simplified if we look at the calling threads:
 - one thread is the writer/config thread, and is allowed to call anything
 --- Any updates/changes must be atomically correct in how the other threads can read state.

 - service lcores, which fundamentally spin in run(), and call services mapped to it
 --- here we need to ensure any service mapped to it is atomic, and the service is valid to run.

 - other application threads using "run on app lcore" function
 --- similar to service lcore, check for service in valid state, and allow to run.


Services are not allowed to be unregistered e.g. while running.
I'd like to avoid the "explosion of combinations" by enforcing
simple limitations (and documenting them more clearly if/where required).


> So, there are two guard variables: 'comp_runstate' and 'app_runstate'. Only
> these two need RELEASE ordering in the writer and ACQUIRE ordering in the
> reader.
> 
> We can write test cases with different orders of these API calls to prove that
> the memory orders we use are sufficient.
> 
> Few comments are inline based on this assessment.

Sure. As per the other email thread, splitting the changes into
bugfix/cleanup/C11 would likely help to keep track of the changes required
per patch; it's getting hard to follow the various topics being discussed in
parallel.


> > Subject: RE: [PATCH v3 12/12] service: relax barriers with C11 atomic
> > operations
> >
> > > From: Phil Yang <phil.yang@arm.com>
> > > Sent: Tuesday, March 17, 2020 1:18 AM
> > > To: thomas@monjalon.net; Van Haaren, Harry
> > > <harry.van.haaren@intel.com>; Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com>; stephen@networkplumber.org;
> > > maxime.coquelin@redhat.com; dev@dpdk.org
> > > Cc: david.marchand@redhat.com; jerinj@marvell.com;
> > > hemant.agrawal@nxp.com; Honnappa.Nagarahalli@arm.com;
> > > gavin.hu@arm.com; ruifeng.wang@arm.com; joyce.kong@arm.com;
> > nd@arm.com
> > > Subject: [PATCH v3 12/12] service: relax barriers with C11 atomic
> > > operations
> > >
> > > The service library uses a lot of rte_smp_r/wmb barriers to guarantee
> > > inter-thread visibility of shared state. This patch relaxes these
> > > barriers by using C11 atomic one-way barrier operations.
> > >
> > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > > ---
> > >  lib/librte_eal/common/rte_service.c | 45

<snip patch review comments>
  

Patch

diff --git a/lib/librte_eal/common/rte_service.c b/lib/librte_eal/common/rte_service.c
index c033224..d31663e 100644
--- a/lib/librte_eal/common/rte_service.c
+++ b/lib/librte_eal/common/rte_service.c
@@ -179,9 +179,11 @@  rte_service_set_stats_enable(uint32_t id, int32_t enabled)
 	SERVICE_VALID_GET_OR_ERR_RET(id, s, 0);
 
 	if (enabled)
-		s->internal_flags |= SERVICE_F_STATS_ENABLED;
+		__atomic_or_fetch(&s->internal_flags, SERVICE_F_STATS_ENABLED,
+			__ATOMIC_RELEASE);
 	else
-		s->internal_flags &= ~(SERVICE_F_STATS_ENABLED);
+		__atomic_and_fetch(&s->internal_flags,
+			~(SERVICE_F_STATS_ENABLED), __ATOMIC_RELEASE);
 
 	return 0;
 }
@@ -193,9 +195,11 @@  rte_service_set_runstate_mapped_check(uint32_t id, int32_t enabled)
 	SERVICE_VALID_GET_OR_ERR_RET(id, s, 0);
 
 	if (enabled)
-		s->internal_flags |= SERVICE_F_START_CHECK;
+		__atomic_or_fetch(&s->internal_flags, SERVICE_F_START_CHECK,
+			__ATOMIC_RELEASE);
 	else
-		s->internal_flags &= ~(SERVICE_F_START_CHECK);
+		__atomic_and_fetch(&s->internal_flags, ~(SERVICE_F_START_CHECK),
+			__ATOMIC_RELEASE);
 
 	return 0;
 }
@@ -264,8 +268,8 @@  rte_service_component_register(const struct rte_service_spec *spec,
 	s->spec = *spec;
 	s->internal_flags |= SERVICE_F_REGISTERED | SERVICE_F_START_CHECK;
 
-	rte_smp_wmb();
-	rte_service_count++;
+	/* make sure the counter updates after the state change. */
+	__atomic_add_fetch(&rte_service_count, 1, __ATOMIC_RELEASE);
 
 	if (id_ptr)
 		*id_ptr = free_slot;
@@ -281,9 +285,10 @@  rte_service_component_unregister(uint32_t id)
 	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
 
 	rte_service_count--;
-	rte_smp_wmb();
 
-	s->internal_flags &= ~(SERVICE_F_REGISTERED);
+	/* make sure the counter updates before the state change. */
+	__atomic_and_fetch(&s->internal_flags, ~(SERVICE_F_REGISTERED),
+			   __ATOMIC_RELEASE);
 
 	/* clear the run-bit in all cores */
 	for (i = 0; i < RTE_MAX_LCORE; i++)
@@ -301,11 +306,12 @@  rte_service_component_runstate_set(uint32_t id, uint32_t runstate)
 	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
 
 	if (runstate)
-		s->comp_runstate = RUNSTATE_RUNNING;
+		__atomic_store_n(&s->comp_runstate, RUNSTATE_RUNNING,
+				__ATOMIC_RELEASE);
 	else
-		s->comp_runstate = RUNSTATE_STOPPED;
+		__atomic_store_n(&s->comp_runstate, RUNSTATE_STOPPED,
+				__ATOMIC_RELEASE);
 
-	rte_smp_wmb();
 	return 0;
 }
 
@@ -316,11 +322,12 @@  rte_service_runstate_set(uint32_t id, uint32_t runstate)
 	SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL);
 
 	if (runstate)
-		s->app_runstate = RUNSTATE_RUNNING;
+		__atomic_store_n(&s->app_runstate, RUNSTATE_RUNNING,
+				__ATOMIC_RELEASE);
 	else
-		s->app_runstate = RUNSTATE_STOPPED;
+		__atomic_store_n(&s->app_runstate, RUNSTATE_STOPPED,
+				__ATOMIC_RELEASE);
 
-	rte_smp_wmb();
 	return 0;
 }
 
@@ -442,7 +449,8 @@  service_runner_func(void *arg)
 	const int lcore = rte_lcore_id();
 	struct core_state *cs = &lcore_states[lcore];
 
-	while (lcore_states[lcore].runstate == RUNSTATE_RUNNING) {
+	while (__atomic_load_n(&cs->runstate,
+		    __ATOMIC_ACQUIRE) == RUNSTATE_RUNNING) {
 		const uint64_t service_mask = cs->service_mask;
 
 		for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) {
@@ -453,8 +461,6 @@  service_runner_func(void *arg)
 		}
 
 		cs->loops++;
-
-		rte_smp_rmb();
 	}
 
 	lcore_config[lcore].state = WAIT;
@@ -663,9 +669,8 @@  rte_service_lcore_add(uint32_t lcore)
 
 	/* ensure that after adding a core the mask and state are defaults */
 	lcore_states[lcore].service_mask = 0;
-	lcore_states[lcore].runstate = RUNSTATE_STOPPED;
-
-	rte_smp_wmb();
+	__atomic_store_n(&lcore_states[lcore].runstate, RUNSTATE_STOPPED,
+			__ATOMIC_RELEASE);
 
 	return rte_eal_wait_lcore(lcore);
 }