[v3,1/7] eal: generic 64 bit counter

Message ID 20240514153845.42489-2-stephen@networkplumber.org (mailing list archive)
State Superseded
Delegated to: Thomas Monjalon
Series Generic SW counters

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Stephen Hemminger May 14, 2024, 3:35 p.m. UTC
  This header implements 64 bit counters that are NOT atomic
but are safe against load/store splits on 32 bit platforms.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
---
 lib/eal/include/meson.build   |  1 +
 lib/eal/include/rte_counter.h | 91 +++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 lib/eal/include/rte_counter.h
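
A minimal usage sketch of the proposed API (struct and variable names here are hypothetical; reading from a second thread relies on the single-writer assumption documented in the header):

struct rxq_stats {
	rte_counter64_t packets;
	rte_counter64_t bytes;
};

/* fast path thread, the only writer of these counters */
rte_counter64_add(&stats->packets, nb_rx);
rte_counter64_add(&stats->bytes, rx_bytes);

/* stats thread, read only */
uint64_t packets = rte_counter64_fetch(&stats->packets);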
  

Comments

Morten Brørup May 15, 2024, 9:30 a.m. UTC | #1
+To: @Mattias, @Ferruh, @Bruce, participants in a related discussion

> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Tuesday, 14 May 2024 17.35
> 
> This header implements 64 bit counters that are NOT atomic
> but are safe against load/store splits on 32 bit platforms.
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> ---

With a long term perspective, I consider this patch very useful.
And its 32 bit implementation can be optimized for various architectures/compilers later.


In addition, it would be "nice to have" if reset() and fetch() could be called from a thread other than the thread adding to the counter.

As previously discussed [1], I think it can be done without significantly affecting fast path add() performance, by using an "offset" with Release-Consume ordering.

[1]: https://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35E9F427@smartserver.smartshare.dk/


rte_counter64_add(rte_counter64_t *counter, uint32_t val)
{
	// Write "counter" with memory_order_relaxed, so
	// it eventually becomes visible in other threads.

	rte_counter64_t ctr = *counter + val;
	rte_atomic_store_explicit(counter, ctr, rte_memory_order_relaxed);
}

rte_counter64_get(rte_counter64_t *counter, rte_counter64_t *offset)
{
	// Read "offset" with memory_order_consume, so:
	// - no reads or writes in the current thread dependent on "offset"
	//   can be reordered before this load, and
	// - writes to "counter" (a data-dependent variable)
	//   in other threads that release "offset" are visible in the current thread.

	rte_counter64_t off = rte_atomic_load_explicit(offset, rte_memory_order_consume);
	rte_counter64_t ctr = rte_atomic_load_explicit(counter, rte_memory_order_relaxed);

	return ctr - off;
}

rte_counter64_reset(rte_counter64_t *counter, rte_counter64_t *offset)
{
	// Write "offset" with memory_order_release, so
	// "counter" cannot be visible after it.

	rte_counter64_t ctr = rte_atomic_load_explicit(counter, rte_memory_order_relaxed);
	rte_atomic_store_explicit(offset, ctr, rte_memory_order_release);
}


Support for counters shared by multiple threads, e.g. rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed, should be provided too:

rte_counter64_mt_add(rte_counter64_t *counter, uint32_t val)
{
	rte_atomic_fetch_add_explicit(counter, val, rte_memory_order_relaxed);
}
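
For illustration, a usage sketch of the offset scheme above (the stats struct is hypothetical; the fast path thread only ever calls add(), and only the stats thread writes "offset" through reset()):

struct pkt_stats {
	rte_counter64_t packets;	/* written by the fast path thread */
	rte_counter64_t packets_offset;	/* written by the stats thread on reset */
};

/* fast path thread */
rte_counter64_add(&stats->packets, nb_rx);

/* stats thread */
uint64_t packets = rte_counter64_get(&stats->packets, &stats->packets_offset);

/* stats thread, on a reset request */
rte_counter64_reset(&stats->packets, &stats->packets_offset);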


>  lib/eal/include/meson.build   |  1 +
>  lib/eal/include/rte_counter.h | 91 +++++++++++++++++++++++++++++++++++
>  2 files changed, 92 insertions(+)
>  create mode 100644 lib/eal/include/rte_counter.h
> 
> diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
> index e94b056d46..c070dd0079 100644
> --- a/lib/eal/include/meson.build
> +++ b/lib/eal/include/meson.build
> @@ -12,6 +12,7 @@ headers += files(
>          'rte_class.h',
>          'rte_common.h',
>          'rte_compat.h',
> +        'rte_counter.h',
>          'rte_debug.h',
>          'rte_dev.h',
>          'rte_devargs.h',
> diff --git a/lib/eal/include/rte_counter.h b/lib/eal/include/rte_counter.h
> new file mode 100644
> index 0000000000..8068d6d26e
> --- /dev/null
> +++ b/lib/eal/include/rte_counter.h
> @@ -0,0 +1,91 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright (c) Stephen Hemminger <stephen@networkplumber.org>
> + */
> +
> +#ifndef _RTE_COUNTER_H_
> +#define _RTE_COUNTER_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * @file
> + * RTE Counter
> + *
> + * A counter is 64 bit value that is safe from split read/write
> + * on 32 bit platforms. It assumes that only one cpu at a time
> + * will update the counter, and another CPU may want to read it.
> + *
> + * This is a much weaker guarantee than full atomic variables
> + * but is faster since no locked operations are required for update.
> + */
> +
> +#include <stdatomic.h>
> +
> +#ifdef RTE_ARCH_64
> +/*
> + * On a platform that can support native 64 bit type, no special handling.
> + * These are just wrapper around 64 bit value.
> + */
> +typedef uint64_t rte_counter64_t;
> +
> +/**
> + * Add value to counter.
> + */
> +__rte_experimental
> +static inline void
> +rte_counter64_add(rte_counter64_t *counter, uint32_t val)
> +{
> +	*counter += val;
> +}
> +
> +__rte_experimental
> +static inline uint64_t
> +rte_counter64_fetch(const rte_counter64_t *counter)
> +{
> +	return *counter;
> +}
> +
> +__rte_experimental
> +static inline void
> +rte_counter64_reset(rte_counter64_t *counter)
> +{
> +	*counter = 0;
> +}
> +
> +#else
> +/*
> + * On a 32 bit platform need to use atomic to force the compler to not
> + * split 64 bit read/write.
> + */
> +typedef RTE_ATOMIC(uint64_t) rte_counter64_t;
> +
> +__rte_experimental
> +static inline void
> +rte_counter64_add(rte_counter64_t *counter, uint32_t val)
> +{
> +	rte_atomic_fetch_add_explicit(counter, val, rte_memory_order_relaxed);
> +}
> +
> +__rte_experimental
> +static inline uint64_t
> +rte_counter64_fetch(const rte_counter64_t *counter)
> +{
> +	return rte_atomic_load_explicit(counter, rte_memory_order_relaxed);
> +}
> +
> +__rte_experimental
> +static inline void
> +rte_counter64_reset(rte_counter64_t *counter)
> +{
> +	rte_atomic_store_explicit(counter, 0, rte_memory_order_relaxed);
> +}
> +#endif
> +
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_COUNTER_H_ */
> --
> 2.43.0
  
Stephen Hemminger May 15, 2024, 3:03 p.m. UTC | #2
On Wed, 15 May 2024 11:30:45 +0200
Morten Brørup <mb@smartsharesystems.com> wrote:

> With a long term perspective, I consider this patch very useful.
> And its 32 bit implementation can be optimized for various architectures/compilers later.
> 
> 
> In addition, it would be "nice to have" if reset() and fetch() could be called from another thread than the thread adding to the counter.
> 
> As previously discussed [1], I think it can be done without significantly affecting fast path add() performance, by using an "offset" with Release-Consume ordering.
> 
> [1]: https://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35E9F427@smartserver.smartshare.dk/
> 


Without a specific driver use case, not sure why this added complexity is needed.
If there is a specific example, can add it later. Any atomic operation ends up
impacting the speculative execution pipeline on modern CPUs. This version
ends up being just a single add instruction on ARM and x86 64 bit.
  
Morten Brørup May 15, 2024, 4:18 p.m. UTC | #3
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, 15 May 2024 17.03
> 
> On Wed, 15 May 2024 11:30:45 +0200
> Morten Brørup <mb@smartsharesystems.com> wrote:
> 
> > With a long term perspective, I consider this patch very useful.
> > And its 32 bit implementation can be optimized for various
> architectures/compilers later.
> >
> >
> > In addition, it would be "nice to have" if reset() and fetch() could
> be called from another thread than the thread adding to the counter.
> >
> > As previously discussed [1], I think it can be done without
> significantly affecting fast path add() performance, by using an
> "offset" with Release-Consume ordering.
> >
> > [1]:
> https://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35E9F427@smarts
> erver.smartshare.dk/
> >
> 
> 
> Without a specific driver use case, not sure why this added complexity
> is needed.

Our application reads the stats counters from a thread other than the fast path threads. We don't pause the fast path forwarding loops to aggregate a bunch of counters.
I would guess that many other applications work that way too, especially latency-sensitive ones.

> If there is a specific example, can add it later. Any atomic operation
> ends up
> impacting the speculative execution pipeline on modern CPU's. This
> version
> ends up being just a single add instruction on ARM and x86 64 bit.

I agree that everything is mostly fine on 64 bit.
I am trying to ensure that we future proof it for multi threaded applications and 32 bit architectures too.
  
Mattias Rönnblom May 26, 2024, 6:45 a.m. UTC | #4
On 2024-05-14 17:35, Stephen Hemminger wrote:
> This header implements 64 bit counters that are NOT atomic
> but are safe against load/store splits on 32 bit platforms.
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> ---
>   lib/eal/include/meson.build   |  1 +
>   lib/eal/include/rte_counter.h | 91 +++++++++++++++++++++++++++++++++++
>   2 files changed, 92 insertions(+)
>   create mode 100644 lib/eal/include/rte_counter.h
> 
> diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
> index e94b056d46..c070dd0079 100644
> --- a/lib/eal/include/meson.build
> +++ b/lib/eal/include/meson.build
> @@ -12,6 +12,7 @@ headers += files(
>           'rte_class.h',
>           'rte_common.h',
>           'rte_compat.h',
> +        'rte_counter.h',
>           'rte_debug.h',
>           'rte_dev.h',
>           'rte_devargs.h',
> diff --git a/lib/eal/include/rte_counter.h b/lib/eal/include/rte_counter.h
> new file mode 100644
> index 0000000000..8068d6d26e
> --- /dev/null
> +++ b/lib/eal/include/rte_counter.h
> @@ -0,0 +1,91 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright (c) Stephen Hemminger <stephen@networkplumber.org>
> + */
> +
> +#ifndef _RTE_COUNTER_H_
> +#define _RTE_COUNTER_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * @file
> + * RTE Counter
> + *
> + * A counter is 64 bit value that is safe from split read/write
> + * on 32 bit platforms. It assumes that only one cpu at a time
> + * will update the counter, and another CPU may want to read it.

It's not totally obvious what "split read/write" means.

I think there is a word for this already; atomic. Atomic read/load and 
atomic write/store.

"A counter is value which can be atomically read, atomically written to, 
but does not allow atomic arithmetic operations (such as add), making 
them mostly useful in single-writer scenarios."

> + *
> + * This is a much weaker guarantee than full atomic variables
> + * but is faster since no locked operations are required for update.
> + */
> +
> +#include <stdatomic.h>

This shouldn't read rte_stdatomic.h?

> +
> +#ifdef RTE_ARCH_64
> +/*
> + * On a platform that can support native 64 bit type, no special handling.
> + * These are just wrapper around 64 bit value.
> + */
> +typedef uint64_t rte_counter64_t;
> +
> +/**
> + * Add value to counter.
> + */
> +__rte_experimental
> +static inline void
> +rte_counter64_add(rte_counter64_t *counter, uint32_t val)

Shouldn't 'val' also be uint64_t? Can't see it would be slower.

> +{
> +	*counter += val;
> +}
> +
> +__rte_experimental
> +static inline uint64_t
> +rte_counter64_fetch(const rte_counter64_t *counter)
> +{
> +	return *counter;
> +}
> +
> +__rte_experimental
> +static inline void
> +rte_counter64_reset(rte_counter64_t *counter)
> +{
> +	*counter = 0;
> +}
> +
> +#else
> +/*
> + * On a 32 bit platform need to use atomic to force the compler to not
> + * split 64 bit read/write.
> + */
> +typedef RTE_ATOMIC(uint64_t) rte_counter64_t;

To have an API that sometimes, for certain build configurations and 
architectures, makes some object _Atomic, makes me somewhat uneasy. All 
direct accesses to the object in question (e.g., my_counter++) will be 
atomic with SEQ CST memory model.

The alternative, to always use the regular type (uint64_t in this case), 
and cast to _Atomic (RTE_ATOMIC()) also seems less than ideal.

The atomic bit operations in the bitops patch set take the latter approach.
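
A sketch of that latter approach, assuming the __rte_atomic qualifier from 
rte_stdatomic.h (the counter stays a plain uint64_t and is cast to the 
atomic-qualified type only at the access site):

typedef uint64_t rte_counter64_t;

static inline uint64_t
rte_counter64_fetch(const rte_counter64_t *counter)
{
	/* qualify the object as atomic only for this access */
	return rte_atomic_load_explicit((const uint64_t __rte_atomic *)counter,
			rte_memory_order_relaxed);
}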

> +
> +__rte_experimental
> +static inline void
> +rte_counter64_add(rte_counter64_t *counter, uint32_t val)
> +{
> +	rte_atomic_fetch_add_explicit(counter, val, rte_memory_order_relaxed);

This is overkill, and will generate a locked instruction on x86.

Use an atomic load, a non-atomic add, and an atomic store. A non-atomic 
load would do, but with RTE_ATOMIC() I don't think there's a safe way to 
achieve that.

uint64_t value = *counter;

would be a non-atomic load on non-C11-atomics-builds, but an atomic load 
with SEQ CST memory ordering on C11-atomics-enabled builds.
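
A sketch of the suggested add() for the 32 bit case, keeping the 
RTE_ATOMIC(uint64_t) typedef from the patch (the relaxed atomic load and 
store keep each 64 bit access whole, while the addition itself is a plain 
unlocked operation, so this is only valid with a single writer):

__rte_experimental
static inline void
rte_counter64_add(rte_counter64_t *counter, uint32_t val)
{
	/* relaxed atomic load/store avoid split 64 bit accesses,
	 * without generating a locked read-modify-write instruction */
	uint64_t value = rte_atomic_load_explicit(counter, rte_memory_order_relaxed);

	rte_atomic_store_explicit(counter, value + val, rte_memory_order_relaxed);
}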

> +}
> +
> +__rte_experimental
> +static inline uint64_t
> +rte_counter64_fetch(const rte_counter64_t *counter)
> +{
> +	return rte_atomic_load_explicit(counter, rte_memory_order_relaxed);
> +}
> +
> +__rte_experimental
> +static inline void
> +rte_counter64_reset(rte_counter64_t *counter)
> +{
> +	rte_atomic_store_explicit(counter, 0, rte_memory_order_relaxed);
> +}
> +#endif
> +
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_COUNTER_H_ */
  
Mattias Rönnblom May 26, 2024, 7:34 a.m. UTC | #5
On 2024-05-15 11:30, Morten Brørup wrote:
> +To: @Mattias, @Ferruh, @Bruce, participants in a related discussion
> 
>> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
>> Sent: Tuesday, 14 May 2024 17.35
>>
>> This header implements 64 bit counters that are NOT atomic
>> but are safe against load/store splits on 32 bit platforms.
>>
>> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
>> Acked-by: Morten Brørup <mb@smartsharesystems.com>
>> ---
> 
> With a long term perspective, I consider this patch very useful.
> And its 32 bit implementation can be optimized for various architectures/compilers later.
> 
> 
> In addition, it would be "nice to have" if reset() and fetch() could be called from another thread than the thread adding to the counter.
> 

reset() from a different thread, and you enter the "multiple writer" 
domain. Both reset and add need to be atomic over the whole 
read-modify-write cycle. (Add and reset are really no different.)

...unless you keep an offset per counter, per the discussion in the 
other thread, where I proposed something like that as a common counter 
API (in case we really needed MT safe reset).

It seems to me that we shouldn't provide a MT safe reset. By some means, 
the user must assure there is only a single writer (the "resetter" or 
the "adder").

> As previously discussed [1], I think it can be done without significantly affecting fast path add() performance, by using an "offset" with Release-Consume ordering.
> 
> [1]: https://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35E9F427@smartserver.smartshare.dk/
> 
> 
> rte_counter64_add(rte_counter64_t *counter, uint32_t val)
> {
> 	// Write "counter" with memory_order_relaxed, so
> 	// it eventually becomes visible in other threads.
> 
> 	rte_counter64_t ctr = *counter + val;
> 	rte_atomic_store_explicit(counter, ctr, rte_memory_order_relaxed);
> }
> 
> rte_counter64_get(rte_counter64_t *counter, rte_counter64_t *offset)
> {
> 	// Read "offset" with memory_order_consume, so:
> 	// - no reads or writes in the current thread dependent on "offset"
> 	//   can be reordered before this load, and
> 	// - writes to "counter" (a data-dependent variable)
> 	//   in other threads that release "offset" are visible in the current thread.
> 
> 	rte_counter64_t off = rte_atomic_load_explicit(offset, rte_memory_order_consume);
> 	rte_counter64_t ctr = rte_atomic_load_explicit(counter, rte_memory_order_relaxed);
> 
> 	return ctr - off;
> }
> 
> rte_counter64_reset(rte_counter64_t *counter, rte_counter64_t *offset)
> {
> 	// Write "offset" with memory_order_release, so
> 	// "counter" cannot be visible after it.
> 
> 	rte_counter64_t ctr = rte_atomic_load_explicit(counter, rte_memory_order_relaxed);
> 	rte_atomic_store_explicit(offset, ctr, rte_memory_order_release);
> }
> 
> 
> Support for counters shared by multi threads, e.g. rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed, should be provided too:
> 
> rte_counter64_mt_add(rte_counter64_t *counter, uint32_t val)
> {
> 	rte_atomic_fetch_add_explicit(counter, val, rte_memory_order_relaxed);
> }
> 
> 
>>   lib/eal/include/meson.build   |  1 +
>>   lib/eal/include/rte_counter.h | 91 +++++++++++++++++++++++++++++++++++
>>   2 files changed, 92 insertions(+)
>>   create mode 100644 lib/eal/include/rte_counter.h
>>
>> diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
>> index e94b056d46..c070dd0079 100644
>> --- a/lib/eal/include/meson.build
>> +++ b/lib/eal/include/meson.build
>> @@ -12,6 +12,7 @@ headers += files(
>>           'rte_class.h',
>>           'rte_common.h',
>>           'rte_compat.h',
>> +        'rte_counter.h',
>>           'rte_debug.h',
>>           'rte_dev.h',
>>           'rte_devargs.h',
>> diff --git a/lib/eal/include/rte_counter.h b/lib/eal/include/rte_counter.h
>> new file mode 100644
>> index 0000000000..8068d6d26e
>> --- /dev/null
>> +++ b/lib/eal/include/rte_counter.h
>> @@ -0,0 +1,91 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright (c) Stephen Hemminger <stephen@networkplumber.org>
>> + */
>> +
>> +#ifndef _RTE_COUNTER_H_
>> +#define _RTE_COUNTER_H_
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/**
>> + * @file
>> + * RTE Counter
>> + *
>> + * A counter is 64 bit value that is safe from split read/write
>> + * on 32 bit platforms. It assumes that only one cpu at a time
>> + * will update the counter, and another CPU may want to read it.
>> + *
>> + * This is a much weaker guarantee than full atomic variables
>> + * but is faster since no locked operations are required for update.
>> + */
>> +
>> +#include <stdatomic.h>
>> +
>> +#ifdef RTE_ARCH_64
>> +/*
>> + * On a platform that can support native 64 bit type, no special handling.
>> + * These are just wrapper around 64 bit value.
>> + */
>> +typedef uint64_t rte_counter64_t;
>> +
>> +/**
>> + * Add value to counter.
>> + */
>> +__rte_experimental
>> +static inline void
>> +rte_counter64_add(rte_counter64_t *counter, uint32_t val)
>> +{
>> +	*counter += val;
>> +}
>> +
>> +__rte_experimental
>> +static inline uint64_t
>> +rte_counter64_fetch(const rte_counter64_t *counter)
>> +{
>> +	return *counter;
>> +}
>> +
>> +__rte_experimental
>> +static inline void
>> +rte_counter64_reset(rte_counter64_t *counter)
>> +{
>> +	*counter = 0;
>> +}
>> +
>> +#else
>> +/*
>> + * On a 32 bit platform need to use atomic to force the compler to not
>> + * split 64 bit read/write.
>> + */
>> +typedef RTE_ATOMIC(uint64_t) rte_counter64_t;
>> +
>> +__rte_experimental
>> +static inline void
>> +rte_counter64_add(rte_counter64_t *counter, uint32_t val)
>> +{
>> +	rte_atomic_fetch_add_explicit(counter, val, rte_memory_order_relaxed);
>> +}
>> +
>> +__rte_experimental
>> +static inline uint64_t
>> +rte_counter64_fetch(const rte_counter64_t *counter)
>> +{
>> +	return rte_atomic_load_explicit(counter, rte_memory_order_relaxed);
>> +}
>> +
>> +__rte_experimental
>> +static inline void
>> +rte_counter64_reset(rte_counter64_t *counter)
>> +{
>> +	rte_atomic_store_explicit(counter, 0, rte_memory_order_relaxed);
>> +}
>> +#endif
>> +
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* _RTE_COUNTER_H_ */
>> --
>> 2.43.0
>
  

Patch

diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
index e94b056d46..c070dd0079 100644
--- a/lib/eal/include/meson.build
+++ b/lib/eal/include/meson.build
@@ -12,6 +12,7 @@  headers += files(
         'rte_class.h',
         'rte_common.h',
         'rte_compat.h',
+        'rte_counter.h',
         'rte_debug.h',
         'rte_dev.h',
         'rte_devargs.h',
diff --git a/lib/eal/include/rte_counter.h b/lib/eal/include/rte_counter.h
new file mode 100644
index 0000000000..8068d6d26e
--- /dev/null
+++ b/lib/eal/include/rte_counter.h
@@ -0,0 +1,91 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) Stephen Hemminger <stephen@networkplumber.org>
+ */
+
+#ifndef _RTE_COUNTER_H_
+#define _RTE_COUNTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file
+ * RTE Counter
+ *
+ * A counter is a 64 bit value that is safe from split read/write
+ * on 32 bit platforms. It assumes that only one CPU at a time
+ * will update the counter, and another CPU may want to read it.
+ *
+ * This is a much weaker guarantee than full atomic variables
+ * but is faster since no locked operations are required for update.
+ */
+
+#include <stdatomic.h>
+
+#ifdef RTE_ARCH_64
+/*
+ * On a platform that supports a native 64 bit type, no special handling is needed.
+ * These are just wrappers around a 64 bit value.
+ */
+typedef uint64_t rte_counter64_t;
+
+/**
+ * Add value to counter.
+ */
+__rte_experimental
+static inline void
+rte_counter64_add(rte_counter64_t *counter, uint32_t val)
+{
+	*counter += val;
+}
+
+__rte_experimental
+static inline uint64_t
+rte_counter64_fetch(const rte_counter64_t *counter)
+{
+	return *counter;
+}
+
+__rte_experimental
+static inline void
+rte_counter64_reset(rte_counter64_t *counter)
+{
+	*counter = 0;
+}
+
+#else
+/*
+ * On a 32 bit platform we need to use atomics to force the compiler not to
+ * split 64 bit read/write.
+ */
+typedef RTE_ATOMIC(uint64_t) rte_counter64_t;
+
+__rte_experimental
+static inline void
+rte_counter64_add(rte_counter64_t *counter, uint32_t val)
+{
+	rte_atomic_fetch_add_explicit(counter, val, rte_memory_order_relaxed);
+}
+
+__rte_experimental
+static inline uint64_t
+rte_counter64_fetch(const rte_counter64_t *counter)
+{
+	return rte_atomic_load_explicit(counter, rte_memory_order_relaxed);
+}
+
+__rte_experimental
+static inline void
+rte_counter64_reset(rte_counter64_t *counter)
+{
+	rte_atomic_store_explicit(counter, 0, rte_memory_order_relaxed);
+}
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_COUNTER_H_ */