[v4,1/2] eal: add WC store functions
diff mbox series

Message ID 1593681821-22357-2-git-send-email-radu.nicolau@intel.com
State Superseded
Delegated to: David Marchand
Headers show
Series
  • eal: add WC store functions
Related show

Checks

Context Check Description
ci/iol-testing success Testing PASS
ci/Intel-compilation fail Compilation issues
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/checkpatch success coding style OK

Commit Message

Radu Nicolau July 2, 2020, 9:23 a.m. UTC
Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v4: address feedback and include ack

 lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)

Comments

David Marchand July 3, 2020, 3:19 p.m. UTC | #1
On Thu, Jul 2, 2020 at 11:24 AM Radu Nicolau <radu.nicolau@intel.com> wrote:
>
> Add rte_write32_wc and rte_write32_wc_relaxed functions
> that implement 32bit stores using write combining memory protocol.
> Provided generic stubs and x86 implementation.
>
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
> v4: address feedback and include ack
>
>  lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
>  lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
>  2 files changed, 103 insertions(+)
>
> diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
> index da457f7..7391782 100644
> --- a/lib/librte_eal/include/generic/rte_io.h
> +++ b/lib/librte_eal/include/generic/rte_io.h
> @@ -229,6 +229,39 @@ rte_write32(uint32_t value, volatile void *addr);
>  static inline void
>  rte_write64(uint64_t value, volatile void *addr);
>
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static inline void
> +rte_write32_wc(uint32_t value, volatile void *addr);

This is a new API, and even if inlined, it should be marked experimental.

Is volatile necessary?


> +
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * The relaxed version does not have additional I/O memory barrier, useful in
> + * accessing the device registers of integrated controllers which implicitly
> + * strongly ordered with respect to memory access.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
> +
> +

Double empty line (there are some other in this patch that I won't flag again).


>  #endif /* __DOXYGEN__ */
>
>  #ifndef RTE_OVERRIDE_IO_H
> @@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
>         rte_write64_relaxed(value, addr);
>  }
>
> +#ifndef RTE_NATIVE_WRITE32_WC

Missing return type, this causes build failure on anything but x86.


> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       rte_write32(value, addr);
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       rte_write32_relaxed(value, addr);
> +}
> +#endif /* RTE_NATIVE_WRITE32_WC */
> +
> +
>  #endif /* RTE_OVERRIDE_IO_H */
>
>  #endif /* _RTE_IO_H_ */
> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
> index 2db71b1..c95ed67 100644
> --- a/lib/librte_eal/x86/include/rte_io.h
> +++ b/lib/librte_eal/x86/include/rte_io.h
> @@ -9,8 +9,64 @@
>  extern "C" {
>  #endif
>
> +#include "rte_cpuflags.h"

Inclusion of this header should be out of the extern "C" block.


> +
> +#define RTE_NATIVE_WRITE32_WC
>  #include "generic/rte_io.h"
>
> +/**
> + * @internal
> + * MOVDIRI wrapper.
> + */
> +static __rte_always_inline void
> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
> +{
> +       asm volatile(
> +               /* MOVDIRI */
> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +               :
> +               : "a" (value), "d" (addr));
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       static int _x86_movdiri_flag = -1;
> +       if (_x86_movdiri_flag == 1) {
> +               rte_wmb();
> +               _rte_x86_movdiri(value, addr);
> +       } else if (_x86_movdiri_flag == 0) {
> +               rte_write32(value, addr);
> +       } else {
> +               _x86_movdiri_flag =
> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);

Can't this cpu flag check be moved in a constructor?
This would avoid this copy/paste.


> +               if (_x86_movdiri_flag == 1) {
> +                       rte_wmb();
> +                       _rte_x86_movdiri(value, addr);
> +               } else {
> +                       rte_write32(value, addr);
> +               }
> +       }
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       static int _x86_movdiri_flag = -1;

Same check with a static variable with the same name.


I wonder if wrapping all of this in a single function would be more elegant.
Then rte_write32_wc(|_relaxed) would call it with a flag.


> +       if (_x86_movdiri_flag == 1) {
> +               _rte_x86_movdiri(value, addr);
> +       } else if (_x86_movdiri_flag == 0) {
> +               rte_write32_relaxed(value, addr);
> +       } else {
> +               _x86_movdiri_flag =
> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +               if (_x86_movdiri_flag == 1)
> +                       _rte_x86_movdiri(value, addr);
> +               else
> +                       rte_write32_relaxed(value, addr);
> +       }
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 2.7.4
>
Radu Nicolau July 6, 2020, 9:15 a.m. UTC | #2
Hi David, thanks for reviewing!

Some comments inline.


On 7/3/2020 4:19 PM, David Marchand wrote:
> On Thu, Jul 2, 2020 at 11:24 AM Radu Nicolau<radu.nicolau@intel.com>  wrote:
>> +static inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr);
> This is a new API, and even if inlined, it should be marked experimental.
Will do.
> Is volatile necessary?
Yes, most of these functions will be called on mmio addresses/volatile 
pointers. All other io functions have it.
>
> +static inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
> +
> +
> Double empty line (there are some other in this patch that I won't flag again).
I will check all occurrences.
>
>
>>   #endif /* __DOXYGEN__ */
>>
>>   #ifndef RTE_OVERRIDE_IO_H
>> @@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
>>          rte_write64_relaxed(value, addr);
>>   }
>>
>> +#ifndef RTE_NATIVE_WRITE32_WC
> Missing return type, this causes build failure on anything but x86.
Yes, got lost in the copy/paste. I will fix it in the next version
>
>
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32(value, addr);
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32_relaxed(value, addr);
>> +}
>> +#endif /* RTE_NATIVE_WRITE32_WC */
>> +
>> +
>>   #endif /* RTE_OVERRIDE_IO_H */
>>
>>   #endif /* _RTE_IO_H_ */
>> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
>> index 2db71b1..c95ed67 100644
>> --- a/lib/librte_eal/x86/include/rte_io.h
>> +++ b/lib/librte_eal/x86/include/rte_io.h
>> @@ -9,8 +9,64 @@
>>   extern "C" {
>>   #endif
>>
>> +#include "rte_cpuflags.h"
> Inclusion of this header should be out of the extern "C" block.
Why? It is used elsewhere inside the extern "C" block e.g. 
x86/rte_spinlock.h
>
>
>> +
>> +#define RTE_NATIVE_WRITE32_WC
>>   #include "generic/rte_io.h"
>>
>> +/**
>> + * @internal
>> + * MOVDIRI wrapper.
>> + */
>> +static __rte_always_inline void
>> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
>> +{
>> +       asm volatile(
>> +               /* MOVDIRI */
>> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
>> +               :
>> +               : "a" (value), "d" (addr));
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       static int _x86_movdiri_flag = -1;
>> +       if (_x86_movdiri_flag == 1) {
>> +               rte_wmb();
>> +               _rte_x86_movdiri(value, addr);
>> +       } else if (_x86_movdiri_flag == 0) {
>> +               rte_write32(value, addr);
>> +       } else {
>> +               _x86_movdiri_flag =
>> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> Can't this cpu flag check be moved in a constructor?
> This would avoid this copy/paste.
We evaluated this approach but it creates more problems than if fixes - 
it will need a variable that needs to be exported and there is no good 
place to put it.
>
>
>> +               if (_x86_movdiri_flag == 1) {
>> +                       rte_wmb();
>> +                       _rte_x86_movdiri(value, addr);
>> +               } else {
>> +                       rte_write32(value, addr);
>> +               }
>> +       }
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       static int _x86_movdiri_flag = -1;
> Same check with a static variable with the same name.
It should be no problem, they are static local variables.
>
>
> I wonder if wrapping all of this in a single function would be more elegant.
> Then rte_write32_wc(|_relaxed) would call it with a flag.
Yes, it will be more elegant but also it will cost more, it was written 
like this to minimize the number of branches taken for the movdiri path.

Patch
diff mbox series

diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..7391782 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,39 @@  rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have additional I/O memory barrier, useful in
+ * accessing the device registers of integrated controllers which implicitly
+ * strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +378,20 @@  rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@ 
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif