[v4,1/2] eal: add WC store functions

Message ID 1593681821-22357-2-git-send-email-radu.nicolau@intel.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series eal: add WC store functions |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/Intel-compilation fail Compilation issues
ci/iol-testing success Testing PASS

Commit Message

Radu Nicolau July 2, 2020, 9:23 a.m. UTC
  Add rte_write32_wc and rte_write32_wc_relaxed functions
that implement 32bit stores using write combining memory protocol.
Provided generic stubs and x86 implementation.

Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
---
v4: address feedback and include ack

 lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
 lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)
  

Comments

David Marchand July 3, 2020, 3:19 p.m. UTC | #1
On Thu, Jul 2, 2020 at 11:24 AM Radu Nicolau <radu.nicolau@intel.com> wrote:
>
> Add rte_write32_wc and rte_write32_wc_relaxed functions
> that implement 32bit stores using write combining memory protocol.
> Provided generic stubs and x86 implementation.
>
> Signed-off-by: Radu Nicolau <radu.nicolau@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
> v4: address feedback and include ack
>
>  lib/librte_eal/include/generic/rte_io.h | 47 +++++++++++++++++++++++++++
>  lib/librte_eal/x86/include/rte_io.h     | 56 +++++++++++++++++++++++++++++++++
>  2 files changed, 103 insertions(+)
>
> diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
> index da457f7..7391782 100644
> --- a/lib/librte_eal/include/generic/rte_io.h
> +++ b/lib/librte_eal/include/generic/rte_io.h
> @@ -229,6 +229,39 @@ rte_write32(uint32_t value, volatile void *addr);
>  static inline void
>  rte_write64(uint64_t value, volatile void *addr);
>
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static inline void
> +rte_write32_wc(uint32_t value, volatile void *addr);

This is a new API, and even if inlined, it should be marked experimental.

Is volatile necessary?


> +
> +/**
> + * Write a 32-bit value to I/O device memory address addr using write
> + * combining memory write protocol. Depending on the platform write combining
> + * may not be available and/or may be treated as a hint and the behavior may
> + * fallback to a regular store.
> + *
> + * The relaxed version does not have additional I/O memory barrier, useful in
> + * accessing the device registers of integrated controllers which implicitly
> + * strongly ordered with respect to memory access.
> + *
> + * @param value
> + *  Value to write
> + * @param addr
> + *  I/O memory address to write the value to
> + */
> +static inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
> +
> +

Double empty line (there are some other in this patch that I won't flag again).


>  #endif /* __DOXYGEN__ */
>
>  #ifndef RTE_OVERRIDE_IO_H
> @@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
>         rte_write64_relaxed(value, addr);
>  }
>
> +#ifndef RTE_NATIVE_WRITE32_WC

Missing return type, this causes build failure on anything but x86.


> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       rte_write32(value, addr);
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       rte_write32_relaxed(value, addr);
> +}
> +#endif /* RTE_NATIVE_WRITE32_WC */
> +
> +
>  #endif /* RTE_OVERRIDE_IO_H */
>
>  #endif /* _RTE_IO_H_ */
> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
> index 2db71b1..c95ed67 100644
> --- a/lib/librte_eal/x86/include/rte_io.h
> +++ b/lib/librte_eal/x86/include/rte_io.h
> @@ -9,8 +9,64 @@
>  extern "C" {
>  #endif
>
> +#include "rte_cpuflags.h"

Inclusion of this header should be out of the extern "C" block.


> +
> +#define RTE_NATIVE_WRITE32_WC
>  #include "generic/rte_io.h"
>
> +/**
> + * @internal
> + * MOVDIRI wrapper.
> + */
> +static __rte_always_inline void
> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
> +{
> +       asm volatile(
> +               /* MOVDIRI */
> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
> +               :
> +               : "a" (value), "d" (addr));
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc(uint32_t value, volatile void *addr)
> +{
> +       static int _x86_movdiri_flag = -1;
> +       if (_x86_movdiri_flag == 1) {
> +               rte_wmb();
> +               _rte_x86_movdiri(value, addr);
> +       } else if (_x86_movdiri_flag == 0) {
> +               rte_write32(value, addr);
> +       } else {
> +               _x86_movdiri_flag =
> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);

Can't this cpu flag check be moved in a constructor?
This would avoid this copy/paste.


> +               if (_x86_movdiri_flag == 1) {
> +                       rte_wmb();
> +                       _rte_x86_movdiri(value, addr);
> +               } else {
> +                       rte_write32(value, addr);
> +               }
> +       }
> +}
> +
> +static __rte_always_inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
> +{
> +       static int _x86_movdiri_flag = -1;

Same check with a static variable with the same name.


I wonder if wrapping all of this in a single function would be more elegant.
Then rte_write32_wc(|_relaxed) would call it with a flag.


> +       if (_x86_movdiri_flag == 1) {
> +               _rte_x86_movdiri(value, addr);
> +       } else if (_x86_movdiri_flag == 0) {
> +               rte_write32_relaxed(value, addr);
> +       } else {
> +               _x86_movdiri_flag =
> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> +               if (_x86_movdiri_flag == 1)
> +                       _rte_x86_movdiri(value, addr);
> +               else
> +                       rte_write32_relaxed(value, addr);
> +       }
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 2.7.4
>
  
Radu Nicolau July 6, 2020, 9:15 a.m. UTC | #2
Hi David, thanks for reviewing!

Some comments inline.


On 7/3/2020 4:19 PM, David Marchand wrote:
> On Thu, Jul 2, 2020 at 11:24 AM Radu Nicolau<radu.nicolau@intel.com>  wrote:
>> +static inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr);
> This is a new API, and even if inlined, it should be marked experimental.
Will do.
> Is volatile necessary?
Yes, most of these functions will be called on mmio addresses/volatile 
pointers. All other io functions have it.
>
> +static inline void
> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
> +
> +
> Double empty line (there are some other in this patch that I won't flag again).
I will check all occurrences.
>
>
>>   #endif /* __DOXYGEN__ */
>>
>>   #ifndef RTE_OVERRIDE_IO_H
>> @@ -345,6 +378,20 @@ rte_write64(uint64_t value, volatile void *addr)
>>          rte_write64_relaxed(value, addr);
>>   }
>>
>> +#ifndef RTE_NATIVE_WRITE32_WC
> Missing return type, this causes build failure on anything but x86.
Yes, got lost in the copy/paste. I will fix it in the next version
>
>
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32(value, addr);
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       rte_write32_relaxed(value, addr);
>> +}
>> +#endif /* RTE_NATIVE_WRITE32_WC */
>> +
>> +
>>   #endif /* RTE_OVERRIDE_IO_H */
>>
>>   #endif /* _RTE_IO_H_ */
>> diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
>> index 2db71b1..c95ed67 100644
>> --- a/lib/librte_eal/x86/include/rte_io.h
>> +++ b/lib/librte_eal/x86/include/rte_io.h
>> @@ -9,8 +9,64 @@
>>   extern "C" {
>>   #endif
>>
>> +#include "rte_cpuflags.h"
> Inclusion of this header should be out of the extern "C" block.
Why? It is used elsewhere inside the extern "C" block e.g. 
x86/rte_spinlock.h
>
>
>> +
>> +#define RTE_NATIVE_WRITE32_WC
>>   #include "generic/rte_io.h"
>>
>> +/**
>> + * @internal
>> + * MOVDIRI wrapper.
>> + */
>> +static __rte_always_inline void
>> +_rte_x86_movdiri(uint32_t value, volatile void *addr)
>> +{
>> +       asm volatile(
>> +               /* MOVDIRI */
>> +               ".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
>> +               :
>> +               : "a" (value), "d" (addr));
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc(uint32_t value, volatile void *addr)
>> +{
>> +       static int _x86_movdiri_flag = -1;
>> +       if (_x86_movdiri_flag == 1) {
>> +               rte_wmb();
>> +               _rte_x86_movdiri(value, addr);
>> +       } else if (_x86_movdiri_flag == 0) {
>> +               rte_write32(value, addr);
>> +       } else {
>> +               _x86_movdiri_flag =
>> +                       (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
> Can't this cpu flag check be moved in a constructor?
> This would avoid this copy/paste.
We evaluated this approach but it creates more problems than if fixes - 
it will need a variable that needs to be exported and there is no good 
place to put it.
>
>
>> +               if (_x86_movdiri_flag == 1) {
>> +                       rte_wmb();
>> +                       _rte_x86_movdiri(value, addr);
>> +               } else {
>> +                       rte_write32(value, addr);
>> +               }
>> +       }
>> +}
>> +
>> +static __rte_always_inline void
>> +rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
>> +{
>> +       static int _x86_movdiri_flag = -1;
> Same check with a static variable with the same name.
It should be no problem, they are static local variables.
>
>
> I wonder if wrapping all of this in a single function would be more elegant.
> Then rte_write32_wc(|_relaxed) would call it with a flag.
Yes, it will be more elegant but also it will cost more, it was written 
like this to minimize the number of branches taken for the movdiri path.
  

Patch

diff --git a/lib/librte_eal/include/generic/rte_io.h b/lib/librte_eal/include/generic/rte_io.h
index da457f7..7391782 100644
--- a/lib/librte_eal/include/generic/rte_io.h
+++ b/lib/librte_eal/include/generic/rte_io.h
@@ -229,6 +229,39 @@  rte_write32(uint32_t value, volatile void *addr);
 static inline void
 rte_write64(uint64_t value, volatile void *addr);
 
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc(uint32_t value, volatile void *addr);
+
+/**
+ * Write a 32-bit value to I/O device memory address addr using write
+ * combining memory write protocol. Depending on the platform write combining
+ * may not be available and/or may be treated as a hint and the behavior may
+ * fallback to a regular store.
+ *
+ * The relaxed version does not have additional I/O memory barrier, useful in
+ * accessing the device registers of integrated controllers which implicitly
+ * strongly ordered with respect to memory access.
+ *
+ * @param value
+ *  Value to write
+ * @param addr
+ *  I/O memory address to write the value to
+ */
+static inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr);
+
+
 #endif /* __DOXYGEN__ */
 
 #ifndef RTE_OVERRIDE_IO_H
@@ -345,6 +378,20 @@  rte_write64(uint64_t value, volatile void *addr)
 	rte_write64_relaxed(value, addr);
 }
 
+#ifndef RTE_NATIVE_WRITE32_WC
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	rte_write32(value, addr);
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	rte_write32_relaxed(value, addr);
+}
+#endif /* RTE_NATIVE_WRITE32_WC */
+
+
 #endif /* RTE_OVERRIDE_IO_H */
 
 #endif /* _RTE_IO_H_ */
diff --git a/lib/librte_eal/x86/include/rte_io.h b/lib/librte_eal/x86/include/rte_io.h
index 2db71b1..c95ed67 100644
--- a/lib/librte_eal/x86/include/rte_io.h
+++ b/lib/librte_eal/x86/include/rte_io.h
@@ -9,8 +9,64 @@ 
 extern "C" {
 #endif
 
+#include "rte_cpuflags.h"
+
+#define RTE_NATIVE_WRITE32_WC
 #include "generic/rte_io.h"
 
+/**
+ * @internal
+ * MOVDIRI wrapper.
+ */
+static __rte_always_inline void
+_rte_x86_movdiri(uint32_t value, volatile void *addr)
+{
+	asm volatile(
+		/* MOVDIRI */
+		".byte 0x40, 0x0f, 0x38, 0xf9, 0x02"
+		:
+		: "a" (value), "d" (addr));
+}
+
+static __rte_always_inline void
+rte_write32_wc(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		rte_wmb();
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1) {
+			rte_wmb();
+			_rte_x86_movdiri(value, addr);
+		} else {
+			rte_write32(value, addr);
+		}
+	}
+}
+
+static __rte_always_inline void
+rte_write32_wc_relaxed(uint32_t value, volatile void *addr)
+{
+	static int _x86_movdiri_flag = -1;
+	if (_x86_movdiri_flag == 1) {
+		_rte_x86_movdiri(value, addr);
+	} else if (_x86_movdiri_flag == 0) {
+		rte_write32_relaxed(value, addr);
+	} else {
+		_x86_movdiri_flag =
+			(rte_cpu_get_flag_enabled(RTE_CPUFLAG_MOVDIRI) > 0);
+		if (_x86_movdiri_flag == 1)
+			_rte_x86_movdiri(value, addr);
+		else
+			rte_write32_relaxed(value, addr);
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif