[v3] eal: fix unaligned loads/stores in rte_memcpy_generic
Commit Message
Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to qword, dword,
or word pointers and dereference them without verifying the alignment
of src/dst. The code was changed to use inline assembly for these
load/store operations; unaligned loads and stores are permitted in
x86/x64 assembly.
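To illustrate the flagged pattern outside the patch, here is a minimal
sketch (the helper names are hypothetical):

#include <stdint.h>
#include <string.h>

/* Undefined behavior: if p is not suitably aligned for uint32_t, this
 * dereference violates the C standard, even though the x86 mov it
 * compiles to handles unaligned addresses fine. */
static uint32_t load32_cast(const void *p)
{
	return *(const uint32_t *)p;
}

/* Well-defined at the C level: a fixed-size memcpy, which compilers
 * typically lower to the same single mov. The patch uses inline
 * assembly instead. */
static uint32_t load32_memcpy(const void *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));
	return v;
}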
Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li@intel.com>
Cc: stable@dpdk.org
Signed-off-by: Luc Pelletier <lucp.at.work@gmail.com>
---
Please note that I didn't write the entire function in inline assembly.
I kept the bitwise ANDs as C code so that the optimizer can remove the
branches when n is known at compile time.
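For example, at a hypothetical call site where the size is a
compile-time constant, the masks fold away:

#include <stdint.h>
#include <rte_memcpy.h>

void copy10(uint8_t *dst, const uint8_t *src)
{
	/* n == 10 is known at compile time: after inlining, (10 & 8)
	 * and (10 & 2) are nonzero so those copies remain, while the
	 * (10 & 4) and (10 & 1) branches are eliminated entirely. */
	rte_memcpy(dst, src, 10);
}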
lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++--------------
1 file changed, 72 insertions(+), 62 deletions(-)
Comments
As a side note, and to follow up on Stephen's point that this is
'performance critical code', I think it might be worthwhile to
revisit/revalidate the current implementation of rte_memcpy. There's a
good thread here that mentions rte_memcpy, and its performance on at
least one platform/architecture combination is far from the best:

https://github.com/microsoft/mimalloc/issues/201
It seems that enhanced rep movsb (ERMS) could be faster on recent
CPUs, but the current implementation of rte_memcpy does not use it.
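For reference, a rep movsb based copy is tiny in GNU C inline
assembly. This is only a sketch with a hypothetical function name;
real use would also need runtime dispatch on the CPUID ERMS bit:

static inline void
copy_rep_movsb(void *dst, const void *src, size_t n)
{
	/* rep movsb copies n bytes from (%rsi) to (%rdi); on CPUs
	 * advertising ERMS (CPUID.(EAX=7,ECX=0):EBX bit 9) it is
	 * often competitive with vectorized copy loops for medium
	 * and large sizes. */
	asm volatile ("rep movsb"
		      : "+D" (dst), "+S" (src), "+c" (n)
		      :
		      : "memory");
}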
I understand some of this may not be directly related to this patch,
but whoever looks at it might want to share their thoughts on whether
updating rte_memcpy would be worthwhile. I suspect that surveying the
current public implementations of memcpy (libc, Microsoft's, compiler
builtins, etc.) would help in making improvements.
On Sun, 16 Jan 2022 09:33:19 -0500
Luc Pelletier <lucp.at.work@gmail.com> wrote:
> [...]
> It seems that enhanced rep movsb (ERMS) could be faster on recent
> CPUs, but the current implementation of rte_memcpy does not use it.
I would prefer that rte_memcpy did not exist at all.
Instead, the system library should always be used.

It only exists because some architectures have slower code
in glibc.
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Sunday, 16 January 2022 17.34
>
> [...]
> It only exists because some architectures have slower code
> in glibc.
I wonder if that is still the case?
Otherwise, DPDK is probably full of obsolete optimizations, which should be eliminated like this:
http://inbox.dpdk.org/dev/20210918114930.245387-1-mail@gms.tf/
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..b99c1b2ca5 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,75 @@ extern "C" {
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
+#if defined(__i386__)
+ #define RTE_ACCUMULATOR_REGISTER_NAME "eax"
+#elif defined(__x86_64__)
+ #define RTE_ACCUMULATOR_REGISTER_NAME "rax"
+#endif
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+ void *ret = dst;
+ if (n & 8) {
+ asm (
+#if defined(__i386__)
+ "movl (%[src]), %%eax\n"
+ "movl %%eax, (%[dst])\n"
+ "add $4, %[src]\n"
+ "add $4, %[dst]\n"
+ "movl (%[src]), %%eax\n"
+ "movl %%eax, (%[dst])\n"
+ "add $4, %[src]\n"
+ "add $4, %[dst]\n"
+#elif defined(__x86_64__)
+ "movq (%[src]), %%rax\n"
+ "movq %%rax, (%[dst])\n"
+ "add $8, %[src]\n"
+ "add $8, %[dst]\n"
+#else
+ #error Unsupported architecture
+#endif
+ : [dst] "+r" (dst), [src] "+r" (src)
+ :
+ : RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+ }
+ if (n & 4) {
+ asm (
+ "movl (%[src]), %%eax\n"
+ "movl %%eax, (%[dst])\n"
+ "add $4, %[src]\n"
+ "add $4, %[dst]\n"
+ : [dst] "+r" (dst), [src] "+r" (src)
+ :
+ : RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+ }
+ if (n & 2) {
+ asm (
+ "movw (%[src]), %%ax\n"
+ "movw %%ax, (%[dst])\n"
+ "add $2, %[src]\n"
+ "add $2, %[dst]\n"
+ : [dst] "+r" (dst), [src] "+r" (src)
+ :
+ : RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+ }
+ if (n & 1) {
+ asm (
+ "movb (%[src]), %%al\n"
+ "movb %%al, (%[dst])\n"
+ : [dst] "+r" (dst), [src] "+r" (src)
+ :
+ : RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+ }
+ return ret;
+}
+
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
#define ALIGNMENT_MASK 0x3F
@@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
@@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08)
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**
@@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
@@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08) {
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- }
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**
@@ -672,8 +702,6 @@ static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t srcofs;
@@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08) {
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- }
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**