[v2,4/5] common/octeontx2: fix build with sve enabled
Checks
Commit Message
Building with gcc 10.2 with the SVE extension enabled produced the following errors:
{standard input}: Assembler messages:
{standard input}:4002: Error: selected processor does not support `mov z3.b,#0'
{standard input}:4003: Error: selected processor does not support `whilelo p1.b,xzr,x7'
{standard input}:4005: Error: selected processor does not support `ld1b z0.b,p1/z,[x8]'
{standard input}:4006: Error: selected processor does not support `whilelo p4.s,wzr,w7'
This is because the inline assembly code explicitly resets the CPU model to
one without SVE support. Thus, SVE instructions generated by compiler
auto-vectorization were rejected by the assembler.
Fixed the issue by replacing the inline assembly with equivalent atomic
built-ins. The compiler will generate LSE instructions for CPUs that have
the extension.
Fixes: 8a4f835971f5 ("common/octeontx2: add IO handling APIs")
Cc: jerinj@marvell.com
Cc: stable@dpdk.org
Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
drivers/common/octeontx2/otx2_io_arm64.h | 37 +++---------------------
1 file changed, 4 insertions(+), 33 deletions(-)
Comments
Hi Ruifeng,
>Building with gcc 10.2 with SVE extension enabled got error:
>
>{standard input}: Assembler messages:
>{standard input}:4002: Error: selected processor does not support `mov
>z3.b,#0'
>{standard input}:4003: Error: selected processor does not support
>`whilelo p1.b,xzr,x7'
>{standard input}:4005: Error: selected processor does not support `ld1b
>z0.b,p1/z,[x8]'
>{standard input}:4006: Error: selected processor does not support
>`whilelo p4.s,wzr,w7'
>
>This is because inline assembly code explicitly resets cpu model to
>not have SVE support. Thus SVE instructions generated by compiler
>auto vectorization got rejected by assembler.
>
>Fixed the issue by replacing inline assembly with equivalent atomic
>built-ins. Compiler will generate LSE instructions for cpu that has
>the extension.
>
>Fixes: 8a4f835971f5 ("common/octeontx2: add IO handling APIs")
>Cc: jerinj@marvell.com
>Cc: stable@dpdk.org
>
>Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
>---
> drivers/common/octeontx2/otx2_io_arm64.h | 37 +++--------------------
>-
> 1 file changed, 4 insertions(+), 33 deletions(-)
>
>diff --git a/drivers/common/octeontx2/otx2_io_arm64.h
>b/drivers/common/octeontx2/otx2_io_arm64.h
>index b5c85d9a6..8843a79b5 100644
>--- a/drivers/common/octeontx2/otx2_io_arm64.h
>+++ b/drivers/common/octeontx2/otx2_io_arm64.h
>@@ -24,55 +24,26 @@
> static __rte_always_inline uint64_t
> otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
> {
>- uint64_t result;
>-
> /* Atomic add with no ordering */
>- asm volatile (
>- ".cpu generic+lse\n"
>- "ldadd %x[i], %x[r], [%[b]]"
>- : [r] "=r" (result), "+m" (*ptr)
>- : [i] "r" (incr), [b] "r" (ptr)
>- : "memory");
>- return result;
>+ return (uint64_t)__atomic_fetch_add(ptr, incr,
>__ATOMIC_RELAXED);
> }
>
Here, LDADD acts as an interface to co-processors: the LDADD instruction
opcode combined with a specific I/O address is recognized by the HW
interceptor and dispatched to the corresponding co-processor.
Leaving it to the compiler to use the correct instruction is a bad idea.
This breaks the arm64_armv8_linux_gcc build, as it doesn't have
+lse enabled.
__atomic_fetch_add will generate a different instruction with SVE
enabled.
Instead can we add +sve to the first line to prevent outer loop from optimizing out
the trap?
I tested with 10.2 and n2 config below change works fine.
-" .cpu generic+lse\n"
+" .cpu generic+lse+sve\n"
Regards,
Pavan.
> static __rte_always_inline uint64_t
> otx2_atomic64_add_sync(int64_t incr, int64_t *ptr)
> {
>- uint64_t result;
>-
>- /* Atomic add with ordering */
>- asm volatile (
>- ".cpu generic+lse\n"
>- "ldadda %x[i], %x[r], [%[b]]"
>- : [r] "=r" (result), "+m" (*ptr)
>- : [i] "r" (incr), [b] "r" (ptr)
>- : "memory");
>- return result;
>+ return (uint64_t)__atomic_fetch_add(ptr, incr,
>__ATOMIC_ACQUIRE);
> }
>
> static __rte_always_inline uint64_t
> otx2_lmt_submit(rte_iova_t io_address)
> {
>- uint64_t result;
>-
>- asm volatile (
>- ".cpu generic+lse\n"
>- "ldeor xzr,%x[rf],[%[rs]]" :
>- [rf] "=r"(result): [rs] "r"(io_address));
>- return result;
>+ return __atomic_fetch_xor((uint64_t *)io_address, 0,
>__ATOMIC_RELAXED);
> }
>
> static __rte_always_inline uint64_t
> otx2_lmt_submit_release(rte_iova_t io_address)
> {
>- uint64_t result;
>-
>- asm volatile (
>- ".cpu generic+lse\n"
>- "ldeorl xzr,%x[rf],[%[rs]]" :
>- [rf] "=r"(result) : [rs] "r"(io_address));
>- return result;
>+ return __atomic_fetch_xor((uint64_t *)io_address, 0,
>__ATOMIC_RELEASE);
> }
>
> static __rte_always_inline void
>--
>2.25.1
> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>
> Sent: Friday, January 8, 2021 6:29 PM
> To: Ruifeng Wang <Ruifeng.Wang@arm.com>; jerinj@marvell.com; Nithin
> Kumar Dabilpuram <ndabilpuram@marvell.com>
> Cc: dev@dpdk.org; vladimir.medvedkin@intel.com;
> hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; stable@dpdk.org
> Subject: RE: [EXT] [PATCH v2 4/5] common/octeontx2: fix build with sve
> enabled
>
> Hi Ruifeng,
>
> >Building with gcc 10.2 with SVE extension enabled got error:
> >
> >{standard input}: Assembler messages:
> >{standard input}:4002: Error: selected processor does not support `mov
> >z3.b,#0'
> >{standard input}:4003: Error: selected processor does not support
> >`whilelo p1.b,xzr,x7'
> >{standard input}:4005: Error: selected processor does not support `ld1b
> >z0.b,p1/z,[x8]'
> >{standard input}:4006: Error: selected processor does not support
> >`whilelo p4.s,wzr,w7'
> >
> >This is because inline assembly code explicitly resets cpu model to not
> >have SVE support. Thus SVE instructions generated by compiler auto
> >vectorization got rejected by assembler.
> >
> >Fixed the issue by replacing inline assembly with equivalent atomic
> >built-ins. Compiler will generate LSE instructions for cpu that has the
> >extension.
> >
> >Fixes: 8a4f835971f5 ("common/octeontx2: add IO handling APIs")
> >Cc: jerinj@marvell.com
> >Cc: stable@dpdk.org
> >
> >Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> >---
> > drivers/common/octeontx2/otx2_io_arm64.h | 37 +++--------------------
> >-
> > 1 file changed, 4 insertions(+), 33 deletions(-)
> >
> >diff --git a/drivers/common/octeontx2/otx2_io_arm64.h
> >b/drivers/common/octeontx2/otx2_io_arm64.h
> >index b5c85d9a6..8843a79b5 100644
> >--- a/drivers/common/octeontx2/otx2_io_arm64.h
> >+++ b/drivers/common/octeontx2/otx2_io_arm64.h
> >@@ -24,55 +24,26 @@
> > static __rte_always_inline uint64_t
> > otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr) {
> >- uint64_t result;
> >-
> > /* Atomic add with no ordering */
> >- asm volatile (
> >- ".cpu generic+lse\n"
> >- "ldadd %x[i], %x[r], [%[b]]"
> >- : [r] "=r" (result), "+m" (*ptr)
> >- : [i] "r" (incr), [b] "r" (ptr)
> >- : "memory");
> >- return result;
> >+ return (uint64_t)__atomic_fetch_add(ptr, incr,
> >__ATOMIC_RELAXED);
> > }
> >
>
> Here LDADD acts as a way to interface to co-processors i.e.
> LDADD instruction opcode + specific io address are recognized by HW
> interceptor and dispatched to the specific coprocessor.
OK. Now I understand the background.
>
> Leaving it to the compiler to use the correct instruction is a bad idea.
> This breaks the arm64_armv8_linux_gcc build as it doesn't have the
> +lse enabled.
> __atomic_fetch_add will generate a different instruction with SVE enabled.
>
> Instead can we add +sve to the first line to prevent outer loop from
> optimizing out the trap?
Since the inline assembly needs to be preserved, we have to tune the enabled extensions.
I will change in next version.
Thanks,
Ruifeng
>
> I tested with 10.2 and n2 config below change works fine.
> -" .cpu generic+lse\n"
> +" .cpu generic+lse+sve\n"
>
> Regards,
> Pavan.
>
> > static __rte_always_inline uint64_t
> > otx2_atomic64_add_sync(int64_t incr, int64_t *ptr) {
> >- uint64_t result;
> >-
> >- /* Atomic add with ordering */
> >- asm volatile (
> >- ".cpu generic+lse\n"
> >- "ldadda %x[i], %x[r], [%[b]]"
> >- : [r] "=r" (result), "+m" (*ptr)
> >- : [i] "r" (incr), [b] "r" (ptr)
> >- : "memory");
> >- return result;
> >+ return (uint64_t)__atomic_fetch_add(ptr, incr,
> >__ATOMIC_ACQUIRE);
> > }
> >
> > static __rte_always_inline uint64_t
> > otx2_lmt_submit(rte_iova_t io_address) {
> >- uint64_t result;
> >-
> >- asm volatile (
> >- ".cpu generic+lse\n"
> >- "ldeor xzr,%x[rf],[%[rs]]" :
> >- [rf] "=r"(result): [rs] "r"(io_address));
> >- return result;
> >+ return __atomic_fetch_xor((uint64_t *)io_address, 0,
> >__ATOMIC_RELAXED);
> > }
> >
> > static __rte_always_inline uint64_t
> > otx2_lmt_submit_release(rte_iova_t io_address) {
> >- uint64_t result;
> >-
> >- asm volatile (
> >- ".cpu generic+lse\n"
> >- "ldeorl xzr,%x[rf],[%[rs]]" :
> >- [rf] "=r"(result) : [rs] "r"(io_address));
> >- return result;
> >+ return __atomic_fetch_xor((uint64_t *)io_address, 0,
> >__ATOMIC_RELEASE);
> > }
> >
> > static __rte_always_inline void
> >--
> >2.25.1
@@ -24,55 +24,26 @@
static __rte_always_inline uint64_t
otx2_atomic64_add_nosync(int64_t incr, int64_t *ptr)
{
- uint64_t result;
-
/* Atomic add with no ordering */
- asm volatile (
- ".cpu generic+lse\n"
- "ldadd %x[i], %x[r], [%[b]]"
- : [r] "=r" (result), "+m" (*ptr)
- : [i] "r" (incr), [b] "r" (ptr)
- : "memory");
- return result;
+ return (uint64_t)__atomic_fetch_add(ptr, incr, __ATOMIC_RELAXED);
}
static __rte_always_inline uint64_t
otx2_atomic64_add_sync(int64_t incr, int64_t *ptr)
{
- uint64_t result;
-
- /* Atomic add with ordering */
- asm volatile (
- ".cpu generic+lse\n"
- "ldadda %x[i], %x[r], [%[b]]"
- : [r] "=r" (result), "+m" (*ptr)
- : [i] "r" (incr), [b] "r" (ptr)
- : "memory");
- return result;
+ return (uint64_t)__atomic_fetch_add(ptr, incr, __ATOMIC_ACQUIRE);
}
static __rte_always_inline uint64_t
otx2_lmt_submit(rte_iova_t io_address)
{
- uint64_t result;
-
- asm volatile (
- ".cpu generic+lse\n"
- "ldeor xzr,%x[rf],[%[rs]]" :
- [rf] "=r"(result): [rs] "r"(io_address));
- return result;
+ return __atomic_fetch_xor((uint64_t *)io_address, 0, __ATOMIC_RELAXED);
}
static __rte_always_inline uint64_t
otx2_lmt_submit_release(rte_iova_t io_address)
{
- uint64_t result;
-
- asm volatile (
- ".cpu generic+lse\n"
- "ldeorl xzr,%x[rf],[%[rs]]" :
- [rf] "=r"(result) : [rs] "r"(io_address));
- return result;
+ return __atomic_fetch_xor((uint64_t *)io_address, 0, __ATOMIC_RELEASE);
}
static __rte_always_inline void