[dpdk-dev,2/3] arm64: acl: add neon based acl implementation

Message ID 1446473921-12706-3-git-send-email-jerin.jacob@caviumnetworks.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Jerin Jacob Nov. 2, 2015, 2:18 p.m. UTC
Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 app/test-acl/main.c           |   4 +
 lib/librte_acl/Makefile       |   5 +
 lib/librte_acl/acl.h          |   4 +
 lib/librte_acl/acl_run_neon.c |  46 +++++++
 lib/librte_acl/acl_run_neon.h | 290 ++++++++++++++++++++++++++++++++++++++++++
 lib/librte_acl/rte_acl.c      |  25 ++++
 lib/librte_acl/rte_acl.h      |   1 +
 7 files changed, 375 insertions(+)
 create mode 100644 lib/librte_acl/acl_run_neon.c
 create mode 100644 lib/librte_acl/acl_run_neon.h
  

Comments

Jan Viktorin Nov. 2, 2015, 3:39 p.m. UTC | #1
On Mon, 2 Nov 2015 19:48:40 +0530
Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:

> Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> ---
>  app/test-acl/main.c           |   4 +
>  lib/librte_acl/Makefile       |   5 +
>  lib/librte_acl/acl.h          |   4 +
>  lib/librte_acl/acl_run_neon.c |  46 +++++++
>  lib/librte_acl/acl_run_neon.h | 290 ++++++++++++++++++++++++++++++++++++++++++
>  lib/librte_acl/rte_acl.c      |  25 ++++
>  lib/librte_acl/rte_acl.h      |   1 +
>  7 files changed, 375 insertions(+)
>  create mode 100644 lib/librte_acl/acl_run_neon.c
>  create mode 100644 lib/librte_acl/acl_run_neon.h
> 
> diff --git a/app/test-acl/main.c b/app/test-acl/main.c
> index 72ce83c..0b0c093 100644
> --- a/app/test-acl/main.c
> +++ b/app/test-acl/main.c
> @@ -101,6 +101,10 @@ static const struct acl_alg acl_alg[] = {
>  		.name = "avx2",
>  		.alg = RTE_ACL_CLASSIFY_AVX2,
>  	},
> +	{
> +		.name = "neon",
> +		.alg = RTE_ACL_CLASSIFY_NEON,
> +	},
>  };
>  
>  static struct {
> diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
> index 7a1cf8a..27f91d5 100644
> --- a/lib/librte_acl/Makefile
> +++ b/lib/librte_acl/Makefile
> @@ -48,9 +48,14 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
>  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
>  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
>  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
> +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
> +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c

Are the used NEON instrinsics for ACL ARMv8-specific? If so, the file should be named
something like acl_run_neonv8.c...

> +else
>  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
> +endif
>  
>  CFLAGS_acl_run_sse.o += -msse4.1
> +CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized

From man gcc:

-flax-vector-conversions
 Allow implicit conversions between vectors with differing numbers of elements and/or
 incompatible element types.  This option should not be used for new code.

I've already pointed to this in the Dave's ARMv8 patchset. They dropped it silently.
What is the purpose? Is it necessary?

Jan

>  
>  #
>  # If the compiler supports AVX2 instructions,
> diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
> index eb4930c..09d6784 100644
> --- a/lib/librte_acl/acl.h
> +++ b/lib/librte_acl/acl.h
> @@ -230,6 +230,10 @@ int
>  rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
>  	uint32_t *results, uint32_t num, uint32_t categories);
>  
--snip--
  
Jerin Jacob Nov. 2, 2015, 4:19 p.m. UTC | #2
On Mon, Nov 02, 2015 at 04:39:37PM +0100, Jan Viktorin wrote:
> On Mon, 2 Nov 2015 19:48:40 +0530
> Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> 
> > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > ---
> >  app/test-acl/main.c           |   4 +
> >  lib/librte_acl/Makefile       |   5 +
> >  lib/librte_acl/acl.h          |   4 +
> >  lib/librte_acl/acl_run_neon.c |  46 +++++++
> >  lib/librte_acl/acl_run_neon.h | 290 ++++++++++++++++++++++++++++++++++++++++++
> >  lib/librte_acl/rte_acl.c      |  25 ++++
> >  lib/librte_acl/rte_acl.h      |   1 +
> >  7 files changed, 375 insertions(+)
> >  create mode 100644 lib/librte_acl/acl_run_neon.c
> >  create mode 100644 lib/librte_acl/acl_run_neon.h
> > 
> > diff --git a/app/test-acl/main.c b/app/test-acl/main.c
> > index 72ce83c..0b0c093 100644
> > --- a/app/test-acl/main.c
> > +++ b/app/test-acl/main.c
> > @@ -101,6 +101,10 @@ static const struct acl_alg acl_alg[] = {
> >  		.name = "avx2",
> >  		.alg = RTE_ACL_CLASSIFY_AVX2,
> >  	},
> > +	{
> > +		.name = "neon",
> > +		.alg = RTE_ACL_CLASSIFY_NEON,
> > +	},
> >  };
> >  
> >  static struct {
> > diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
> > index 7a1cf8a..27f91d5 100644
> > --- a/lib/librte_acl/Makefile
> > +++ b/lib/librte_acl/Makefile
> > @@ -48,9 +48,14 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
> >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
> >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
> >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
> > +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
> > +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c
> 
> Are the used NEON instrinsics for ACL ARMv8-specific? If so, the file should be named
> something like acl_run_neonv8.c...
> 

Yes, bit of armv8 specific, looks like vqtbl1q_u8 NEON instrinsics
defined only in armv8. I could rename to acl_run_neonv8.c but keeping
as acl_run_neon.c, may in future it can be extend to armv7 also.
I am open to any decision, let me know your views.

> > +else
> >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
> > +endif
> >  
> >  CFLAGS_acl_run_sse.o += -msse4.1
> > +CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized
> 
> From man gcc:
> 
> -flax-vector-conversions
>  Allow implicit conversions between vectors with differing numbers of elements and/or
>  incompatible element types.  This option should not be used for new code.
> 
> I've already pointed to this in the Dave's ARMv8 patchset. They dropped it silently.
> What is the purpose? Is it necessary?

Yes, the same tr hi value we can representing as unsigned and signed
based on it DFA or QRANGE .


> 
> Jan
> 
> >  
> >  #
> >  # If the compiler supports AVX2 instructions,
> > diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
> > index eb4930c..09d6784 100644
> > --- a/lib/librte_acl/acl.h
> > +++ b/lib/librte_acl/acl.h
> > @@ -230,6 +230,10 @@ int
> >  rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
> >  	uint32_t *results, uint32_t num, uint32_t categories);
> >  
> --snip--
> 
> -- 
>    Jan Viktorin                  E-mail: Viktorin@RehiveTech.com
>    System Architect              Web:    www.RehiveTech.com
>    RehiveTech
>    Brno, Czech Republic
  
Ananyev, Konstantin Nov. 2, 2015, 4:54 p.m. UTC | #3
Hi Jacob,

> diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
> index d60219f..e2fdebd 100644
> --- a/lib/librte_acl/rte_acl.c
> +++ b/lib/librte_acl/rte_acl.c
> @@ -55,11 +55,32 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
>  	return -ENOTSUP;
>  }
> 
> +int __attribute__ ((weak))
> +rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
> +	__rte_unused const uint8_t **data,
> +	__rte_unused uint32_t *results,
> +	__rte_unused uint32_t num,
> +	__rte_unused uint32_t categories)
> +{
> +	return -ENOTSUP;
> +}
> +
> +int __attribute__ ((weak))
> +rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
> +	__rte_unused const uint8_t **data,
> +	__rte_unused uint32_t *results,
> +	__rte_unused uint32_t num,
> +	__rte_unused uint32_t categories)
> +{
> +	return -ENOTSUP;
> +}
> +
>  static const rte_acl_classify_t classify_fns[] = {
>  	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
>  	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
>  	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
>  	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
> +	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
>  };
> 
>  /* by default, use always available scalar code path. */
> @@ -93,6 +114,9 @@ rte_acl_init(void)
>  {
>  	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
> 
> +#ifdef RTE_ARCH_ARM64
> +	alg =  RTE_ACL_CLASSIFY_NEON;
> +#else

On ARM, is there any specific cpu flag that you can use to determine is NEON
isa is supported or not?
It would be good to avoid extra conditional compilation here if possible.
Another question - did I get it right that NEON isa is supported on all
possible RTE_ARCH_ARM64 cpu models you plan to support?
Konstantin
  
Jan Viktorin Nov. 2, 2015, 5:31 p.m. UTC | #4
On Mon, 2 Nov 2015 21:49:54 +0530
Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:

> On Mon, Nov 02, 2015 at 04:39:37PM +0100, Jan Viktorin wrote:
> > On Mon, 2 Nov 2015 19:48:40 +0530
> > Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> >   
> > > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > ---
> > >  app/test-acl/main.c           |   4 +
> > >  lib/librte_acl/Makefile       |   5 +
> > >  lib/librte_acl/acl.h          |   4 +
> > >  lib/librte_acl/acl_run_neon.c |  46 +++++++
> > >  lib/librte_acl/acl_run_neon.h | 290 ++++++++++++++++++++++++++++++++++++++++++
> > >  lib/librte_acl/rte_acl.c      |  25 ++++
> > >  lib/librte_acl/rte_acl.h      |   1 +
> > >  7 files changed, 375 insertions(+)
> > >  create mode 100644 lib/librte_acl/acl_run_neon.c
> > >  create mode 100644 lib/librte_acl/acl_run_neon.h
> > > 
> > > diff --git a/app/test-acl/main.c b/app/test-acl/main.c
> > > index 72ce83c..0b0c093 100644
> > > --- a/app/test-acl/main.c
> > > +++ b/app/test-acl/main.c
> > > @@ -101,6 +101,10 @@ static const struct acl_alg acl_alg[] = {
> > >  		.name = "avx2",
> > >  		.alg = RTE_ACL_CLASSIFY_AVX2,
> > >  	},
> > > +	{
> > > +		.name = "neon",
> > > +		.alg = RTE_ACL_CLASSIFY_NEON,
> > > +	},
> > >  };
> > >  
> > >  static struct {
> > > diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
> > > index 7a1cf8a..27f91d5 100644
> > > --- a/lib/librte_acl/Makefile
> > > +++ b/lib/librte_acl/Makefile
> > > @@ -48,9 +48,14 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
> > >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
> > >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
> > >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
> > > +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
> > > +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c  
> > 
> > Are the used NEON instrinsics for ACL ARMv8-specific? If so, the file should be named
> > something like acl_run_neonv8.c...
> >   
> 
> Yes, bit of armv8 specific, looks like vqtbl1q_u8 NEON instrinsics
> defined only in armv8. I could rename to acl_run_neonv8.c but keeping
> as acl_run_neon.c, may in future it can be extend to armv7 also.
> I am open to any decision, let me know your views.

OK, this sounds reasonable. Leave it as it is.

> 
> > > +else
> > >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
> > > +endif
> > >  
> > >  CFLAGS_acl_run_sse.o += -msse4.1
> > > +CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized  
> > 
> > From man gcc:
> > 
> > -flax-vector-conversions
> >  Allow implicit conversions between vectors with differing numbers of elements and/or
> >  incompatible element types.  This option should not be used for new code.
> > 
> > I've already pointed to this in the Dave's ARMv8 patchset. They dropped it silently.
> > What is the purpose? Is it necessary?  
> 
> Yes, the same tr hi value we can representing as unsigned and signed
> based on it DFA or QRANGE .

I don't understand your answer. What is "tr hi"? What means DFA and
QRANGE here?

I just wanted to point to the note: "This option should not be used for
new code."

Jan

> 
> 
> > 
> > Jan
> >   
> > >  
> > >  #
> > >  # If the compiler supports AVX2 instructions,
> > > diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
> > > index eb4930c..09d6784 100644
> > > --- a/lib/librte_acl/acl.h
> > > +++ b/lib/librte_acl/acl.h
> > > @@ -230,6 +230,10 @@ int
> > >  rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
> > >  	uint32_t *results, uint32_t num, uint32_t categories);
> > >    
> > --snip--
> > 
> > -- 
> >    Jan Viktorin                  E-mail: Viktorin@RehiveTech.com
> >    System Architect              Web:    www.RehiveTech.com
> >    RehiveTech
> >    Brno, Czech Republic
  
Jerin Jacob Nov. 3, 2015, 4:30 a.m. UTC | #5
On Mon, Nov 02, 2015 at 04:54:24PM +0000, Ananyev, Konstantin wrote:
> Hi Jacob,
> 
> > diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
> > index d60219f..e2fdebd 100644
> > --- a/lib/librte_acl/rte_acl.c
> > +++ b/lib/librte_acl/rte_acl.c
> > @@ -55,11 +55,32 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
> >  	return -ENOTSUP;
> >  }
> > 
> > +int __attribute__ ((weak))
> > +rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
> > +	__rte_unused const uint8_t **data,
> > +	__rte_unused uint32_t *results,
> > +	__rte_unused uint32_t num,
> > +	__rte_unused uint32_t categories)
> > +{
> > +	return -ENOTSUP;
> > +}
> > +
> > +int __attribute__ ((weak))
> > +rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
> > +	__rte_unused const uint8_t **data,
> > +	__rte_unused uint32_t *results,
> > +	__rte_unused uint32_t num,
> > +	__rte_unused uint32_t categories)
> > +{
> > +	return -ENOTSUP;
> > +}
> > +
> >  static const rte_acl_classify_t classify_fns[] = {
> >  	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
> >  	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
> >  	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
> >  	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
> > +	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
> >  };
> > 
> >  /* by default, use always available scalar code path. */
> > @@ -93,6 +114,9 @@ rte_acl_init(void)
> >  {
> >  	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
> > 
> > +#ifdef RTE_ARCH_ARM64
> > +	alg =  RTE_ACL_CLASSIFY_NEON;
> > +#else

Hi Konstantin,
> 
> On ARM, is there any specific cpu flag that you can use to determine is NEON
> isa is supported or not?

Yes, on armv7(RTE_CPUFLAG_NEON). On armv8-a NEON is mandatory.

> It would be good to avoid extra conditional compilation here if possible.
neon acl is verified/ported only on armv8. While adding the armv7 support the
check can be extended for cpuflag based on RTE_CPUFLAG_NEON on armv7

> Another question - did I get it right that NEON isa is supported on all
> possible RTE_ARCH_ARM64 cpu models you plan to support?

Yes

> Konstantin
> 
>
  
Ananyev, Konstantin Nov. 3, 2015, 10:23 a.m. UTC | #6
Hi Jacob,

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Tuesday, November 03, 2015 4:31 AM
> To: Ananyev, Konstantin
> Cc: dev@dpdk.org; thomas.monjalon@6wind.com; Hunt, David; viktorin@rehivetech.com
> Subject: Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
> 
> On Mon, Nov 02, 2015 at 04:54:24PM +0000, Ananyev, Konstantin wrote:
> > Hi Jacob,
> >
> > > diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
> > > index d60219f..e2fdebd 100644
> > > --- a/lib/librte_acl/rte_acl.c
> > > +++ b/lib/librte_acl/rte_acl.c
> > > @@ -55,11 +55,32 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
> > >  	return -ENOTSUP;
> > >  }
> > >
> > > +int __attribute__ ((weak))
> > > +rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
> > > +	__rte_unused const uint8_t **data,
> > > +	__rte_unused uint32_t *results,
> > > +	__rte_unused uint32_t num,
> > > +	__rte_unused uint32_t categories)
> > > +{
> > > +	return -ENOTSUP;
> > > +}
> > > +
> > > +int __attribute__ ((weak))
> > > +rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
> > > +	__rte_unused const uint8_t **data,
> > > +	__rte_unused uint32_t *results,
> > > +	__rte_unused uint32_t num,
> > > +	__rte_unused uint32_t categories)
> > > +{
> > > +	return -ENOTSUP;
> > > +}
> > > +
> > >  static const rte_acl_classify_t classify_fns[] = {
> > >  	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
> > >  	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
> > >  	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
> > >  	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
> > > +	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
> > >  };
> > >
> > >  /* by default, use always available scalar code path. */
> > > @@ -93,6 +114,9 @@ rte_acl_init(void)
> > >  {
> > >  	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
> > >
> > > +#ifdef RTE_ARCH_ARM64
> > > +	alg =  RTE_ACL_CLASSIFY_NEON;
> > > +#else
> 
> Hi Konstantin,
> >
> > On ARM, is there any specific cpu flag that you can use to determine is NEON
> > isa is supported or not?
> 
> Yes, on armv7(RTE_CPUFLAG_NEON). On armv8-a NEON is mandatory.
> 
> > It would be good to avoid extra conditional compilation here if possible.
> neon acl is verified/ported only on armv8. While adding the armv7 support the
> check can be extended for cpuflag based on RTE_CPUFLAG_NEON on armv7

Ok, and is there a flag that allows to distinguish between armv7 and arm8 then at runtime?
It is probably ok like that, but with all that conditional compilations it gets too messy.
Another thing - if you can distinguish between armv7 and armv8 at runtime, then you probably
can set alg = RTE_ACL_CLASSIFY_DEFAULT for armv7 and alg = RTE_ACL_CLASSIFY_NEON for arm8?
Konstantin

> 
> > Another question - did I get it right that NEON isa is supported on all
> > possible RTE_ARCH_ARM64 cpu models you plan to support?
> 
> Yes
> 
> > Konstantin
> >
> >
  
Jan Viktorin Nov. 3, 2015, 10:35 a.m. UTC | #7
On Tue, 3 Nov 2015 10:23:55 +0000
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:

> > Hi Konstantin,  
> > >
> > > On ARM, is there any specific cpu flag that you can use to determine is NEON
> > > isa is supported or not?  
> > 
> > Yes, on armv7(RTE_CPUFLAG_NEON). On armv8-a NEON is mandatory.
> >   
> > > It would be good to avoid extra conditional compilation here if possible.  
> > neon acl is verified/ported only on armv8. While adding the armv7 support the
> > check can be extended for cpuflag based on RTE_CPUFLAG_NEON on armv7  
> 
> Ok, and is there a flag that allows to distinguish between armv7 and arm8 then at runtime?
> It is probably ok like that, but with all that conditional compilations it gets too messy.
> Another thing - if you can distinguish between armv7 and armv8 at runtime, then you probably
> can set alg = RTE_ACL_CLASSIFY_DEFAULT for armv7 and alg = RTE_ACL_CLASSIFY_NEON for arm8?
> Konstantin

The ARMv7 provides "v7l" in the AT_PLATFORM and the ARMv8 gives
"aarach64". However, I am afraid, as these are two distinct platforms
(with distinct cpuflags), we cannot use the cpuflags easily for this.

Is there an API in DPDK which allows to ask globally: "what platform am
I running on?"?

Jan

> 
> >   
> > > Another question - did I get it right that NEON isa is supported on all
> > > possible RTE_ARCH_ARM64 cpu models you plan to support?  
> > 
> > Yes
  
Ananyev, Konstantin Nov. 3, 2015, 1:20 p.m. UTC | #8
> -----Original Message-----
> From: Jan Viktorin [mailto:viktorin@rehivetech.com]
> Sent: Tuesday, November 03, 2015 10:36 AM
> To: Ananyev, Konstantin
> Cc: Jerin Jacob; dev@dpdk.org; thomas.monjalon@6wind.com; Hunt, David
> Subject: Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
> 
> On Tue, 3 Nov 2015 10:23:55 +0000
> "Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:
> 
> > > Hi Konstantin,
> > > >
> > > > On ARM, is there any specific cpu flag that you can use to determine is NEON
> > > > isa is supported or not?
> > >
> > > Yes, on armv7(RTE_CPUFLAG_NEON). On armv8-a NEON is mandatory.
> > >
> > > > It would be good to avoid extra conditional compilation here if possible.
> > > neon acl is verified/ported only on armv8. While adding the armv7 support the
> > > check can be extended for cpuflag based on RTE_CPUFLAG_NEON on armv7
> >
> > Ok, and is there a flag that allows to distinguish between armv7 and arm8 then at runtime?
> > It is probably ok like that, but with all that conditional compilations it gets too messy.
> > Another thing - if you can distinguish between armv7 and armv8 at runtime, then you probably
> > can set alg = RTE_ACL_CLASSIFY_DEFAULT for armv7 and alg = RTE_ACL_CLASSIFY_NEON for arm8?
> > Konstantin
> 
> The ARMv7 provides "v7l" in the AT_PLATFORM and the ARMv8 gives
> "aarach64". However, I am afraid, as these are two distinct platforms
> (with distinct cpuflags), we cannot use the cpuflags easily for this.
> 

Ok, if you think it is unavoidable - let's keep it like that for now.

> Is there an API in DPDK which allows to ask globally: "what platform am
> I running on?"?

Not that I am aware about.
Konstantin

> 
> Jan
> 
> >
> > >
> > > > Another question - did I get it right that NEON isa is supported on all
> > > > possible RTE_ARCH_ARM64 cpu models you plan to support?
> > >
> > > Yes
> 
> 
> --
>   Jan Viktorin                E-mail: Viktorin@RehiveTech.com
>   System Architect            Web:    www.RehiveTech.com
>   RehiveTech
>   Brno, Czech Republic
  

Patch

diff --git a/app/test-acl/main.c b/app/test-acl/main.c
index 72ce83c..0b0c093 100644
--- a/app/test-acl/main.c
+++ b/app/test-acl/main.c
@@ -101,6 +101,10 @@  static const struct acl_alg acl_alg[] = {
 		.name = "avx2",
 		.alg = RTE_ACL_CLASSIFY_AVX2,
 	},
+	{
+		.name = "neon",
+		.alg = RTE_ACL_CLASSIFY_NEON,
+	},
 };
 
 static struct {
diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
index 7a1cf8a..27f91d5 100644
--- a/lib/librte_acl/Makefile
+++ b/lib/librte_acl/Makefile
@@ -48,9 +48,14 @@  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
+ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c
+else
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
+endif
 
 CFLAGS_acl_run_sse.o += -msse4.1
+CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized
 
 #
 # If the compiler supports AVX2 instructions,
diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index eb4930c..09d6784 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -230,6 +230,10 @@  int
 rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t num, uint32_t categories);
 
+int
+rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories);
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
diff --git a/lib/librte_acl/acl_run_neon.c b/lib/librte_acl/acl_run_neon.c
new file mode 100644
index 0000000..b014451
--- /dev/null
+++ b/lib/librte_acl/acl_run_neon.c
@@ -0,0 +1,46 @@ 
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "acl_run_neon.h"
+
+int
+rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data,
+		      uint32_t *results, uint32_t num, uint32_t categories)
+{
+	if (likely(num >= 8))
+		return search_neon_8(ctx, data, results, num, categories);
+	else if (num >= 4)
+		return search_neon_4(ctx, data, results, num, categories);
+	else
+		return rte_acl_classify_scalar(ctx, data, results, num,
+			categories);
+}
diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h
new file mode 100644
index 0000000..4579476
--- /dev/null
+++ b/lib/librte_acl/acl_run_neon.h
@@ -0,0 +1,290 @@ 
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "acl_run.h"
+#include "acl_vect.h"
+
+struct _neon_acl_const {
+	rte_xmm_t xmm_shuffle_input;
+	rte_xmm_t xmm_index_mask;
+	rte_xmm_t range_base;
+} neon_acl_const  __attribute__((aligned(RTE_CACHE_LINE_SIZE))) = {
+	{
+		.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}
+	},
+	{
+		.u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX}
+	},
+	{
+		.u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c}
+	},
+};
+
+/*
+ * Resolve priority for multiple results (neon version).
+ * This consists comparing the priority of the current traversal with the
+ * running set of results for the packet.
+ * For each result, keep a running array of the result (rule number) and
+ * its priority for each category.
+ */
+static inline void
+resolve_priority_neon(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
+		      struct parms *parms,
+		      const struct rte_acl_match_results *p,
+		      uint32_t categories)
+{
+	uint32_t x;
+	int32x4_t results, priority, results1, priority1;
+	uint32x4_t selector;
+	int32_t *saved_results, *saved_priority;
+
+	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
+		saved_results = (int32_t *)(&parms[n].cmplt->results[x]);
+		saved_priority = (int32_t *)(&parms[n].cmplt->priority[x]);
+
+		/* get results and priorities for completed trie */
+		results = vld1q_s32(
+			(const int32_t *)&p[transition].results[x]);
+		priority = vld1q_s32(
+			(const int32_t *)&p[transition].priority[x]);
+
+		/* if this is not the first completed trie */
+		if (parms[n].cmplt->count != ctx->num_tries) {
+			/* get running best results and their priorities */
+			results1 = vld1q_s32(saved_results);
+			priority1 = vld1q_s32(saved_priority);
+
+			/* select results that are highest priority */
+			selector = vcgtq_s32(priority1, priority);
+			results = vbslq_s32(selector, results1, results);
+			priority = vbslq_s32(selector, priority1, priority);
+		}
+
+		/* save running best results and their priorities */
+		vst1q_s32(saved_results, results);
+		vst1q_s32(saved_priority, priority);
+	}
+}
+
+/*
+ * Check for any match in 4 transitions
+ */
+static inline __attribute__((always_inline)) uint32_t
+check_any_match_x4(uint64_t val[])
+{
+	return ((val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH);
+}
+
+static inline __attribute__((always_inline)) void
+acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
+		   struct acl_flow_data *flows, uint64_t transitions[])
+{
+	while (check_any_match_x4(transitions)) {
+		transitions[0] = acl_match_check(transitions[0], slot, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[1] = acl_match_check(transitions[1], slot + 1, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[2] = acl_match_check(transitions[2], slot + 2, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[3] = acl_match_check(transitions[3], slot + 3, ctx,
+			parms, flows, resolve_priority_neon);
+	}
+}
+
+/*
+ * Process 4 transitions (in 2 NEON Q registers) in parallel
+ */
+static inline __attribute__((always_inline)) int32x4_t
+transition4(int32x4_t next_input, const uint64_t *trans, uint64_t transitions[])
+{
+	int32x4x2_t tr_hi_lo;
+	int32x4_t t, in, r;
+	uint32x4_t index_msk, node_type, addr;
+	uint32x4_t dfa_msk, mask, quad_ofs, dfa_ofs;
+
+	/* Move low 32 into tr_hi_lo.val[0] and high 32 into tr_hi_lo.val[1] */
+	tr_hi_lo = vld2q_s32((const int32_t *)transitions);
+
+	/* Calculate the address (array index) for all 4 transitions. */
+
+	index_msk = vld1q_u32((const uint32_t *)&neon_acl_const.xmm_index_mask);
+
+	/* Calc node type and node addr */
+	node_type = vbicq_s32(tr_hi_lo.val[0], index_msk);
+	addr = vandq_s32(tr_hi_lo.val[0], index_msk);
+
+	/* t = 0 */
+	t = veorq_s32(node_type, node_type);
+
+	/* mask for DFA type(0) nodes */
+	dfa_msk = vceqq_u32(node_type, t);
+
+	mask = vld1q_s32((const int32_t *)&neon_acl_const.xmm_shuffle_input);
+	in = vqtbl1q_u8((uint8x16_t)next_input, (uint8x16_t)mask);
+
+	/* DFA calculations. */
+	r = vshrq_n_u32(in, 30); /* div by 64 */
+	mask = vld1q_s32((const int32_t *)&neon_acl_const.range_base);
+	r = vaddq_u8(r, mask);
+	t = vshrq_n_u32(in, 24);
+	r = vqtbl1q_u8((uint8x16_t)tr_hi_lo.val[1], (uint8x16_t)r);
+	dfa_ofs = vsubq_s32(t, r);
+
+	/* QUAD/SINGLE calculations. */
+	t = vcgtq_s8(in, tr_hi_lo.val[1]);
+	t = vabsq_s8(t);
+	t = vpaddlq_u8(t);
+	quad_ofs = vpaddlq_u16(t);
+
+	/* blend DFA and QUAD/SINGLE. */
+	t = vbslq_u8(dfa_msk, dfa_ofs, quad_ofs);
+
+	/* calculate address for next transitions */
+	addr = vaddq_u32(addr, t);
+
+	/* Fill next transitions */
+	transitions[0] = trans[vgetq_lane_u32(addr, 0)];
+	transitions[1] = trans[vgetq_lane_u32(addr, 1)];
+	transitions[2] = trans[vgetq_lane_u32(addr, 2)];
+	transitions[3] = trans[vgetq_lane_u32(addr, 3)];
+
+	return vshrq_n_u32(next_input, CHAR_BIT);
+}
+
+/*
+ * Execute trie traversal with 8 traversals in parallel
+ */
+static inline int
+search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	      uint32_t *results, uint32_t total_packets, uint32_t categories)
+{
+	int n;
+	struct acl_flow_data flows;
+	uint64_t index_array[8];
+	struct completion cmplt[8];
+	struct parms parms[8];
+	int32x4_t input0, input1;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		     total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < 8; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	 /* Check for any matches. */
+	acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
+	acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);
+
+	while (flows.started > 0) {
+		/* Gather 4 bytes of input data for each stream. */
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3);
+
+		/* Process the 4 bytes of input on each stream. */
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		 /* Check for any matches. */
+		acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
+		acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);
+	}
+
+	return 0;
+}
+
+/*
+ * Execute trie traversal with 4 traversals in parallel
+ */
+static inline int
+search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	      uint32_t *results, int total_packets, uint32_t categories)
+{
+	int n;
+	struct acl_flow_data flows;
+	uint64_t index_array[4];
+	struct completion cmplt[4];
+	struct parms parms[4];
+	int32x4_t input;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		     total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < 4; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	/* Check for any matches. */
+	acl_match_check_x4(0, ctx, parms, &flows, index_array);
+
+	while (flows.started > 0) {
+		/* Gather 4 bytes of input data for each stream. */
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
+
+		/* Process the 4 bytes of input on each stream. */
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+
+		/* Check for any matches. */
+		acl_match_check_x4(0, ctx, parms, &flows, index_array);
+	}
+
+	return 0;
+}
+
diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
index d60219f..e2fdebd 100644
--- a/lib/librte_acl/rte_acl.c
+++ b/lib/librte_acl/rte_acl.c
@@ -55,11 +55,32 @@  rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
 	return -ENOTSUP;
 }
 
+int __attribute__ ((weak))
+rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
+	__rte_unused const uint8_t **data,
+	__rte_unused uint32_t *results,
+	__rte_unused uint32_t num,
+	__rte_unused uint32_t categories)
+{
+	return -ENOTSUP;
+}
+
+int __attribute__ ((weak))
+rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
+	__rte_unused const uint8_t **data,
+	__rte_unused uint32_t *results,
+	__rte_unused uint32_t num,
+	__rte_unused uint32_t categories)
+{
+	return -ENOTSUP;
+}
+
 static const rte_acl_classify_t classify_fns[] = {
 	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
 	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
 	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
 	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
+	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
 };
 
 /* by default, use always available scalar code path. */
@@ -93,6 +114,9 @@  rte_acl_init(void)
 {
 	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
 
+#ifdef RTE_ARCH_ARM64
+	alg =  RTE_ACL_CLASSIFY_NEON;
+#else
 #ifdef CC_AVX2_SUPPORT
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
 		alg = RTE_ACL_CLASSIFY_AVX2;
@@ -102,6 +126,7 @@  rte_acl_init(void)
 #endif
 		alg = RTE_ACL_CLASSIFY_SSE;
 
+#endif
 	rte_acl_set_default_classify(alg);
 }
 
diff --git a/lib/librte_acl/rte_acl.h b/lib/librte_acl/rte_acl.h
index 98ef2fc..0979a09 100644
--- a/lib/librte_acl/rte_acl.h
+++ b/lib/librte_acl/rte_acl.h
@@ -270,6 +270,7 @@  enum rte_acl_classify_alg {
 	RTE_ACL_CLASSIFY_SCALAR = 1,  /**< generic implementation. */
 	RTE_ACL_CLASSIFY_SSE = 2,     /**< requires SSE4.1 support. */
 	RTE_ACL_CLASSIFY_AVX2 = 3,    /**< requires AVX2 support. */
+	RTE_ACL_CLASSIFY_NEON = 4,    /**< requires NEON support. */
 	RTE_ACL_CLASSIFY_NUM          /* should always be the last one. */
 };