[RFC] lpm: add sve support for lookup on Arm platform

Message ID 20201218101210.356836-1-ruifeng.wang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series [RFC] lpm: add sve support for lookup on Arm platform |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Ruifeng Wang Dec. 18, 2020, 10:12 a.m. UTC
  Added a new path to do lpm4 lookup by using the scalable vector extension (SVE).
The SVE path will be selected if the compiler has the SVE flag set.

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/librte_eal/arm/include/rte_vect.h |  3 +
 lib/librte_lpm/meson.build            |  2 +-
 lib/librte_lpm/rte_lpm.h              |  4 ++
 lib/librte_lpm/rte_lpm_sve.h          | 83 +++++++++++++++++++++++++++
 4 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_lpm/rte_lpm_sve.h
  

Comments

Vladimir Medvedkin Jan. 5, 2021, 3:44 p.m. UTC | #1
Hi Ruifeng,

Thanks for the patch, see comments below

On 18/12/2020 10:12, Ruifeng Wang wrote:
> Added new path to do lpm4 lookup by using scalable vector extension.
> The SVE path will be selected if compiler has flag SVE set.
> 
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>   lib/librte_eal/arm/include/rte_vect.h |  3 +
>   lib/librte_lpm/meson.build            |  2 +-
>   lib/librte_lpm/rte_lpm.h              |  4 ++
>   lib/librte_lpm/rte_lpm_sve.h          | 83 +++++++++++++++++++++++++++
>   4 files changed, 91 insertions(+), 1 deletion(-)
>   create mode 100644 lib/librte_lpm/rte_lpm_sve.h
> 
> diff --git a/lib/librte_eal/arm/include/rte_vect.h b/lib/librte_eal/arm/include/rte_vect.h
> index a739e6e66..093e9122a 100644
> --- a/lib/librte_eal/arm/include/rte_vect.h
> +++ b/lib/librte_eal/arm/include/rte_vect.h
> @@ -9,6 +9,9 @@
>   #include "generic/rte_vect.h"
>   #include "rte_debug.h"
>   #include "arm_neon.h"
> +#ifdef __ARM_FEATURE_SVE
> +#include <arm_sve.h>
> +#endif
>   
>   #ifdef __cplusplus
>   extern "C" {
> diff --git a/lib/librte_lpm/meson.build b/lib/librte_lpm/meson.build
> index 6cfc083c5..f93c86640 100644
> --- a/lib/librte_lpm/meson.build
> +++ b/lib/librte_lpm/meson.build
> @@ -5,6 +5,6 @@ sources = files('rte_lpm.c', 'rte_lpm6.c')
>   headers = files('rte_lpm.h', 'rte_lpm6.h')
>   # since header files have different names, we can install all vector headers
>   # without worrying about which architecture we actually need
> -headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h', 'rte_lpm_sse.h')
> +headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h', 'rte_lpm_sse.h', 'rte_lpm_sve.h')
>   deps += ['hash']
>   deps += ['rcu']
> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> index 1afe55cdc..28b57683b 100644
> --- a/lib/librte_lpm/rte_lpm.h
> +++ b/lib/librte_lpm/rte_lpm.h
> @@ -402,7 +402,11 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>   	uint32_t defv);
>   
>   #if defined(RTE_ARCH_ARM)
> +#ifdef __ARM_FEATURE_SVE
> +#include "rte_lpm_sve.h"
> +#else
>   #include "rte_lpm_neon.h"
> +#endif
>   #elif defined(RTE_ARCH_PPC_64)
>   #include "rte_lpm_altivec.h"
>   #else
> diff --git a/lib/librte_lpm/rte_lpm_sve.h b/lib/librte_lpm/rte_lpm_sve.h
> new file mode 100644
> index 000000000..86576ec52
> --- /dev/null
> +++ b/lib/librte_lpm/rte_lpm_sve.h
> @@ -0,0 +1,83 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Arm Limited
> + */
> +
> +#ifndef _RTE_LPM_SVE_H_
> +#define _RTE_LPM_SVE_H_
> +
> +#include <rte_vect.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +__rte_internal
> +static void
> +__rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips,
> +		uint32_t *__rte_restrict next_hops, const uint32_t n)
> +{
> +	uint32_t i = 0;
> +	svuint32_t v_ip, v_idx, v_tbl24, v_tbl8, v_hop;
> +	svuint32_t v_mask_xv, v_mask_v, v_mask_hop;
> +	svbool_t pg = svwhilelt_b32(i, n);
> +	svbool_t pv;
> +
> +	do {
> +		v_ip = svld1(pg, &ips[i]);
> +		/* Get indices for tbl24[] */
> +		v_idx = svlsr_x(pg, v_ip, 8);
> +		/* Extract values from tbl24[] */
> +		v_tbl24 = svld1_gather_index(pg, (const uint32_t *)lpm->tbl24,
> +						v_idx);
> +
> +		/* Create mask with valid set */
> +		v_mask_v = svdup_u32_z(pg, RTE_LPM_LOOKUP_SUCCESS);
> +		/* Create mask with valid and valid_group set */
> +		v_mask_xv = svdup_u32_z(pg, RTE_LPM_VALID_EXT_ENTRY_BITMASK);
> +		/* Create predicate for tbl24 entries: (valid && !valid_group) */
> +		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_v);
> +		/* Create mask for next_hop in table entry */
> +		v_mask_hop = svdup_u32_z(pg, 0x00ffffff);
> +		/* Extract next_hop and write back */
> +		v_hop = svand_x(pv, v_tbl24, v_mask_hop);
> +		svst1(pv, &next_hops[i], v_hop);
> +
> +		/* Update predicate for tbl24 entries: (valid && valid_group) */
> +		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_xv);
> +		/* Compute tbl8 index */
> +		v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xff));

Looks like it should be
v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff));
because we are using 24 bits to keep the tbl8 group index.


> +		v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
> +		v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)),
> +				v_idx);
> +		/* Extract values from tbl8[] */
> +		v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
> +						v_idx);
> +		/* Update predicate for tbl8 entries: (valid) */
> +		pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
> +		/* Extract next_hop and write back */
> +		v_hop = svand_x(pv, v_tbl8, v_mask_hop);
> +		svst1(pv, &next_hops[i], v_hop);

I'm not an expert, but probably it would be better to merge the two stores
(svst1) into a single one?

> +
> +		i += svlen(v_ip);
> +		pg = svwhilelt_b32(i, n);

Isn't it better to move the predicate calculation to the beginning of 
the loop and just do {} while (i < n)?

> +	} while (svptest_any(svptrue_b32(), pg));
> +}
> +
> +static inline void
> +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +		uint32_t defv)
> +{
> +	uint32_t i, ips[4];
> +
> +	vst1q_s32((int32_t *)ips, ip);
> +	for (i = 0; i < 4; i++)
> +		hop[i] = defv;
> +
> +	__rte_lpm_lookup_vec(lpm, ips, hop, 4);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_LPM_SVE_H_ */
>
  
Ruifeng Wang Jan. 6, 2021, 10:11 a.m. UTC | #2
> -----Original Message-----
> From: Medvedkin, Vladimir <vladimir.medvedkin@intel.com>
> Sent: Tuesday, January 5, 2021 11:44 PM
> To: Ruifeng Wang <Ruifeng.Wang@arm.com>; Jan Viktorin
> <viktorin@rehivetech.com>; jerinj@marvell.com; Bruce Richardson
> <bruce.richardson@intel.com>
> Cc: dev@dpdk.org; hemant.agrawal@nxp.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>
> Subject: Re: [RFC PATCH] lpm: add sve support for lookup on Arm platform
> 
> Hi Ruifeng,
> 
> Thanks for the patch, see comments below

Hi Vladimir,
Thank you for your review.

> 
> On 18/12/2020 10:12, Ruifeng Wang wrote:
> > Added new path to do lpm4 lookup by using scalable vector extension.
> > The SVE path will be selected if compiler has flag SVE set.
> >
> > Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > ---
> >   lib/librte_eal/arm/include/rte_vect.h |  3 +
> >   lib/librte_lpm/meson.build            |  2 +-
> >   lib/librte_lpm/rte_lpm.h              |  4 ++
> >   lib/librte_lpm/rte_lpm_sve.h          | 83 +++++++++++++++++++++++++++
> >   4 files changed, 91 insertions(+), 1 deletion(-)
> >   create mode 100644 lib/librte_lpm/rte_lpm_sve.h
> >
> > diff --git a/lib/librte_eal/arm/include/rte_vect.h
> > b/lib/librte_eal/arm/include/rte_vect.h
> > index a739e6e66..093e9122a 100644
> > --- a/lib/librte_eal/arm/include/rte_vect.h
> > +++ b/lib/librte_eal/arm/include/rte_vect.h
> > @@ -9,6 +9,9 @@
> >   #include "generic/rte_vect.h"
> >   #include "rte_debug.h"
> >   #include "arm_neon.h"
> > +#ifdef __ARM_FEATURE_SVE
> > +#include <arm_sve.h>
> > +#endif
> >
> >   #ifdef __cplusplus
> >   extern "C" {
> > diff --git a/lib/librte_lpm/meson.build b/lib/librte_lpm/meson.build
> > index 6cfc083c5..f93c86640 100644
> > --- a/lib/librte_lpm/meson.build
> > +++ b/lib/librte_lpm/meson.build
> > @@ -5,6 +5,6 @@ sources = files('rte_lpm.c', 'rte_lpm6.c')
> >   headers = files('rte_lpm.h', 'rte_lpm6.h')
> >   # since header files have different names, we can install all vector headers
> >   # without worrying about which architecture we actually need
> > -headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h',
> > 'rte_lpm_sse.h')
> > +headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h',
> > +'rte_lpm_sse.h', 'rte_lpm_sve.h')
> >   deps += ['hash']
> >   deps += ['rcu']
> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h index
> > 1afe55cdc..28b57683b 100644
> > --- a/lib/librte_lpm/rte_lpm.h
> > +++ b/lib/librte_lpm/rte_lpm.h
> > @@ -402,7 +402,11 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm,
> xmm_t ip, uint32_t hop[4],
> >   	uint32_t defv);
> >
> >   #if defined(RTE_ARCH_ARM)
> > +#ifdef __ARM_FEATURE_SVE
> > +#include "rte_lpm_sve.h"
> > +#else
> >   #include "rte_lpm_neon.h"
> > +#endif
> >   #elif defined(RTE_ARCH_PPC_64)
> >   #include "rte_lpm_altivec.h"
> >   #else
> > diff --git a/lib/librte_lpm/rte_lpm_sve.h
> > b/lib/librte_lpm/rte_lpm_sve.h new file mode 100644 index
> > 000000000..86576ec52
> > --- /dev/null
> > +++ b/lib/librte_lpm/rte_lpm_sve.h
> > @@ -0,0 +1,83 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2020 Arm Limited
> > + */
> > +
> > +#ifndef _RTE_LPM_SVE_H_
> > +#define _RTE_LPM_SVE_H_
> > +
> > +#include <rte_vect.h>
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +__rte_internal
> > +static void
> > +__rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips,
> > +		uint32_t *__rte_restrict next_hops, const uint32_t n) {
> > +	uint32_t i = 0;
> > +	svuint32_t v_ip, v_idx, v_tbl24, v_tbl8, v_hop;
> > +	svuint32_t v_mask_xv, v_mask_v, v_mask_hop;
> > +	svbool_t pg = svwhilelt_b32(i, n);
> > +	svbool_t pv;
> > +
> > +	do {
> > +		v_ip = svld1(pg, &ips[i]);
> > +		/* Get indices for tbl24[] */
> > +		v_idx = svlsr_x(pg, v_ip, 8);
> > +		/* Extract values from tbl24[] */
> > +		v_tbl24 = svld1_gather_index(pg, (const uint32_t *)lpm-
> >tbl24,
> > +						v_idx);
> > +
> > +		/* Create mask with valid set */
> > +		v_mask_v = svdup_u32_z(pg, RTE_LPM_LOOKUP_SUCCESS);
> > +		/* Create mask with valid and valid_group set */
> > +		v_mask_xv = svdup_u32_z(pg,
> RTE_LPM_VALID_EXT_ENTRY_BITMASK);
> > +		/* Create predicate for tbl24 entries: (valid && !valid_group)
> */
> > +		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv),
> v_mask_v);
> > +		/* Create mask for next_hop in table entry */
> > +		v_mask_hop = svdup_u32_z(pg, 0x00ffffff);
> > +		/* Extract next_hop and write back */
> > +		v_hop = svand_x(pv, v_tbl24, v_mask_hop);
> > +		svst1(pv, &next_hops[i], v_hop);
> > +
> > +		/* Update predicate for tbl24 entries: (valid && valid_group)
> */
> > +		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv),
> v_mask_xv);
> > +		/* Compute tbl8 index */
> > +		v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xff));
> 
> Looks like it should be
> v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xffffff)); because we are
> using 24 bits to keep the tbl8 group index.

Yes, the mask should be 0xffffff.

Also noticed there is a common issue in all the vector lookup implementations (NEON/SSE/ALTIVEC).
I'll correct this and fix other vector implementations in next version.

> 
> 
> > +		v_idx = svmul_x(pv, v_idx,
> RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
> > +		v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv,
> 0xff)),
> > +				v_idx);
> > +		/* Extract values from tbl8[] */
> > +		v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
> > +						v_idx);
> > +		/* Update predicate for tbl8 entries: (valid) */
> > +		pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
> > +		/* Extract next_hop and write back */
> > +		v_hop = svand_x(pv, v_tbl8, v_mask_hop);
> > +		svst1(pv, &next_hops[i], v_hop);
> 
> I'm not an expert, but probably it would be better to merge two stores
> (svst1) into a single one?

I think we can keep the current implementation.
In most cases, tbl24 entries will not be expanded, so the SVE predicate for tbl8 processing will be all-zero.
Operations on tbl8 will then effectively be no-ops.
I think it is better not to mix the two stores (from tbl24 and from tbl8).
> 
> > +
> > +		i += svlen(v_ip);
> > +		pg = svwhilelt_b32(i, n);
> 
> Isn't it better to move the predicate calculation to the beginning of the loop
> and just do {} while (i < n)?

Yes, that also works.
I think testing SVE predicates is the recommended way to write a vector-length-agnostic loop.
It is more generic and flexible.
> 
> > +	} while (svptest_any(svptrue_b32(), pg)); }
> > +
> > +static inline void
> > +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +		uint32_t defv)
> > +{
> > +	uint32_t i, ips[4];
> > +
> > +	vst1q_s32((int32_t *)ips, ip);
> > +	for (i = 0; i < 4; i++)
> > +		hop[i] = defv;
> > +
> > +	__rte_lpm_lookup_vec(lpm, ips, hop, 4); }
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* _RTE_LPM_SVE_H_ */
> >
> 
> --
> Regards,
> Vladimir
  

Patch

diff --git a/lib/librte_eal/arm/include/rte_vect.h b/lib/librte_eal/arm/include/rte_vect.h
index a739e6e66..093e9122a 100644
--- a/lib/librte_eal/arm/include/rte_vect.h
+++ b/lib/librte_eal/arm/include/rte_vect.h
@@ -9,6 +9,9 @@ 
 #include "generic/rte_vect.h"
 #include "rte_debug.h"
 #include "arm_neon.h"
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/lib/librte_lpm/meson.build b/lib/librte_lpm/meson.build
index 6cfc083c5..f93c86640 100644
--- a/lib/librte_lpm/meson.build
+++ b/lib/librte_lpm/meson.build
@@ -5,6 +5,6 @@  sources = files('rte_lpm.c', 'rte_lpm6.c')
 headers = files('rte_lpm.h', 'rte_lpm6.h')
 # since header files have different names, we can install all vector headers
 # without worrying about which architecture we actually need
-headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h', 'rte_lpm_sse.h')
+headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h', 'rte_lpm_sse.h', 'rte_lpm_sve.h')
 deps += ['hash']
 deps += ['rcu']
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index 1afe55cdc..28b57683b 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -402,7 +402,11 @@  rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 	uint32_t defv);
 
 #if defined(RTE_ARCH_ARM)
+#ifdef __ARM_FEATURE_SVE
+#include "rte_lpm_sve.h"
+#else
 #include "rte_lpm_neon.h"
+#endif
 #elif defined(RTE_ARCH_PPC_64)
 #include "rte_lpm_altivec.h"
 #else
diff --git a/lib/librte_lpm/rte_lpm_sve.h b/lib/librte_lpm/rte_lpm_sve.h
new file mode 100644
index 000000000..86576ec52
--- /dev/null
+++ b/lib/librte_lpm/rte_lpm_sve.h
@@ -0,0 +1,83 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Arm Limited
+ */
+
+#ifndef _RTE_LPM_SVE_H_
+#define _RTE_LPM_SVE_H_
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+__rte_internal
+static void
+__rte_lpm_lookup_vec(const struct rte_lpm *lpm, const uint32_t *ips,
+		uint32_t *__rte_restrict next_hops, const uint32_t n)
+{
+	uint32_t i = 0;
+	svuint32_t v_ip, v_idx, v_tbl24, v_tbl8, v_hop;
+	svuint32_t v_mask_xv, v_mask_v, v_mask_hop;
+	svbool_t pg = svwhilelt_b32(i, n);
+	svbool_t pv;
+
+	do {
+		v_ip = svld1(pg, &ips[i]);
+		/* Get indices for tbl24[] */
+		v_idx = svlsr_x(pg, v_ip, 8);
+		/* Extract values from tbl24[] */
+		v_tbl24 = svld1_gather_index(pg, (const uint32_t *)lpm->tbl24,
+						v_idx);
+
+		/* Create mask with valid set */
+		v_mask_v = svdup_u32_z(pg, RTE_LPM_LOOKUP_SUCCESS);
+		/* Create mask with valid and valid_group set */
+		v_mask_xv = svdup_u32_z(pg, RTE_LPM_VALID_EXT_ENTRY_BITMASK);
+		/* Create predicate for tbl24 entries: (valid && !valid_group) */
+		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_v);
+		/* Create mask for next_hop in table entry */
+		v_mask_hop = svdup_u32_z(pg, 0x00ffffff);
+		/* Extract next_hop and write back */
+		v_hop = svand_x(pv, v_tbl24, v_mask_hop);
+		svst1(pv, &next_hops[i], v_hop);
+
+		/* Update predicate for tbl24 entries: (valid && valid_group) */
+		pv = svcmpeq(pg, svand_z(pg, v_tbl24, v_mask_xv), v_mask_xv);
+		/* Compute tbl8 index */
+		v_idx = svand_x(pv, v_tbl24, svdup_u32_z(pv, 0xff));
+		v_idx = svmul_x(pv, v_idx, RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
+		v_idx = svadd_x(pv, svand_x(pv, v_ip, svdup_u32_z(pv, 0xff)),
+				v_idx);
+		/* Extract values from tbl8[] */
+		v_tbl8 = svld1_gather_index(pv, (const uint32_t *)lpm->tbl8,
+						v_idx);
+		/* Update predicate for tbl8 entries: (valid) */
+		pv = svcmpeq(pv, svand_z(pv, v_tbl8, v_mask_v), v_mask_v);
+		/* Extract next_hop and write back */
+		v_hop = svand_x(pv, v_tbl8, v_mask_hop);
+		svst1(pv, &next_hops[i], v_hop);
+
+		i += svlen(v_ip);
+		pg = svwhilelt_b32(i, n);
+	} while (svptest_any(svptrue_b32(), pg));
+}
+
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+		uint32_t defv)
+{
+	uint32_t i, ips[4];
+
+	vst1q_s32((int32_t *)ips, ip);
+	for (i = 0; i < 4; i++)
+		hop[i] = defv;
+
+	__rte_lpm_lookup_vec(lpm, ips, hop, 4);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_SVE_H_ */