Message ID | 20220510115824.457885-1-kda@semihalf.com |
---|---|
State | Superseded |
Delegated to: | David Marchand |
Series | [1/1] lpm: add a scalar version of lookupx4 function |
Context | Check | Description |
---|---|---|
ci/intel-Testing | success | Testing PASS |
ci/Intel-compilation | success | Compilation OK |
ci/iol-intel-Performance | success | Performance Testing PASS |
ci/iol-intel-Functional | success | Functional Testing PASS |
ci/iol-aarch64-unit-testing | success | Testing PASS |
ci/iol-mellanox-Performance | success | Performance Testing PASS |
ci/iol-aarch64-compile-testing | success | Testing PASS |
ci/github-robot: build | success | github build: passed |
ci/checkpatch | success | coding style OK |
Hi Stanislaw, Michal,

As far as I can see, this implementation almost completely repeats the other
lookupx4() implementations, except for the use of vector instructions.

On my board (x86_64), in lpm_perf_autotest your implementation takes about:
LPM LookupX4: 29.5 cycles (fails = 12.5%)

Replacing this code with a simple loop over rte_lpm_lookup():

    uint32_t nh;
    int i, ret;

    for (i = 0; i < 4; i++) {
            ret = rte_lpm_lookup((struct rte_lpm *)lpm,
                            ((rte_xmm_t)ip).u32[i], &nh);
            hop[i] = (ret == 0) ? nh : defv;
    }

works faster:
LPM LookupX4: 22.2 cycles (fails = 12.5%)

I'm wondering whether this will also be faster on your board (I assume it is a
RISC-V arch)?

Thanks!

On 10/05/2022 12:58, Stanislaw Kardach wrote:
> From: Michal Mazurek <maz@semihalf.com>
>
> Add an implementation of the rte_lpm_lookupx4() function for platforms
> without support for vector operations.
>
> This will be useful in the upcoming RISC-V port as well as any platform
> which may want to start with a basic level of LPM support.
>
> Signed-off-by: Michal Mazurek <maz@semihalf.com>
> Signed-off-by: Stanislaw Kardach <kda@semihalf.com>
> ---
>  doc/guides/rel_notes/release_22_07.rst |   5 +
>  lib/lpm/meson.build                    |   1 +
>  lib/lpm/rte_lpm.h                      |   4 +-
>  lib/lpm/rte_lpm_scalar.h               | 122 +++++++++++++++++++++++++
>  4 files changed, 131 insertions(+), 1 deletion(-)
>  create mode 100644 lib/lpm/rte_lpm_scalar.h
<snip>
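For illustration, here is a minimal sketch of what the scalar rte_lpm_lookupx4()
looks like when collapsed onto the loop suggested above. It simply wraps the
snippet from the mail in the lookupx4 signature from the patch, and it assumes
rte_lpm_lookup() still takes a non-const lpm pointer (the const question raised
later in the thread), hence the cast:

    static inline void
    rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
            uint32_t defv)
    {
        uint32_t nh;
        int i, ret;

        for (i = 0; i < 4; i++) {
            /* rte_lpm_lookup() currently takes a non-const lpm pointer,
             * so the const qualifier has to be cast away here. */
            ret = rte_lpm_lookup((struct rte_lpm *)lpm,
                    ((rte_xmm_t)ip).u32[i], &nh);
            /* keep the hit, or fall back to the caller's default hop */
            hop[i] = (ret == 0) ? nh : defv;
        }
    }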
On Thu, May 19, 2022 at 7:04 PM Medvedkin, Vladimir
<vladimir.medvedkin@intel.com> wrote:
>
> Hi Stanislaw, Michal,
>
> As far as I can see, this implementation almost completely repeats the other
> lookupx4() implementations, except for the use of vector instructions.
>
> On my board (x86_64), in lpm_perf_autotest your implementation takes about:
> LPM LookupX4: 29.5 cycles (fails = 12.5%)
>
> Replacing this code with a simple loop over rte_lpm_lookup():
>
>     uint32_t nh;
>     int i, ret;
>
>     for (i = 0; i < 4; i++) {
>             ret = rte_lpm_lookup((struct rte_lpm *)lpm,
>                             ((rte_xmm_t)ip).u32[i], &nh);
>             hop[i] = (ret == 0) ? nh : defv;
>     }
>
> works faster:
> LPM LookupX4: 22.2 cycles (fails = 12.5%)
>
> I'm wondering whether this will also be faster on your board (I assume it is
> a RISC-V arch)?

Hi Vladimir,

On my HiFive Unmatched RISC-V board there is only a marginal difference
(~ -1.56%):

Our version:            210.5 cycles (fails = 12.5%)
rte_lpm_lookup version: 213.8 cycles (fails = 12.5%)

Given that x86 is faster with rte_lpm_lookup, I'll change to this
implementation in the next version.

That said, I wonder why we have different const requirements for
rte_lpm_lookup() and rte_lpm_lookupx4():

    static inline int rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip,
            uint32_t *next_hop)
    static inline void rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip,
            uint32_t hop[4], uint32_t defv);

I think both should be const.

> Thanks!
>
> On 10/05/2022 12:58, Stanislaw Kardach wrote:
> > From: Michal Mazurek <maz@semihalf.com>
<snip>
>
> --
> Regards,
> Vladimir
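The cycle figures quoted in this exchange come from lpm_perf_autotest. For
readers who want to reproduce a comparable number outside the test suite, here
is a rough sketch of that kind of measurement; the function name and batching
are illustrative (not the actual test code), and it assumes an already created
and populated LPM table plus host-order IPv4 addresses:

    #include <stdint.h>
    #include <rte_cycles.h>
    #include <rte_lpm.h>

    /* Average TSC cycles per single lookup over a batch of addresses,
     * issuing them four at a time through rte_lpm_lookupx4(). */
    static double
    measure_lookupx4_cycles(const struct rte_lpm *lpm, const uint32_t *ips,
            unsigned int n)
    {
        uint32_t hop[4];
        rte_xmm_t ipx;
        uint64_t begin, total;
        unsigned int i;

        begin = rte_rdtsc();
        for (i = 0; i + 4 <= n; i += 4) {
            ipx.u32[0] = ips[i];
            ipx.u32[1] = ips[i + 1];
            ipx.u32[2] = ips[i + 2];
            ipx.u32[3] = ips[i + 3];
            rte_lpm_lookupx4(lpm, ipx.x, hop, UINT32_MAX);
        }
        total = rte_rdtsc() - begin;

        if (i == 0)
            return 0.0;
        return (double)total / (double)i;
    }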
On Tue, May 24, 2022 at 6:28 PM Stanisław Kardach <kda@semihalf.com> wrote:
<snip>
> That said, I wonder why we have different const requirements for
> rte_lpm_lookup() and rte_lpm_lookupx4():
>
>     static inline int rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip,
>             uint32_t *next_hop)
>     static inline void rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip,
>             uint32_t hop[4], uint32_t defv);
>
> I think both should be const.
>
To re-iterate the question: should I also post a patch adding "const" to the
"struct rte_lpm *lpm" argument of rte_lpm_lookup()? rte_lpm_lookup_bulk_func()
and rte_lpm_lookupx4() already take lpm as const.

I'm pushing for this because otherwise I get a discarded-const warning in the
scalar version of rte_lpm_lookupx4() that uses rte_lpm_lookup().

Best Regards,
Stanislaw Kardach
Hi Stanislaw,

On 27/05/2022 12:16, Stanisław Kardach wrote:
> On Tue, May 24, 2022 at 6:28 PM Stanisław Kardach <kda@semihalf.com> wrote:
> <snip>
>> That said, I wonder why we have different const requirements for
>> rte_lpm_lookup() and rte_lpm_lookupx4():
>>
>>     static inline int rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip,
>>             uint32_t *next_hop)
>>     static inline void rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip,
>>             uint32_t hop[4], uint32_t defv);
>>
>> I think both should be const.
>>
> To re-iterate the question: should I also post a patch adding "const" to the
> "struct rte_lpm *lpm" argument of rte_lpm_lookup()? rte_lpm_lookup_bulk_func()
> and rte_lpm_lookupx4() already take lpm as const.
>
> I'm pushing for this because otherwise I get a discarded-const warning in the
> scalar version of rte_lpm_lookupx4() that uses rte_lpm_lookup().

Since these are inline functions, there will be no problems with the ABI/API,
so please add const to the *lpm argument.

Thanks!

> Best Regards,
> Stanislaw Kardach
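For reference, the agreed follow-up amounts to constifying the lpm argument of
rte_lpm_lookup() so that it matches rte_lpm_lookupx4() and
rte_lpm_lookup_bulk_func(). A sketch of the resulting prototypes, as an
illustration of the discussion above rather than the actual follow-up patch:

    /* current prototype, as quoted in the thread */
    static inline int
    rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop);

    /* after adding const, matching the other lookup variants */
    static inline int
    rte_lpm_lookup(const struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop);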
diff --git a/doc/guides/rel_notes/release_22_07.rst b/doc/guides/rel_notes/release_22_07.rst
index 4ae91dd94d..73e8d632f2 100644
--- a/doc/guides/rel_notes/release_22_07.rst
+++ b/doc/guides/rel_notes/release_22_07.rst
@@ -70,6 +70,11 @@ New Features
   * Added AH mode support in lookaside protocol (IPsec) for CN9K & CN10K.
   * Added AES-GMAC support in lookaside protocol (IPsec) for CN9K & CN10K.
 
+* **Added scalar version of the LPM library.**
+
+  * Added scalar implementation of ``rte_lpm_lookupx4``. This is a fall-back
+    implementation for platforms that don't support vector operations.
+
 
 Removed Items
 -------------
diff --git a/lib/lpm/meson.build b/lib/lpm/meson.build
index 78d91d3421..6b47361fce 100644
--- a/lib/lpm/meson.build
+++ b/lib/lpm/meson.build
@@ -14,6 +14,7 @@ headers = files('rte_lpm.h', 'rte_lpm6.h')
 indirect_headers += files(
         'rte_lpm_altivec.h',
         'rte_lpm_neon.h',
+        'rte_lpm_scalar.h',
         'rte_lpm_sse.h',
         'rte_lpm_sve.h',
 )
diff --git a/lib/lpm/rte_lpm.h b/lib/lpm/rte_lpm.h
index eb91960e81..b5db6a353a 100644
--- a/lib/lpm/rte_lpm.h
+++ b/lib/lpm/rte_lpm.h
@@ -405,8 +405,10 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 #endif
 #elif defined(RTE_ARCH_PPC_64)
 #include "rte_lpm_altivec.h"
-#else
+#elif defined(RTE_ARCH_X86)
 #include "rte_lpm_sse.h"
+#else
+#include "rte_lpm_scalar.h"
 #endif
 
 #ifdef __cplusplus
diff --git a/lib/lpm/rte_lpm_scalar.h b/lib/lpm/rte_lpm_scalar.h
new file mode 100644
index 0000000000..991b94e687
--- /dev/null
+++ b/lib/lpm/rte_lpm_scalar.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 StarFive
+ * Copyright(c) 2022 SiFive
+ * Copyright(c) 2022 Semihalf
+ */
+
+#ifndef _RTE_LPM_SCALAR_H_
+#define _RTE_LPM_SCALAR_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+        uint32_t defv)
+{
+    rte_xmm_t i24;
+    rte_xmm_t i8;
+    uint32_t tbl[4];
+    uint64_t pt, pt2;
+    const uint32_t *ptbl;
+
+    const rte_xmm_t mask8 = {
+        .u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX}};
+
+    /*
+     * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
+     * as one 64-bit value (0x0300000003000000).
+     */
+    const uint64_t mask_xv =
+        ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
+        (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
+
+    /*
+     * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
+     * as one 64-bit value (0x0100000001000000).
+     */
+    const uint64_t mask_v =
+        ((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
+        (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
+
+    /* get 4 indexes for tbl24[]. */
+    i24.x = ip;
+    i24.u32[0] >>= CHAR_BIT;
+    i24.u32[1] >>= CHAR_BIT;
+    i24.u32[2] >>= CHAR_BIT;
+    i24.u32[3] >>= CHAR_BIT;
+
+    /* extract values from tbl24[] */
+    ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[0]];
+    tbl[0] = *ptbl;
+    ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[1]];
+    tbl[1] = *ptbl;
+    ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[2]];
+    tbl[2] = *ptbl;
+    ptbl = (const uint32_t *)&lpm->tbl24[i24.u32[3]];
+    tbl[3] = *ptbl;
+
+    /* get 4 indexes for tbl8[]. */
+    i8.x = ip;
+    i8.u64[0] &= mask8.u64[0];
+    i8.u64[1] &= mask8.u64[1];
+
+    pt = (uint64_t)tbl[0] |
+        (uint64_t)tbl[1] << 32;
+    pt2 = (uint64_t)tbl[2] |
+        (uint64_t)tbl[3] << 32;
+
+    /* search successfully finished for all 4 IP addresses. */
+    if (likely((pt & mask_xv) == mask_v) &&
+            likely((pt2 & mask_xv) == mask_v)) {
+        *(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
+        *(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
+        return;
+    }
+
+    if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+            RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+        i8.u32[0] = i8.u32[0] +
+            (tbl[0] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+        ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
+        tbl[0] = *ptbl;
+    }
+    if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+            RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+        i8.u32[1] = i8.u32[1] +
+            (tbl[1] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+        ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
+        tbl[1] = *ptbl;
+    }
+    if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+            RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+        i8.u32[2] = i8.u32[2] +
+            (tbl[2] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+        ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
+        tbl[2] = *ptbl;
+    }
+    if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+            RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+        i8.u32[3] = i8.u32[3] +
+            (tbl[3] & 0x00FFFFFF) * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+        ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
+        tbl[3] = *ptbl;
+    }
+
+    hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF : defv;
+    hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF : defv;
+    hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF : defv;
+    hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_SCALAR_H_ */
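For completeness, below is a minimal usage sketch showing how an application
would exercise the new scalar rte_lpm_lookupx4() path. This is an illustration
only: it assumes rte_eal_init() has already been called, the route and address
values are made up, and error handling is kept to a bare minimum.

    #include <stdint.h>
    #include <rte_common.h>
    #include <rte_lpm.h>

    static void
    lookupx4_example(void)
    {
        struct rte_lpm_config config = {
            .max_rules = 1024,
            .number_tbl8s = 256,
            .flags = 0,
        };
        struct rte_lpm *lpm;
        rte_xmm_t ipx;
        uint32_t hop[4];

        lpm = rte_lpm_create("example_lpm", SOCKET_ID_ANY, &config);
        if (lpm == NULL)
            return;

        /* 192.168.0.0/16 -> next hop 7 (host byte order) */
        rte_lpm_add(lpm, 0xC0A80000, 16, 7);

        /* four addresses resolved in one call; misses get the default 99 */
        ipx.u32[0] = 0xC0A80001;    /* 192.168.0.1   -> 7         */
        ipx.u32[1] = 0xC0A8FF01;    /* 192.168.255.1 -> 7         */
        ipx.u32[2] = 0x0A000001;    /* 10.0.0.1      -> 99 (miss) */
        ipx.u32[3] = 0x08080808;    /* 8.8.8.8       -> 99 (miss) */
        rte_lpm_lookupx4(lpm, ipx.x, hop, 99);

        rte_lpm_free(lpm);
    }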