[v7,1/4] hash: pack the hitmask for hash in bulk lookup

Message ID 20240312154215.802374-2-yoan.picchi@arm.com (mailing list archive)
State Superseded
Delegated to: Thomas Monjalon
Headers
Series hash: add SVE support for bulk key lookup |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Yoan Picchi March 12, 2024, 3:42 p.m. UTC
  Current hitmask includes padding due to Intel's SIMD
implementation detail. This patch allows non Intel SIMD
implementations to benefit from a dense hitmask.
In addition, the new dense hitmask interweave the primary
and secondary matches which allow a better cache usage and
enable future improvements for the SIMD implementations

Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Nathan Brown <nathan.brown@arm.com>
---
 .mailmap                                  |   2 +
 lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
 lib/hash/arch/common/compare_signatures.h |  38 +++++
 lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
 lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
 5 files changed, 255 insertions(+), 91 deletions(-)
 create mode 100644 lib/hash/arch/arm/compare_signatures.h
 create mode 100644 lib/hash/arch/common/compare_signatures.h
 create mode 100644 lib/hash/arch/x86/compare_signatures.h
  

Comments

Konstantin Ananyev March 19, 2024, 10:41 a.m. UTC | #1
Hi,

> Current hitmask includes padding due to Intel's SIMD
> implementation detail. This patch allows non Intel SIMD
> implementations to benefit from a dense hitmask.
> In addition, the new dense hitmask interweave the primary
> and secondary matches which allow a better cache usage and
> enable future improvements for the SIMD implementations
> 
> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> ---
>  .mailmap                                  |   2 +
>  lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
>  lib/hash/arch/common/compare_signatures.h |  38 +++++
>  lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>  lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
>  5 files changed, 255 insertions(+), 91 deletions(-)
>  create mode 100644 lib/hash/arch/arm/compare_signatures.h
>  create mode 100644 lib/hash/arch/common/compare_signatures.h
>  create mode 100644 lib/hash/arch/x86/compare_signatures.h
> 
> diff --git a/.mailmap b/.mailmap
> index 66ebc20666..00b50414d3 100644
> --- a/.mailmap
> +++ b/.mailmap
> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
>  Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
>  Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
>  Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
> +Harjot Singh <harjot.singh@arm.com>
>  Harman Kalra <hkalra@marvell.com>
>  Harneet Singh <harneet.singh@intel.com>
>  Harold Huang <baymaxhuang@gmail.com>
> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
>  Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>  Yi Zhang <zhang.yi75@zte.com.cn>
>  Yoann Desmouceaux <ydesmouc@cisco.com>
> +Yoan Picchi <yoan.picchi@arm.com>
>  Yogesh Jangra <yogesh.jangra@intel.com>
>  Yogev Chaimovich <yogev@cgstowernetworks.com>
>  Yongjie Gu <yongjiex.gu@intel.com>
> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> new file mode 100644
> index 0000000000..1af6ba8190
> --- /dev/null
> +++ b/lib/hash/arch/arm/compare_signatures.h
> @@ -0,0 +1,61 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * Arm's version uses a densely packed hitmask buffer:
> + * Every bit is in use.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> +
> +	/* For match mask every bits indicates the match */
> +	switch (sig_cmp_fn) {
> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> +	case RTE_HASH_COMPARE_NEON: {
> +		uint16x8_t vmat, vsig, x;
> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> +		uint16_t low, high;
> +
> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
> +		/* Compare all signatures in the primary bucket */
> +		vmat = vceqq_u16(vsig,
> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +		low = (uint16_t)(vaddvq_u16(x));
> +		/* Compare all signatures in the secondary bucket */
> +		vmat = vceqq_u16(vsig,
> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> +		high = (uint16_t)(vaddvq_u16(x));
> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> +
> +		}
> +		break;
> +#endif
> +	default:
> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +			*hitmask_buffer |=
> +				((sig == prim_bucket_sigs[i]) << i);
> +			*hitmask_buffer |=
> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +		}
> +	}
> +}
> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
> new file mode 100644
> index 0000000000..dcf9444032
> --- /dev/null
> +++ b/lib/hash/arch/common/compare_signatures.h
> @@ -0,0 +1,38 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * The generic version could use either a dense or sparsely packed hitmask buffer,
> + * but the dense one is slightly faster.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 1
> +
> +static inline void
> +compare_signatures_dense(uint16_t *hitmask_buffer,
> +			const uint16_t *prim_bucket_sigs,
> +			const uint16_t *sec_bucket_sigs,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +	(void) sig_cmp_fn;
> +
> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> +
> +	/* For match mask every bits indicates the match */
> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +		*hitmask_buffer |=
> +			((sig == prim_bucket_sigs[i]) << i);
> +		*hitmask_buffer |=
> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> +	}
> +
> +}

Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
One question I have - does it mean that now for x86 we always use 'sparse' while for all other
ARM and non-ARM platforms we switch to 'dense'?

> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
> new file mode 100644
> index 0000000000..7eec499e1f
> --- /dev/null
> +++ b/lib/hash/arch/x86/compare_signatures.h
> @@ -0,0 +1,53 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + * Copyright(c) 2018-2024 Arm Limited
> + */
> +
> +/*
> + * x86's version uses a sparsely packed hitmask buffer:
> + * Every other bit is padding.
> + */
> +
> +#include <inttypes.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +#include "rte_cuckoo_hash.h"
> +
> +#define DENSE_HASH_BULK_LOOKUP 0
> +
> +static inline void
> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> +			const struct rte_hash_bucket *prim_bkt,
> +			const struct rte_hash_bucket *sec_bkt,
> +			uint16_t sig,
> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> +{
> +	/* For match mask the first bit of every two bits indicates the match */
> +	switch (sig_cmp_fn) {
> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
> +	case RTE_HASH_COMPARE_SSE:
> +		/* Compare all signatures in the bucket */
> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> +				_mm_load_si128(
> +					(__m128i const *)prim_bkt->sig_current),
> +				_mm_set1_epi16(sig)));
> +		/* Extract the even-index bits only */
> +		*prim_hash_matches &= 0x5555;
> +		/* Compare all signatures in the bucket */
> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> +				_mm_load_si128(
> +					(__m128i const *)sec_bkt->sig_current),
> +				_mm_set1_epi16(sig)));
> +		/* Extract the even-index bits only */
> +		*sec_hash_matches &= 0x5555;
> +		break;
> +#endif /* defined(__SSE2__) */
> +	default:
> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> +			*prim_hash_matches |=
> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
> +			*sec_hash_matches |=
> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
> +		}
> +	}
> +}
  
Yoan Picchi March 19, 2024, 1:09 p.m. UTC | #2
On 3/19/24 10:41, Konstantin Ananyev wrote:
> 
> Hi,
> 
>> Current hitmask includes padding due to Intel's SIMD
>> implementation detail. This patch allows non Intel SIMD
>> implementations to benefit from a dense hitmask.
>> In addition, the new dense hitmask interweave the primary
>> and secondary matches which allow a better cache usage and
>> enable future improvements for the SIMD implementations
>>
>> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
>> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
>> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
>> ---
>>   .mailmap                                  |   2 +
>>   lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
>>   lib/hash/arch/common/compare_signatures.h |  38 +++++
>>   lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
>>   lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
>>   5 files changed, 255 insertions(+), 91 deletions(-)
>>   create mode 100644 lib/hash/arch/arm/compare_signatures.h
>>   create mode 100644 lib/hash/arch/common/compare_signatures.h
>>   create mode 100644 lib/hash/arch/x86/compare_signatures.h
>>
>> diff --git a/.mailmap b/.mailmap
>> index 66ebc20666..00b50414d3 100644
>> --- a/.mailmap
>> +++ b/.mailmap
>> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
>>   Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
>>   Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
>>   Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
>> +Harjot Singh <harjot.singh@arm.com>
>>   Harman Kalra <hkalra@marvell.com>
>>   Harneet Singh <harneet.singh@intel.com>
>>   Harold Huang <baymaxhuang@gmail.com>
>> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
>>   Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
>>   Yi Zhang <zhang.yi75@zte.com.cn>
>>   Yoann Desmouceaux <ydesmouc@cisco.com>
>> +Yoan Picchi <yoan.picchi@arm.com>
>>   Yogesh Jangra <yogesh.jangra@intel.com>
>>   Yogev Chaimovich <yogev@cgstowernetworks.com>
>>   Yongjie Gu <yongjiex.gu@intel.com>
>> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
>> new file mode 100644
>> index 0000000000..1af6ba8190
>> --- /dev/null
>> +++ b/lib/hash/arch/arm/compare_signatures.h
>> @@ -0,0 +1,61 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * Arm's version uses a densely packed hitmask buffer:
>> + * Every bit is in use.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> +	/* For match mask every bits indicates the match */
>> +	switch (sig_cmp_fn) {
>> +#if RTE_HASH_BUCKET_ENTRIES <= 8
>> +	case RTE_HASH_COMPARE_NEON: {
>> +		uint16x8_t vmat, vsig, x;
>> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
>> +		uint16_t low, high;
>> +
>> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
>> +		/* Compare all signatures in the primary bucket */
>> +		vmat = vceqq_u16(vsig,
>> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
>> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +		low = (uint16_t)(vaddvq_u16(x));
>> +		/* Compare all signatures in the secondary bucket */
>> +		vmat = vceqq_u16(vsig,
>> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
>> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
>> +		high = (uint16_t)(vaddvq_u16(x));
>> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
>> +
>> +		}
>> +		break;
>> +#endif
>> +	default:
>> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +			*hitmask_buffer |=
>> +				((sig == prim_bucket_sigs[i]) << i);
>> +			*hitmask_buffer |=
>> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +		}
>> +	}
>> +}
>> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
>> new file mode 100644
>> index 0000000000..dcf9444032
>> --- /dev/null
>> +++ b/lib/hash/arch/common/compare_signatures.h
>> @@ -0,0 +1,38 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * The generic version could use either a dense or sparsely packed hitmask buffer,
>> + * but the dense one is slightly faster.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 1
>> +
>> +static inline void
>> +compare_signatures_dense(uint16_t *hitmask_buffer,
>> +			const uint16_t *prim_bucket_sigs,
>> +			const uint16_t *sec_bucket_sigs,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +	(void) sig_cmp_fn;
>> +
>> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
>> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
>> +
>> +	/* For match mask every bits indicates the match */
>> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +		*hitmask_buffer |=
>> +			((sig == prim_bucket_sigs[i]) << i);
>> +		*hitmask_buffer |=
>> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
>> +	}
>> +
>> +}
> 
> Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
> One question I have - does it mean that now for x86 we always use 'sparse' while for all other
> ARM and non-ARM platforms we switch to 'dense'?

Yes it does. x86 support only the sparse method (the legacy one). Arm 
and generic code could support both dense and sparse. The reason I made 
them use the dense method is because it was slightly faster in my tests. 
(no need to add padding and shifts amongst other benefit.)

> 
>> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
>> new file mode 100644
>> index 0000000000..7eec499e1f
>> --- /dev/null
>> +++ b/lib/hash/arch/x86/compare_signatures.h
>> @@ -0,0 +1,53 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2016 Intel Corporation
>> + * Copyright(c) 2018-2024 Arm Limited
>> + */
>> +
>> +/*
>> + * x86's version uses a sparsely packed hitmask buffer:
>> + * Every other bit is padding.
>> + */
>> +
>> +#include <inttypes.h>
>> +#include <rte_common.h>
>> +#include <rte_vect.h>
>> +#include "rte_cuckoo_hash.h"
>> +
>> +#define DENSE_HASH_BULK_LOOKUP 0
>> +
>> +static inline void
>> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
>> +			const struct rte_hash_bucket *prim_bkt,
>> +			const struct rte_hash_bucket *sec_bkt,
>> +			uint16_t sig,
>> +			enum rte_hash_sig_compare_function sig_cmp_fn)
>> +{
>> +	/* For match mask the first bit of every two bits indicates the match */
>> +	switch (sig_cmp_fn) {
>> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
>> +	case RTE_HASH_COMPARE_SSE:
>> +		/* Compare all signatures in the bucket */
>> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> +				_mm_load_si128(
>> +					(__m128i const *)prim_bkt->sig_current),
>> +				_mm_set1_epi16(sig)));
>> +		/* Extract the even-index bits only */
>> +		*prim_hash_matches &= 0x5555;
>> +		/* Compare all signatures in the bucket */
>> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
>> +				_mm_load_si128(
>> +					(__m128i const *)sec_bkt->sig_current),
>> +				_mm_set1_epi16(sig)));
>> +		/* Extract the even-index bits only */
>> +		*sec_hash_matches &= 0x5555;
>> +		break;
>> +#endif /* defined(__SSE2__) */
>> +	default:
>> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
>> +			*prim_hash_matches |=
>> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
>> +			*sec_hash_matches |=
>> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
>> +		}
>> +	}
>> +}
  
Konstantin Ananyev March 19, 2024, 1:25 p.m. UTC | #3
> >
> > Hi,
> >
> >> Current hitmask includes padding due to Intel's SIMD
> >> implementation detail. This patch allows non Intel SIMD
> >> implementations to benefit from a dense hitmask.
> >> In addition, the new dense hitmask interweave the primary
> >> and secondary matches which allow a better cache usage and
> >> enable future improvements for the SIMD implementations
> >>
> >> Signed-off-by: Yoan Picchi <yoan.picchi@arm.com>
> >> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> >> Reviewed-by: Nathan Brown <nathan.brown@arm.com>
> >> ---
> >>   .mailmap                                  |   2 +
> >>   lib/hash/arch/arm/compare_signatures.h    |  61 +++++++
> >>   lib/hash/arch/common/compare_signatures.h |  38 +++++
> >>   lib/hash/arch/x86/compare_signatures.h    |  53 ++++++
> >>   lib/hash/rte_cuckoo_hash.c                | 192 ++++++++++++----------
> >>   5 files changed, 255 insertions(+), 91 deletions(-)
> >>   create mode 100644 lib/hash/arch/arm/compare_signatures.h
> >>   create mode 100644 lib/hash/arch/common/compare_signatures.h
> >>   create mode 100644 lib/hash/arch/x86/compare_signatures.h
> >>
> >> diff --git a/.mailmap b/.mailmap
> >> index 66ebc20666..00b50414d3 100644
> >> --- a/.mailmap
> >> +++ b/.mailmap
> >> @@ -494,6 +494,7 @@ Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
> >>   Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
> >>   Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
> >>   Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
> >> +Harjot Singh <harjot.singh@arm.com>
> >>   Harman Kalra <hkalra@marvell.com>
> >>   Harneet Singh <harneet.singh@intel.com>
> >>   Harold Huang <baymaxhuang@gmail.com>
> >> @@ -1633,6 +1634,7 @@ Yixue Wang <yixue.wang@intel.com>
> >>   Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
> >>   Yi Zhang <zhang.yi75@zte.com.cn>
> >>   Yoann Desmouceaux <ydesmouc@cisco.com>
> >> +Yoan Picchi <yoan.picchi@arm.com>
> >>   Yogesh Jangra <yogesh.jangra@intel.com>
> >>   Yogev Chaimovich <yogev@cgstowernetworks.com>
> >>   Yongjie Gu <yongjiex.gu@intel.com>
> >> diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..1af6ba8190
> >> --- /dev/null
> >> +++ b/lib/hash/arch/arm/compare_signatures.h
> >> @@ -0,0 +1,61 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * Arm's version uses a densely packed hitmask buffer:
> >> + * Every bit is in use.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 1
> >> +
> >> +static inline void
> >> +compare_signatures_dense(uint16_t *hitmask_buffer,
> >> +			const uint16_t *prim_bucket_sigs,
> >> +			const uint16_t *sec_bucket_sigs,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +
> >> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> >> +
> >> +	/* For match mask every bits indicates the match */
> >> +	switch (sig_cmp_fn) {
> >> +#if RTE_HASH_BUCKET_ENTRIES <= 8
> >> +	case RTE_HASH_COMPARE_NEON: {
> >> +		uint16x8_t vmat, vsig, x;
> >> +		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
> >> +		uint16_t low, high;
> >> +
> >> +		vsig = vld1q_dup_u16((uint16_t const *)&sig);
> >> +		/* Compare all signatures in the primary bucket */
> >> +		vmat = vceqq_u16(vsig,
> >> +			vld1q_u16((uint16_t const *)prim_bucket_sigs));
> >> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> >> +		low = (uint16_t)(vaddvq_u16(x));
> >> +		/* Compare all signatures in the secondary bucket */
> >> +		vmat = vceqq_u16(vsig,
> >> +			vld1q_u16((uint16_t const *)sec_bucket_sigs));
> >> +		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
> >> +		high = (uint16_t)(vaddvq_u16(x));
> >> +		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
> >> +
> >> +		}
> >> +		break;
> >> +#endif
> >> +	default:
> >> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +			*hitmask_buffer |=
> >> +				((sig == prim_bucket_sigs[i]) << i);
> >> +			*hitmask_buffer |=
> >> +				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >> +		}
> >> +	}
> >> +}
> >> diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..dcf9444032
> >> --- /dev/null
> >> +++ b/lib/hash/arch/common/compare_signatures.h
> >> @@ -0,0 +1,38 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * The generic version could use either a dense or sparsely packed hitmask buffer,
> >> + * but the dense one is slightly faster.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 1
> >> +
> >> +static inline void
> >> +compare_signatures_dense(uint16_t *hitmask_buffer,
> >> +			const uint16_t *prim_bucket_sigs,
> >> +			const uint16_t *sec_bucket_sigs,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +	(void) sig_cmp_fn;
> >> +
> >> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
> >> +	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
> >> +
> >> +	/* For match mask every bits indicates the match */
> >> +	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +		*hitmask_buffer |=
> >> +			((sig == prim_bucket_sigs[i]) << i);
> >> +		*hitmask_buffer |=
> >> +			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
> >> +	}
> >> +
> >> +}
> >
> > Thanks for re-factoring compare_signatures_...() code, it looks much cleaner that way.
> > One question I have - does it mean that now for x86 we always use 'sparse' while for all other
> > ARM and non-ARM platforms we switch to 'dense'?
> 
> Yes it does. x86 support only the sparse method (the legacy one). Arm
> and generic code could support both dense and sparse. The reason I made
> them use the dense method is because it was slightly faster in my tests.

Ok, but before that, a 'generic' one (non-x86 and non-ARM) used 'sparse' one, correct?
If so, then probably need to outline it a bit more in patch comments and might be even release notes.
At least that would be my expectations, probably hash lib maintainers need to say what is the best way here.
The code refactoring itself - LGTM.

> (no need to add padding and shifts amongst other benefit.)
> 
> >
> >> diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
> >> new file mode 100644
> >> index 0000000000..7eec499e1f
> >> --- /dev/null
> >> +++ b/lib/hash/arch/x86/compare_signatures.h
> >> @@ -0,0 +1,53 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2016 Intel Corporation
> >> + * Copyright(c) 2018-2024 Arm Limited
> >> + */
> >> +
> >> +/*
> >> + * x86's version uses a sparsely packed hitmask buffer:
> >> + * Every other bit is padding.
> >> + */
> >> +
> >> +#include <inttypes.h>
> >> +#include <rte_common.h>
> >> +#include <rte_vect.h>
> >> +#include "rte_cuckoo_hash.h"
> >> +
> >> +#define DENSE_HASH_BULK_LOOKUP 0
> >> +
> >> +static inline void
> >> +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
> >> +			const struct rte_hash_bucket *prim_bkt,
> >> +			const struct rte_hash_bucket *sec_bkt,
> >> +			uint16_t sig,
> >> +			enum rte_hash_sig_compare_function sig_cmp_fn)
> >> +{
> >> +	/* For match mask the first bit of every two bits indicates the match */
> >> +	switch (sig_cmp_fn) {
> >> +#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
> >> +	case RTE_HASH_COMPARE_SSE:
> >> +		/* Compare all signatures in the bucket */
> >> +		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> >> +				_mm_load_si128(
> >> +					(__m128i const *)prim_bkt->sig_current),
> >> +				_mm_set1_epi16(sig)));
> >> +		/* Extract the even-index bits only */
> >> +		*prim_hash_matches &= 0x5555;
> >> +		/* Compare all signatures in the bucket */
> >> +		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
> >> +				_mm_load_si128(
> >> +					(__m128i const *)sec_bkt->sig_current),
> >> +				_mm_set1_epi16(sig)));
> >> +		/* Extract the even-index bits only */
> >> +		*sec_hash_matches &= 0x5555;
> >> +		break;
> >> +#endif /* defined(__SSE2__) */
> >> +	default:
> >> +		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
> >> +			*prim_hash_matches |=
> >> +				((sig == prim_bkt->sig_current[i]) << (i << 1));
> >> +			*sec_hash_matches |=
> >> +				((sig == sec_bkt->sig_current[i]) << (i << 1));
> >> +		}
> >> +	}
> >> +}
  
Stephen Hemminger March 19, 2024, 4:09 p.m. UTC | #4
On Tue, 12 Mar 2024 15:42:12 +0000
Yoan Picchi <yoan.picchi@arm.com> wrote:

> +	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),

Space around math operations please.
  

Patch

diff --git a/.mailmap b/.mailmap
index 66ebc20666..00b50414d3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -494,6 +494,7 @@  Hari Kumar Vemula <hari.kumarx.vemula@intel.com>
 Harini Ramakrishnan <harini.ramakrishnan@microsoft.com>
 Hariprasad Govindharajan <hariprasad.govindharajan@intel.com>
 Harish Patil <harish.patil@cavium.com> <harish.patil@qlogic.com>
+Harjot Singh <harjot.singh@arm.com>
 Harman Kalra <hkalra@marvell.com>
 Harneet Singh <harneet.singh@intel.com>
 Harold Huang <baymaxhuang@gmail.com>
@@ -1633,6 +1634,7 @@  Yixue Wang <yixue.wang@intel.com>
 Yi Yang <yangyi01@inspur.com> <yi.y.yang@intel.com>
 Yi Zhang <zhang.yi75@zte.com.cn>
 Yoann Desmouceaux <ydesmouc@cisco.com>
+Yoan Picchi <yoan.picchi@arm.com>
 Yogesh Jangra <yogesh.jangra@intel.com>
 Yogev Chaimovich <yogev@cgstowernetworks.com>
 Yongjie Gu <yongjiex.gu@intel.com>
diff --git a/lib/hash/arch/arm/compare_signatures.h b/lib/hash/arch/arm/compare_signatures.h
new file mode 100644
index 0000000000..1af6ba8190
--- /dev/null
+++ b/lib/hash/arch/arm/compare_signatures.h
@@ -0,0 +1,61 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * Arm's version uses a densely packed hitmask buffer:
+ * Every bit is in use.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bits indicates the match */
+	switch (sig_cmp_fn) {
+#if RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_NEON: {
+		uint16x8_t vmat, vsig, x;
+		int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+		uint16_t low, high;
+
+		vsig = vld1q_dup_u16((uint16_t const *)&sig);
+		/* Compare all signatures in the primary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)prim_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		low = (uint16_t)(vaddvq_u16(x));
+		/* Compare all signatures in the secondary bucket */
+		vmat = vceqq_u16(vsig,
+			vld1q_u16((uint16_t const *)sec_bucket_sigs));
+		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift);
+		high = (uint16_t)(vaddvq_u16(x));
+		*hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES;
+
+		}
+		break;
+#endif
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*hitmask_buffer |=
+				((sig == prim_bucket_sigs[i]) << i);
+			*hitmask_buffer |=
+				((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+		}
+	}
+}
diff --git a/lib/hash/arch/common/compare_signatures.h b/lib/hash/arch/common/compare_signatures.h
new file mode 100644
index 0000000000..dcf9444032
--- /dev/null
+++ b/lib/hash/arch/common/compare_signatures.h
@@ -0,0 +1,38 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * The generic version could use either a dense or sparsely packed hitmask buffer,
+ * but the dense one is slightly faster.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 1
+
+static inline void
+compare_signatures_dense(uint16_t *hitmask_buffer,
+			const uint16_t *prim_bucket_sigs,
+			const uint16_t *sec_bucket_sigs,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	(void) sig_cmp_fn;
+
+	static_assert(sizeof(*hitmask_buffer) >= 2*(RTE_HASH_BUCKET_ENTRIES/8),
+	"The hitmask must be exactly wide enough to accept the whole hitmask if it is dense");
+
+	/* For match mask every bits indicates the match */
+	for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+		*hitmask_buffer |=
+			((sig == prim_bucket_sigs[i]) << i);
+		*hitmask_buffer |=
+			((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES;
+	}
+
+}
diff --git a/lib/hash/arch/x86/compare_signatures.h b/lib/hash/arch/x86/compare_signatures.h
new file mode 100644
index 0000000000..7eec499e1f
--- /dev/null
+++ b/lib/hash/arch/x86/compare_signatures.h
@@ -0,0 +1,53 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2018-2024 Arm Limited
+ */
+
+/*
+ * x86's version uses a sparsely packed hitmask buffer:
+ * Every other bit is padding.
+ */
+
+#include <inttypes.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+#include "rte_cuckoo_hash.h"
+
+#define DENSE_HASH_BULK_LOOKUP 0
+
+static inline void
+compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
+			const struct rte_hash_bucket *prim_bkt,
+			const struct rte_hash_bucket *sec_bkt,
+			uint16_t sig,
+			enum rte_hash_sig_compare_function sig_cmp_fn)
+{
+	/* For match mask the first bit of every two bits indicates the match */
+	switch (sig_cmp_fn) {
+#if defined(__SSE2__) && RTE_HASH_BUCKET_ENTRIES <= 8
+	case RTE_HASH_COMPARE_SSE:
+		/* Compare all signatures in the bucket */
+		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)prim_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*prim_hash_matches &= 0x5555;
+		/* Compare all signatures in the bucket */
+		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
+				_mm_load_si128(
+					(__m128i const *)sec_bkt->sig_current),
+				_mm_set1_epi16(sig)));
+		/* Extract the even-index bits only */
+		*sec_hash_matches &= 0x5555;
+		break;
+#endif /* defined(__SSE2__) */
+	default:
+		for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
+			*prim_hash_matches |=
+				((sig == prim_bkt->sig_current[i]) << (i << 1));
+			*sec_hash_matches |=
+				((sig == sec_bkt->sig_current[i]) << (i << 1));
+		}
+	}
+}
diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c
index 9cf94645f6..0697743cdf 100644
--- a/lib/hash/rte_cuckoo_hash.c
+++ b/lib/hash/rte_cuckoo_hash.c
@@ -33,6 +33,14 @@  RTE_LOG_REGISTER_DEFAULT(hash_logtype, INFO);
 
 #include "rte_cuckoo_hash.h"
 
+#if defined(__ARM_NEON)
+#include "arch/arm/compare_signatures.h"
+#elif defined(__SSE2__)
+#include "arch/x86/compare_signatures.h"
+#else
+#include "arch/common/compare_signatures.h"
+#endif
+
 /* Mask of all flags supported by this version */
 #define RTE_HASH_EXTRA_FLAGS_MASK (RTE_HASH_EXTRA_FLAGS_TRANS_MEM_SUPPORT | \
 				   RTE_HASH_EXTRA_FLAGS_MULTI_WRITER_ADD | \
@@ -1857,63 +1865,6 @@  rte_hash_free_key_with_position(const struct rte_hash *h,
 
 }
 
-static inline void
-compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches,
-			const struct rte_hash_bucket *prim_bkt,
-			const struct rte_hash_bucket *sec_bkt,
-			uint16_t sig,
-			enum rte_hash_sig_compare_function sig_cmp_fn)
-{
-	unsigned int i;
-
-	/* For match mask the first bit of every two bits indicates the match */
-	switch (sig_cmp_fn) {
-#if defined(__SSE2__)
-	case RTE_HASH_COMPARE_SSE:
-		/* Compare all signatures in the bucket */
-		*prim_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)prim_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*prim_hash_matches &= 0x5555;
-		/* Compare all signatures in the bucket */
-		*sec_hash_matches = _mm_movemask_epi8(_mm_cmpeq_epi16(
-				_mm_load_si128(
-					(__m128i const *)sec_bkt->sig_current),
-				_mm_set1_epi16(sig)));
-		/* Extract the even-index bits only */
-		*sec_hash_matches &= 0x5555;
-		break;
-#elif defined(__ARM_NEON)
-	case RTE_HASH_COMPARE_NEON: {
-		uint16x8_t vmat, vsig, x;
-		int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1};
-
-		vsig = vld1q_dup_u16((uint16_t const *)&sig);
-		/* Compare all signatures in the primary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)prim_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
-		/* Compare all signatures in the secondary bucket */
-		vmat = vceqq_u16(vsig,
-			vld1q_u16((uint16_t const *)sec_bkt->sig_current));
-		x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
-		*sec_hash_matches = (uint32_t)(vaddvq_u16(x));
-		}
-		break;
-#endif
-	default:
-		for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) {
-			*prim_hash_matches |=
-				((sig == prim_bkt->sig_current[i]) << (i << 1));
-			*sec_hash_matches |=
-				((sig == sec_bkt->sig_current[i]) << (i << 1));
-		}
-	}
-}
-
 static inline void
 __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 		const struct rte_hash_bucket **primary_bkt,
@@ -1924,22 +1875,44 @@  __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask when it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	__hash_rw_reader_lock(h);
 
 	/* Compare signatures and prefetch key slot of first hit */
 	for (i = 0; i < num_keys; i++) {
-		compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		compare_signatures_dense(hitmask,
+			primary_bkt[i]->sig_current,
+			secondary_bkt[i]->sig_current,
+			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 			primary_bkt[i], secondary_bkt[i],
 			sig[i], h->sig_cmp_fn);
+		const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-		if (prim_hitmask[i]) {
+		if (prim_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1950,10 +1923,10 @@  __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 			continue;
 		}
 
-		if (sec_hitmask[i]) {
+		if (sec_hitmask) {
 			uint32_t first_hit =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[first_hit];
 			const struct rte_hash_key *key_slot =
@@ -1967,10 +1940,18 @@  __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 	/* Compare keys, first hits in primary first */
 	for (i = 0; i < num_keys; i++) {
 		positions[i] = -ENOENT;
-		while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+		uint16_t *hitmask = &hitmask_buffer[i];
+		unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+		unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+		unsigned int prim_hitmask = prim_hitmask_buffer[i];
+		unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+		while (prim_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(prim_hitmask[i])
-					>> 1;
+					rte_ctz32(prim_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				primary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -1992,13 +1973,13 @@  __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 
-		while (sec_hitmask[i]) {
+		while (sec_hitmask) {
 			uint32_t hit_index =
-					rte_ctz32(sec_hitmask[i])
-					>> 1;
+					rte_ctz32(sec_hitmask)
+					>> hitmask_padding;
 			uint32_t key_idx =
 				secondary_bkt[i]->key_idx[hit_index];
 			const struct rte_hash_key *key_slot =
@@ -2021,7 +2002,7 @@  __bulk_lookup_l(const struct rte_hash *h, const void **keys,
 				positions[i] = key_idx - 1;
 				goto next_key;
 			}
-			sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+			sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 		}
 next_key:
 		continue;
@@ -2071,11 +2052,20 @@  __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 	uint64_t hits = 0;
 	int32_t i;
 	int32_t ret;
-	uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
-	uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0};
 	struct rte_hash_bucket *cur_bkt, *next_bkt;
 	uint32_t cnt_b, cnt_a;
 
+#if DENSE_HASH_BULK_LOOKUP
+	const int hitmask_padding = 0;
+	uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES,
+	"The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense");
+#else
+	const int hitmask_padding = 1;
+	uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+	uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0};
+#endif
+
 	for (i = 0; i < num_keys; i++)
 		positions[i] = -ENOENT;
 
@@ -2089,14 +2079,26 @@  __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare signatures and prefetch key slot of first hit */
 		for (i = 0; i < num_keys; i++) {
-			compare_signatures(&prim_hitmask[i], &sec_hitmask[i],
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			compare_signatures_dense(hitmask,
+				primary_bkt[i]->sig_current,
+				secondary_bkt[i]->sig_current,
+				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i],
 				primary_bkt[i], secondary_bkt[i],
 				sig[i], h->sig_cmp_fn);
+			const unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			const unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
 
-			if (prim_hitmask[i]) {
+			if (prim_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					primary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2107,10 +2109,10 @@  __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 				continue;
 			}
 
-			if (sec_hitmask[i]) {
+			if (sec_hitmask) {
 				uint32_t first_hit =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 					secondary_bkt[i]->key_idx[first_hit];
 				const struct rte_hash_key *key_slot =
@@ -2123,10 +2125,18 @@  __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 
 		/* Compare keys, first hits in primary first */
 		for (i = 0; i < num_keys; i++) {
-			while (prim_hitmask[i]) {
+#if DENSE_HASH_BULK_LOOKUP
+			uint16_t *hitmask = &hitmask_buffer[i];
+			unsigned int prim_hitmask = *(uint8_t *)(hitmask);
+			unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1);
+#else
+			unsigned int prim_hitmask = prim_hitmask_buffer[i];
+			unsigned int sec_hitmask = sec_hitmask_buffer[i];
+#endif
+			while (prim_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(prim_hitmask[i])
-						>> 1;
+						rte_ctz32(prim_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&primary_bkt[i]->key_idx[hit_index],
@@ -2152,13 +2162,13 @@  __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				prim_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				prim_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 
-			while (sec_hitmask[i]) {
+			while (sec_hitmask) {
 				uint32_t hit_index =
-						rte_ctz32(sec_hitmask[i])
-						>> 1;
+						rte_ctz32(sec_hitmask)
+						>> hitmask_padding;
 				uint32_t key_idx =
 				rte_atomic_load_explicit(
 					&secondary_bkt[i]->key_idx[hit_index],
@@ -2185,7 +2195,7 @@  __bulk_lookup_lf(const struct rte_hash *h, const void **keys,
 					positions[i] = key_idx - 1;
 					goto next_key;
 				}
-				sec_hitmask[i] &= ~(3ULL << (hit_index << 1));
+				sec_hitmask &= ~(1 << (hit_index << hitmask_padding));
 			}
 next_key:
 			continue;