From patchwork Mon Jul 8 12:14:08 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yoan Picchi X-Patchwork-Id: 142213 X-Patchwork-Delegate: david.marchand@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 0814A4552F; Mon, 8 Jul 2024 14:14:51 +0200 (CEST) Received: from mails.dpdk.org (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 6343840F35; Mon, 8 Jul 2024 14:14:28 +0200 (CEST) Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by mails.dpdk.org (Postfix) with ESMTP id B880840E41 for ; Mon, 8 Jul 2024 14:14:21 +0200 (CEST) Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 474541655; Mon, 8 Jul 2024 05:14:46 -0700 (PDT) Received: from ampere-altra-2-3.austin.arm.com (ampere-altra-2-3.austin.arm.com [10.118.14.97]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 08FA63F766; Mon, 8 Jul 2024 05:14:21 -0700 (PDT) From: Yoan Picchi To: Yipeng Wang , Sameh Gobriel , Bruce Richardson , Vladimir Medvedkin Cc: dev@dpdk.org, nd@arm.com, Yoan Picchi , Ruifeng Wang , Nathan Brown Subject: [PATCH v12 4/7] hash: pack the hitmask for hash in bulk lookup Date: Mon, 8 Jul 2024 12:14:08 +0000 Message-Id: <20240708121411.885996-5-yoan.picchi@arm.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20240708121411.885996-1-yoan.picchi@arm.com> References: <20231020165159.1649282-1-yoan.picchi@arm.com> <20240708121411.885996-1-yoan.picchi@arm.com> MIME-Version: 1.0 X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Current hitmask includes padding due to Intel's SIMD implementation detail. This patch allows non Intel SIMD implementations to benefit from a dense hitmask. In addition, the new dense hitmask interweave the primary and secondary matches which allow a better cache usage and enable future improvements for the SIMD implementations The default non SIMD path now use this dense mask. Signed-off-by: Yoan Picchi Reviewed-by: Ruifeng Wang Reviewed-by: Nathan Brown --- lib/hash/compare_signatures_arm.h | 47 ++++++---- lib/hash/compare_signatures_generic.h | 31 ++++--- lib/hash/compare_signatures_x86.h | 9 +- lib/hash/rte_cuckoo_hash.c | 124 +++++++++++++++++++------- 4 files changed, 145 insertions(+), 66 deletions(-) diff --git a/lib/hash/compare_signatures_arm.h b/lib/hash/compare_signatures_arm.h index 74b3286c95..0fc657c49b 100644 --- a/lib/hash/compare_signatures_arm.h +++ b/lib/hash/compare_signatures_arm.h @@ -6,48 +6,57 @@ #ifndef _COMPARE_SIGNATURE_ARM_PVT_H_ #define _COMPARE_SIGNATURE_ARM_PVT_H_ +/* + * Arm's version uses a densely packed hitmask buffer: + * Every bit is in use. + */ + #include #include #include #include "rte_cuckoo_hash.h" +#define DENSE_HASH_BULK_LOOKUP 1 + static inline void -compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, - const struct rte_hash_bucket *prim_bkt, - const struct rte_hash_bucket *sec_bkt, +compare_signatures_dense(uint16_t *hitmask_buffer, + const uint16_t *prim_bucket_sigs, + const uint16_t *sec_bucket_sigs, uint16_t sig, enum rte_hash_sig_compare_function sig_cmp_fn) { - unsigned int i; - /* For match mask the first bit of every two bits indicates the match */ + static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8), + "hitmask_buffer must be wide enough to fit a dense hitmask"); + + /* For match mask every bits indicates the match */ switch (sig_cmp_fn) { #if defined(__ARM_NEON) && RTE_HASH_BUCKET_ENTRIES <= 8 case RTE_HASH_COMPARE_NEON: { uint16x8_t vmat, vsig, x; - int16x8_t shift = {-15, -13, -11, -9, -7, -5, -3, -1}; + int16x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; + uint16_t low, high; vsig = vld1q_dup_u16((uint16_t const *)&sig); /* Compare all signatures in the primary bucket */ - vmat = vceqq_u16(vsig, - vld1q_u16((uint16_t const *)prim_bkt->sig_current)); - x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift); - *prim_hash_matches = (uint32_t)(vaddvq_u16(x)); + vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)prim_bucket_sigs)); + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); + low = (uint16_t)(vaddvq_u16(x)); /* Compare all signatures in the secondary bucket */ - vmat = vceqq_u16(vsig, - vld1q_u16((uint16_t const *)sec_bkt->sig_current)); - x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift); - *sec_hash_matches = (uint32_t)(vaddvq_u16(x)); + vmat = vceqq_u16(vsig, vld1q_u16((uint16_t const *)sec_bucket_sigs)); + x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x0001)), shift); + high = (uint16_t)(vaddvq_u16(x)); + *hitmask_buffer = low | high << RTE_HASH_BUCKET_ENTRIES; + } break; #endif default: - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - *prim_hash_matches |= - ((sig == prim_bkt->sig_current[i]) << (i << 1)); - *sec_hash_matches |= - ((sig == sec_bkt->sig_current[i]) << (i << 1)); + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + *hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i; + *hitmask_buffer |= + ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES; } } } diff --git a/lib/hash/compare_signatures_generic.h b/lib/hash/compare_signatures_generic.h index 43587adcef..1d065d4c28 100644 --- a/lib/hash/compare_signatures_generic.h +++ b/lib/hash/compare_signatures_generic.h @@ -6,27 +6,34 @@ #ifndef _COMPARE_SIGNATURE_GENERIC_PVT_H_ #define _COMPARE_SIGNATURE_GENERIC_PVT_H_ +/* + * The generic version could use either a dense or sparsely packed hitmask buffer, + * but the dense one is slightly faster. + */ + #include #include #include #include "rte_cuckoo_hash.h" +#define DENSE_HASH_BULK_LOOKUP 1 + static inline void -compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, - const struct rte_hash_bucket *prim_bkt, - const struct rte_hash_bucket *sec_bkt, +compare_signatures_dense(uint16_t *hitmask_buffer, + const uint16_t *prim_bucket_sigs, + const uint16_t *sec_bucket_sigs, uint16_t sig, - enum rte_hash_sig_compare_function sig_cmp_fn) + __rte_unused enum rte_hash_sig_compare_function sig_cmp_fn) { - unsigned int i; - - /* For match mask the first bit of every two bits indicates the match */ - for (i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { - *prim_hash_matches |= - ((sig == prim_bkt->sig_current[i]) << (i << 1)); - *sec_hash_matches |= - ((sig == sec_bkt->sig_current[i]) << (i << 1)); + + static_assert(sizeof(*hitmask_buffer) >= 2 * (RTE_HASH_BUCKET_ENTRIES / 8), + "hitmask_buffer must be wide enough to fit a dense hitmask"); + + /* For match mask every bits indicates the match */ + for (unsigned int i = 0; i < RTE_HASH_BUCKET_ENTRIES; i++) { + *hitmask_buffer |= (sig == prim_bucket_sigs[i]) << i; + *hitmask_buffer |= ((sig == sec_bucket_sigs[i]) << i) << RTE_HASH_BUCKET_ENTRIES; } } diff --git a/lib/hash/compare_signatures_x86.h b/lib/hash/compare_signatures_x86.h index f77b37f1cd..03e9c44e53 100644 --- a/lib/hash/compare_signatures_x86.h +++ b/lib/hash/compare_signatures_x86.h @@ -6,14 +6,21 @@ #ifndef _COMPARE_SIGNATURE_X86_PVT_H_ #define _COMPARE_SIGNATURE_X86_PVT_H_ +/* + * x86's version uses a sparsely packed hitmask buffer: + * Every other bit is padding. + */ + #include #include #include #include "rte_cuckoo_hash.h" +#define DENSE_HASH_BULK_LOOKUP 0 + static inline void -compare_signatures(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, +compare_signatures_sparse(uint32_t *prim_hash_matches, uint32_t *sec_hash_matches, const struct rte_hash_bucket *prim_bkt, const struct rte_hash_bucket *sec_bkt, uint16_t sig, diff --git a/lib/hash/rte_cuckoo_hash.c b/lib/hash/rte_cuckoo_hash.c index c3256dff4c..7512861aac 100644 --- a/lib/hash/rte_cuckoo_hash.c +++ b/lib/hash/rte_cuckoo_hash.c @@ -1908,22 +1908,41 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, uint64_t hits = 0; int32_t i; int32_t ret; - uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; - uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; struct rte_hash_bucket *cur_bkt, *next_bkt; +#if DENSE_HASH_BULK_LOOKUP + const int hitmask_padding = 0; + uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0}; +#else + const int hitmask_padding = 1; + uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0}; + uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0}; +#endif + __hash_rw_reader_lock(h); /* Compare signatures and prefetch key slot of first hit */ for (i = 0; i < num_keys; i++) { - compare_signatures(&prim_hitmask[i], &sec_hitmask[i], +#if DENSE_HASH_BULK_LOOKUP + uint16_t *hitmask = &hitmask_buffer[i]; + compare_signatures_dense(hitmask, + primary_bkt[i]->sig_current, + secondary_bkt[i]->sig_current, + sig[i], h->sig_cmp_fn); + const unsigned int prim_hitmask = *(uint8_t *)(hitmask); + const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1); +#else + compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i], primary_bkt[i], secondary_bkt[i], sig[i], h->sig_cmp_fn); + const unsigned int prim_hitmask = prim_hitmask_buffer[i]; + const unsigned int sec_hitmask = sec_hitmask_buffer[i]; +#endif - if (prim_hitmask[i]) { + if (prim_hitmask) { uint32_t first_hit = - rte_ctz32(prim_hitmask[i]) - >> 1; + rte_ctz32(prim_hitmask) + >> hitmask_padding; uint32_t key_idx = primary_bkt[i]->key_idx[first_hit]; const struct rte_hash_key *key_slot = @@ -1934,10 +1953,10 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, continue; } - if (sec_hitmask[i]) { + if (sec_hitmask) { uint32_t first_hit = - rte_ctz32(sec_hitmask[i]) - >> 1; + rte_ctz32(sec_hitmask) + >> hitmask_padding; uint32_t key_idx = secondary_bkt[i]->key_idx[first_hit]; const struct rte_hash_key *key_slot = @@ -1951,10 +1970,18 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, /* Compare keys, first hits in primary first */ for (i = 0; i < num_keys; i++) { positions[i] = -ENOENT; - while (prim_hitmask[i]) { +#if DENSE_HASH_BULK_LOOKUP + uint16_t *hitmask = &hitmask_buffer[i]; + unsigned int prim_hitmask = *(uint8_t *)(hitmask); + unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1); +#else + unsigned int prim_hitmask = prim_hitmask_buffer[i]; + unsigned int sec_hitmask = sec_hitmask_buffer[i]; +#endif + while (prim_hitmask) { uint32_t hit_index = - rte_ctz32(prim_hitmask[i]) - >> 1; + rte_ctz32(prim_hitmask) + >> hitmask_padding; uint32_t key_idx = primary_bkt[i]->key_idx[hit_index]; const struct rte_hash_key *key_slot = @@ -1976,13 +2003,13 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, positions[i] = key_idx - 1; goto next_key; } - prim_hitmask[i] &= ~(3ULL << (hit_index << 1)); + prim_hitmask &= ~(1 << (hit_index << hitmask_padding)); } - while (sec_hitmask[i]) { + while (sec_hitmask) { uint32_t hit_index = - rte_ctz32(sec_hitmask[i]) - >> 1; + rte_ctz32(sec_hitmask) + >> hitmask_padding; uint32_t key_idx = secondary_bkt[i]->key_idx[hit_index]; const struct rte_hash_key *key_slot = @@ -2005,7 +2032,7 @@ __bulk_lookup_l(const struct rte_hash *h, const void **keys, positions[i] = key_idx - 1; goto next_key; } - sec_hitmask[i] &= ~(3ULL << (hit_index << 1)); + sec_hitmask &= ~(1 << (hit_index << hitmask_padding)); } next_key: continue; @@ -2055,11 +2082,20 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, uint64_t hits = 0; int32_t i; int32_t ret; - uint32_t prim_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; - uint32_t sec_hitmask[RTE_HASH_LOOKUP_BULK_MAX] = {0}; struct rte_hash_bucket *cur_bkt, *next_bkt; uint32_t cnt_b, cnt_a; +#if DENSE_HASH_BULK_LOOKUP + const int hitmask_padding = 0; + uint16_t hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0}; + static_assert(sizeof(*hitmask_buffer)*8/2 == RTE_HASH_BUCKET_ENTRIES, + "The hitmask must be exactly wide enough to accept the whole hitmask chen it is dense"); +#else + const int hitmask_padding = 1; + uint32_t prim_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0}; + uint32_t sec_hitmask_buffer[RTE_HASH_LOOKUP_BULK_MAX] = {0}; +#endif + for (i = 0; i < num_keys; i++) positions[i] = -ENOENT; @@ -2073,14 +2109,26 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, /* Compare signatures and prefetch key slot of first hit */ for (i = 0; i < num_keys; i++) { - compare_signatures(&prim_hitmask[i], &sec_hitmask[i], +#if DENSE_HASH_BULK_LOOKUP + uint16_t *hitmask = &hitmask_buffer[i]; + compare_signatures_dense(hitmask, + primary_bkt[i]->sig_current, + secondary_bkt[i]->sig_current, + sig[i], h->sig_cmp_fn); + const unsigned int prim_hitmask = *(uint8_t *)(hitmask); + const unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1); +#else + compare_signatures_sparse(&prim_hitmask_buffer[i], &sec_hitmask_buffer[i], primary_bkt[i], secondary_bkt[i], sig[i], h->sig_cmp_fn); + const unsigned int prim_hitmask = prim_hitmask_buffer[i]; + const unsigned int sec_hitmask = sec_hitmask_buffer[i]; +#endif - if (prim_hitmask[i]) { + if (prim_hitmask) { uint32_t first_hit = - rte_ctz32(prim_hitmask[i]) - >> 1; + rte_ctz32(prim_hitmask) + >> hitmask_padding; uint32_t key_idx = primary_bkt[i]->key_idx[first_hit]; const struct rte_hash_key *key_slot = @@ -2091,10 +2139,10 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, continue; } - if (sec_hitmask[i]) { + if (sec_hitmask) { uint32_t first_hit = - rte_ctz32(sec_hitmask[i]) - >> 1; + rte_ctz32(sec_hitmask) + >> hitmask_padding; uint32_t key_idx = secondary_bkt[i]->key_idx[first_hit]; const struct rte_hash_key *key_slot = @@ -2107,10 +2155,18 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, /* Compare keys, first hits in primary first */ for (i = 0; i < num_keys; i++) { - while (prim_hitmask[i]) { +#if DENSE_HASH_BULK_LOOKUP + uint16_t *hitmask = &hitmask_buffer[i]; + unsigned int prim_hitmask = *(uint8_t *)(hitmask); + unsigned int sec_hitmask = *((uint8_t *)(hitmask)+1); +#else + unsigned int prim_hitmask = prim_hitmask_buffer[i]; + unsigned int sec_hitmask = sec_hitmask_buffer[i]; +#endif + while (prim_hitmask) { uint32_t hit_index = - rte_ctz32(prim_hitmask[i]) - >> 1; + rte_ctz32(prim_hitmask) + >> hitmask_padding; uint32_t key_idx = rte_atomic_load_explicit( &primary_bkt[i]->key_idx[hit_index], @@ -2136,13 +2192,13 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, positions[i] = key_idx - 1; goto next_key; } - prim_hitmask[i] &= ~(3ULL << (hit_index << 1)); + prim_hitmask &= ~(1 << (hit_index << hitmask_padding)); } - while (sec_hitmask[i]) { + while (sec_hitmask) { uint32_t hit_index = - rte_ctz32(sec_hitmask[i]) - >> 1; + rte_ctz32(sec_hitmask) + >> hitmask_padding; uint32_t key_idx = rte_atomic_load_explicit( &secondary_bkt[i]->key_idx[hit_index], @@ -2169,7 +2225,7 @@ __bulk_lookup_lf(const struct rte_hash *h, const void **keys, positions[i] = key_idx - 1; goto next_key; } - sec_hitmask[i] &= ~(3ULL << (hit_index << 1)); + sec_hitmask &= ~(1 << (hit_index << hitmask_padding)); } next_key: continue;