[dpdk-dev,v3,3/6] hash: update jhash function with the latest available
Commit Message
The Jenkins hash function was originally developed in 1996,
and was integrated in the first versions of DPDK.
The function was improved in 2006,
achieving up to 60% better performance compared to the original one.
This patch integrates that improved code into the rte_jhash library.
Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
---
lib/librte_hash/rte_jhash.h | 261 +++++++++++++++++++++++++++++++------------
1 files changed, 188 insertions(+), 73 deletions(-)
Comments
Hi Pablo,
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Pablo de Lara
> Sent: Tuesday, May 05, 2015 3:44 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the latest available
>
> Jenkins hash function was developed originally in 1996,
> and was integrated in first versions of DPDK.
> The function has been improved in 2006,
> achieving up to 60% better performance, compared to the original one.
>
> This patch integrates that code into the rte_jhash library.
>
> Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> ---
> lib/librte_hash/rte_jhash.h | 261 +++++++++++++++++++++++++++++++------------
> 1 files changed, 188 insertions(+), 73 deletions(-)
>
> diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> index a4bf5a1..0e96b7c 100644
> --- a/lib/librte_hash/rte_jhash.h
> +++ b/lib/librte_hash/rte_jhash.h
> @@ -1,7 +1,7 @@
> /*-
> * BSD LICENSE
> *
> - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> * All rights reserved.
> *
> * Redistribution and use in source and binary forms, with or without
> @@ -45,38 +45,68 @@ extern "C" {
> #endif
>
> #include <stdint.h>
> +#include <string.h>
> +#include <rte_byteorder.h>
>
> /* jhash.h: Jenkins hash support.
> *
> - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
> *
> * http://burtleburtle.net/bob/hash/
> *
> * These are the credits from Bob's sources:
> *
> - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> - * hash(), hash2(), hash3, and mix() are externally useful functions.
> - * Routines to test the hash are included if SELF_TEST is defined.
> - * You can use this free for any purpose. It has no warranty.
> + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> + *
> + * These are functions for producing 32-bit hashes for hash table lookup.
> + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> + * are externally useful functions. Routines to test the hash are included
> + * if SELF_TEST is defined. You can use this free for any purpose. It's in
> + * the public domain. It has no warranty.
> *
> * $FreeBSD$
> */
>
> +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
> +
> /** @internal Internal function. NOTE: Arguments are modified. */
> #define __rte_jhash_mix(a, b, c) do { \
> - a -= b; a -= c; a ^= (c>>13); \
> - b -= c; b -= a; b ^= (a<<8); \
> - c -= a; c -= b; c ^= (b>>13); \
> - a -= b; a -= c; a ^= (c>>12); \
> - b -= c; b -= a; b ^= (a<<16); \
> - c -= a; c -= b; c ^= (b>>5); \
> - a -= b; a -= c; a ^= (c>>3); \
> - b -= c; b -= a; b ^= (a<<10); \
> - c -= a; c -= b; c ^= (b>>15); \
> + a -= c; a ^= rot(c, 4); c += b; \
> + b -= a; b ^= rot(a, 6); a += c; \
> + c -= b; c ^= rot(b, 8); b += a; \
> + a -= c; a ^= rot(c, 16); c += b; \
> + b -= a; b ^= rot(a, 19); a += c; \
> + c -= b; c ^= rot(b, 4); b += a; \
> +} while (0)
> +
> +#define __rte_jhash_final(a, b, c) do { \
> + c ^= b; c -= rot(b, 14); \
> + a ^= c; a -= rot(c, 11); \
> + b ^= a; b -= rot(a, 25); \
> + c ^= b; c -= rot(b, 16); \
> + a ^= c; a -= rot(c, 4); \
> + b ^= a; b -= rot(a, 14); \
> + c ^= b; c -= rot(b, 24); \
> } while (0)
>
> /** The golden ratio: an arbitrary value. */
> -#define RTE_JHASH_GOLDEN_RATIO 0x9e3779b9
> +#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef
> +
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +#define RTE_JHASH_BYTE0_SHIFT 0
> +#define RTE_JHASH_BYTE1_SHIFT 8
> +#define RTE_JHASH_BYTE2_SHIFT 16
> +#define RTE_JHASH_BYTE3_SHIFT 24
> +#else
> +#define RTE_JHASH_BYTE0_SHIFT 24
> +#define RTE_JHASH_BYTE1_SHIFT 16
> +#define RTE_JHASH_BYTE2_SHIFT 8
> +#define RTE_JHASH_BYTE3_SHIFT 0
> +#endif
> +
> +#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
> +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
> +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
>
> /**
> * The most generic version, hashes an arbitrary sequence
> @@ -95,42 +125,119 @@ extern "C" {
> static inline uint32_t
> rte_jhash(const void *key, uint32_t length, uint32_t initval)
> {
> - uint32_t a, b, c, len;
> - const uint8_t *k = (const uint8_t *)key;
> - const uint32_t *k32 = (const uint32_t *)key;
> + uint32_t a, b, c;
> + union {
> + const void *ptr;
> + size_t i;
> + } u;
>
> - len = length;
> - a = b = RTE_JHASH_GOLDEN_RATIO;
> - c = initval;
> + /* Set up the internal state */
> + a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
>
> - while (len >= 12) {
> - a += k32[0];
> - b += k32[1];
> - c += k32[2];
> + u.ptr = key;
>
> - __rte_jhash_mix(a,b,c);
> + /* Check key alignment. For x86 architecture, first case is always optimal */
> + if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i & 0x3) == 0) {
I wonder why strcmp() is used here; why not something like '#if defined(RTE_ARCH_I686) || defined(RTE_ARCH_X86_64)', as in all other places?
Another question: what would happen in the case of RTE_ARCH="x86_x32"?
Konstantin
> + const uint32_t *k = (const uint32_t *)key;
>
> - k += (3 * sizeof(uint32_t)), k32 += 3;
> - len -= (3 * sizeof(uint32_t));
> - }
> + while (length > 12) {
> + a += k[0];
> + b += k[1];
> + c += k[2];
>
> - c += length;
> - switch (len) {
> - case 11: c += ((uint32_t)k[10] << 24);
> - case 10: c += ((uint32_t)k[9] << 16);
> - case 9 : c += ((uint32_t)k[8] << 8);
> - case 8 : b += ((uint32_t)k[7] << 24);
> - case 7 : b += ((uint32_t)k[6] << 16);
> - case 6 : b += ((uint32_t)k[5] << 8);
> - case 5 : b += k[4];
> - case 4 : a += ((uint32_t)k[3] << 24);
> - case 3 : a += ((uint32_t)k[2] << 16);
> - case 2 : a += ((uint32_t)k[1] << 8);
> - case 1 : a += k[0];
> - default: break;
> - };
> + __rte_jhash_mix(a, b, c);
> +
> + k += 3;
> + length -= 12;
> + }
> +
> + switch (length) {
> + case 12:
> + c += k[2]; b += k[1]; a += k[0]; break;
> + case 11:
> + c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break;
> + case 10:
> + c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break;
> + case 9:
> + c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break;
> + case 8:
> + b += k[1]; a += k[0]; break;
> + case 7:
> + b += k[1] & LOWER24b_MASK; a += k[0]; break;
> + case 6:
> + b += k[1] & LOWER16b_MASK; a += k[0]; break;
> + case 5:
> + b += k[1] & LOWER8b_MASK; a += k[0]; break;
> + case 4:
> + a += k[0]; break;
> + case 3:
> + a += k[0] & LOWER24b_MASK; break;
> + case 2:
> + a += k[0] & LOWER16b_MASK; break;
> + case 1:
> + a += k[0] & LOWER8b_MASK; break;
> + /* zero length strings require no mixing */
> + case 0:
> + return c;
> + };
> + } else {
> + const uint8_t *k = (const uint8_t *)key;
> +
> + /* all but the last block: affect some 32 bits of (a, b, c) */
> + while (length > 12) {
> + a += ((uint32_t)k[0]) << RTE_JHASH_BYTE0_SHIFT;
> + a += ((uint32_t)k[1]) << RTE_JHASH_BYTE1_SHIFT;
> + a += ((uint32_t)k[2]) << RTE_JHASH_BYTE2_SHIFT;
> + a += ((uint32_t)k[3]) << RTE_JHASH_BYTE3_SHIFT;
> + b += ((uint32_t)k[4]) << RTE_JHASH_BYTE0_SHIFT;
> + b += ((uint32_t)k[5]) << RTE_JHASH_BYTE1_SHIFT;
> + b += ((uint32_t)k[6]) << RTE_JHASH_BYTE2_SHIFT;
> + b += ((uint32_t)k[7]) << RTE_JHASH_BYTE3_SHIFT;
> + c += ((uint32_t)k[8]) << RTE_JHASH_BYTE0_SHIFT;
> + c += ((uint32_t)k[9]) << RTE_JHASH_BYTE1_SHIFT;
> + c += ((uint32_t)k[10]) << RTE_JHASH_BYTE2_SHIFT;
> + c += ((uint32_t)k[11]) << RTE_JHASH_BYTE3_SHIFT;
> +
> + __rte_jhash_mix(a, b, c);
> +
> + k += 12;
> + length -= 12;
> + }
> +
> + /* last block: affect all 32 bits of (c) */
> + /* all the case statements fall through */
> + switch (length) {
> + case 12:
> + c += ((uint32_t)k[11]) << RTE_JHASH_BYTE3_SHIFT;
> + case 11:
> + c += ((uint32_t)k[10]) << RTE_JHASH_BYTE2_SHIFT;
> + case 10:
> + c += ((uint32_t)k[9]) << RTE_JHASH_BYTE1_SHIFT;
> + case 9:
> + c += ((uint32_t)k[8]) << RTE_JHASH_BYTE0_SHIFT;
> + case 8:
> + b += ((uint32_t)k[7]) << RTE_JHASH_BYTE3_SHIFT;
> + case 7:
> + b += ((uint32_t)k[6]) << RTE_JHASH_BYTE2_SHIFT;
> + case 6:
> + b += ((uint32_t)k[5]) << RTE_JHASH_BYTE1_SHIFT;
> + case 5:
> + b += ((uint32_t)k[4]) << RTE_JHASH_BYTE0_SHIFT;
> + case 4:
> + a += ((uint32_t)k[3]) << RTE_JHASH_BYTE3_SHIFT;
> + case 3:
> + a += ((uint32_t)k[2]) << RTE_JHASH_BYTE2_SHIFT;
> + case 2:
> + a += ((uint32_t)k[1]) << RTE_JHASH_BYTE1_SHIFT;
> + case 1:
> + a += ((uint32_t)k[0]) << RTE_JHASH_BYTE0_SHIFT;
> + break;
> + case 0:
> + return c;
> + }
> + }
>
> - __rte_jhash_mix(a,b,c);
> + __rte_jhash_final(a, b, c);
>
> return c;
> }
> @@ -151,33 +258,51 @@ rte_jhash(const void *key, uint32_t length, uint32_t initval)
> static inline uint32_t
> rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
> {
> - uint32_t a, b, c, len;
> + uint32_t a, b, c;
>
> - a = b = RTE_JHASH_GOLDEN_RATIO;
> - c = initval;
> - len = length;
> + /* Set up the internal state */
> + a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length) << 2) + initval;
>
> - while (len >= 3) {
> + /* Handle most of the key */
> + while (length > 3) {
> a += k[0];
> b += k[1];
> c += k[2];
> +
> __rte_jhash_mix(a, b, c);
> - k += 3; len -= 3;
> - }
>
> - c += length * 4;
> + k += 3;
> + length -= 3;
> + }
>
> - switch (len) {
> - case 2 : b += k[1];
> - case 1 : a += k[0];
> - default: break;
> + /* Handle the last 3 uint32_t's */
> + switch (length) {
> + case 3:
> + c += k[2];
> + case 2:
> + b += k[1];
> + case 1:
> + a += k[0];
> + __rte_jhash_final(a, b, c);
> + /* case 0: nothing left to add */
> + case 0:
> + break;
> };
>
> - __rte_jhash_mix(a,b,c);
> -
> return c;
> }
>
> +static inline uint32_t
> +__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
> +{
> + a += RTE_JHASH_GOLDEN_RATIO + initval;
> + b += RTE_JHASH_GOLDEN_RATIO + initval;
> + c += RTE_JHASH_GOLDEN_RATIO + initval;
> +
> + __rte_jhash_final(a, b, c);
> +
> + return c;
> +}
>
> /**
> * A special ultra-optimized versions that knows it is hashing exactly
> @@ -197,17 +322,7 @@ rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
> static inline uint32_t
> rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
> {
> - a += RTE_JHASH_GOLDEN_RATIO;
> - b += RTE_JHASH_GOLDEN_RATIO;
> - c += initval;
> -
> - __rte_jhash_mix(a, b, c);
> -
> - /*
> - * NOTE: In particular the "c += length; __rte_jhash_mix(a,b,c);"
> - * normally done at the end is not done here.
> - */
> - return c;
> + return __rte_jhash_3words(a + 12, b + 12, c + 12, initval);
> }
>
> /**
> @@ -226,7 +341,7 @@ rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
> static inline uint32_t
> rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
> {
> - return rte_jhash_3words(a, b, 0, initval);
> + return __rte_jhash_3words(a + 8, b + 8, 8, initval);
> }
>
> /**
> @@ -243,7 +358,7 @@ rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
> static inline uint32_t
> rte_jhash_1word(uint32_t a, uint32_t initval)
> {
> - return rte_jhash_3words(a, 0, 0, initval);
> + return __rte_jhash_3words(a + 4, 4, 4, initval);
> }
>
> #ifdef __cplusplus
> --
> 1.7.4.1
Hi Konstantin,
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Wednesday, May 06, 2015 1:36 AM
> To: De Lara Guarch, Pablo; dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> latest available
>
>
> Hi Pablo,
>
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Pablo de Lara
> > Sent: Tuesday, May 05, 2015 3:44 PM
> > To: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> latest available
> >
> > Jenkins hash function was developed originally in 1996,
> > and was integrated in first versions of DPDK.
> > The function has been improved in 2006,
> > achieving up to 60% better performance, compared to the original one.
> >
> > This patch integrates that code into the rte_jhash library.
> >
> > Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> > ---
> > lib/librte_hash/rte_jhash.h | 261
> +++++++++++++++++++++++++++++++------------
> > 1 files changed, 188 insertions(+), 73 deletions(-)
> >
> > diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> > index a4bf5a1..0e96b7c 100644
> > --- a/lib/librte_hash/rte_jhash.h
> > +++ b/lib/librte_hash/rte_jhash.h
> > @@ -1,7 +1,7 @@
> > /*-
> > * BSD LICENSE
> > *
> > - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> > + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> > * All rights reserved.
> > *
> > * Redistribution and use in source and binary forms, with or without
> > @@ -45,38 +45,68 @@ extern "C" {
> > #endif
> >
> > #include <stdint.h>
> > +#include <string.h>
> > +#include <rte_byteorder.h>
> >
> > /* jhash.h: Jenkins hash support.
> > *
> > - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > *
> > * http://burtleburtle.net/bob/hash/
> > *
> > * These are the credits from Bob's sources:
> > *
> > - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> > - * hash(), hash2(), hash3, and mix() are externally useful functions.
> > - * Routines to test the hash are included if SELF_TEST is defined.
> > - * You can use this free for any purpose. It has no warranty.
> > + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> > + *
> > + * These are functions for producing 32-bit hashes for hash table lookup.
> > + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> > + * are externally useful functions. Routines to test the hash are included
> > + * if SELF_TEST is defined. You can use this free for any purpose. It's in
> > + * the public domain. It has no warranty.
> > *
> > * $FreeBSD$
> > */
> >
> > +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
> > +
> > /** @internal Internal function. NOTE: Arguments are modified. */
> > #define __rte_jhash_mix(a, b, c) do { \
> > - a -= b; a -= c; a ^= (c>>13); \
> > - b -= c; b -= a; b ^= (a<<8); \
> > - c -= a; c -= b; c ^= (b>>13); \
> > - a -= b; a -= c; a ^= (c>>12); \
> > - b -= c; b -= a; b ^= (a<<16); \
> > - c -= a; c -= b; c ^= (b>>5); \
> > - a -= b; a -= c; a ^= (c>>3); \
> > - b -= c; b -= a; b ^= (a<<10); \
> > - c -= a; c -= b; c ^= (b>>15); \
> > + a -= c; a ^= rot(c, 4); c += b; \
> > + b -= a; b ^= rot(a, 6); a += c; \
> > + c -= b; c ^= rot(b, 8); b += a; \
> > + a -= c; a ^= rot(c, 16); c += b; \
> > + b -= a; b ^= rot(a, 19); a += c; \
> > + c -= b; c ^= rot(b, 4); b += a; \
> > +} while (0)
> > +
> > +#define __rte_jhash_final(a, b, c) do { \
> > + c ^= b; c -= rot(b, 14); \
> > + a ^= c; a -= rot(c, 11); \
> > + b ^= a; b -= rot(a, 25); \
> > + c ^= b; c -= rot(b, 16); \
> > + a ^= c; a -= rot(c, 4); \
> > + b ^= a; b -= rot(a, 14); \
> > + c ^= b; c -= rot(b, 24); \
> > } while (0)
> >
> > /** The golden ratio: an arbitrary value. */
> > -#define RTE_JHASH_GOLDEN_RATIO 0x9e3779b9
> > +#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef
> > +
> > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > +#define RTE_JHASH_BYTE0_SHIFT 0
> > +#define RTE_JHASH_BYTE1_SHIFT 8
> > +#define RTE_JHASH_BYTE2_SHIFT 16
> > +#define RTE_JHASH_BYTE3_SHIFT 24
> > +#else
> > +#define RTE_JHASH_BYTE0_SHIFT 24
> > +#define RTE_JHASH_BYTE1_SHIFT 16
> > +#define RTE_JHASH_BYTE2_SHIFT 8
> > +#define RTE_JHASH_BYTE3_SHIFT 0
> > +#endif
> > +
> > +#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
> > +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
> > +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
> >
> > /**
> > * The most generic version, hashes an arbitrary sequence
> > @@ -95,42 +125,119 @@ extern "C" {
> > static inline uint32_t
> > rte_jhash(const void *key, uint32_t length, uint32_t initval)
> > {
> > - uint32_t a, b, c, len;
> > - const uint8_t *k = (const uint8_t *)key;
> > - const uint32_t *k32 = (const uint32_t *)key;
> > + uint32_t a, b, c;
> > + union {
> > + const void *ptr;
> > + size_t i;
> > + } u;
> >
> > - len = length;
> > - a = b = RTE_JHASH_GOLDEN_RATIO;
> > - c = initval;
> > + /* Set up the internal state */
> > + a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
> >
> > - while (len >= 12) {
> > - a += k32[0];
> > - b += k32[1];
> > - c += k32[2];
> > + u.ptr = key;
> >
> > - __rte_jhash_mix(a,b,c);
> > + /* Check key alignment. For x86 architecture, first case is always
> optimal */
> > + if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i
> & 0x3) == 0) {
>
> Wonder why strcmp(), why not something like: 'if defined(RTE_ARCH_I686)
> || defined(RTE_ARCH_X86_64)' as in all other places?
> Another question what would be in case of RTE_ARCH="x86_x32"?
> Konstantin
Functionally it is the same, and using this method I can integrate all the conditions in one line, so it takes less code.
I also checked the assembly code, and the compiler removes the check if it is Intel architecture, so performance remains the same.
Regarding x86_x32, you are right, I probably need to include it. Although, I just realized that it is not used in any other place.
I wonder if we should include it somewhere else, e.g. in rte_hash_crc.h?
Hi Pablo,
> -----Original Message-----
> From: De Lara Guarch, Pablo
> Sent: Wednesday, May 06, 2015 10:36 AM
> To: Ananyev, Konstantin; dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the latest available
>
> Hi Konstantin,
>
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Wednesday, May 06, 2015 1:36 AM
> > To: De Lara Guarch, Pablo; dev@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> > latest available
> >
> >
> > Hi Pablo,
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Pablo de Lara
> > > Sent: Tuesday, May 05, 2015 3:44 PM
> > > To: dev@dpdk.org
> > > Subject: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> > latest available
> > >
> > > Jenkins hash function was developed originally in 1996,
> > > and was integrated in first versions of DPDK.
> > > The function has been improved in 2006,
> > > achieving up to 60% better performance, compared to the original one.
> > >
> > > This patch integrates that code into the rte_jhash library.
> > >
> > > Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> > > ---
> > > lib/librte_hash/rte_jhash.h | 261
> > +++++++++++++++++++++++++++++++------------
> > > 1 files changed, 188 insertions(+), 73 deletions(-)
> > >
> > > diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> > > index a4bf5a1..0e96b7c 100644
> > > --- a/lib/librte_hash/rte_jhash.h
> > > +++ b/lib/librte_hash/rte_jhash.h
> > > @@ -1,7 +1,7 @@
> > > /*-
> > > * BSD LICENSE
> > > *
> > > - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> > > + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> > > * All rights reserved.
> > > *
> > > * Redistribution and use in source and binary forms, with or without
> > > @@ -45,38 +45,68 @@ extern "C" {
> > > #endif
> > >
> > > #include <stdint.h>
> > > +#include <string.h>
> > > +#include <rte_byteorder.h>
> > >
> > > /* jhash.h: Jenkins hash support.
> > > *
> > > - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > > + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > > *
> > > * http://burtleburtle.net/bob/hash/
> > > *
> > > * These are the credits from Bob's sources:
> > > *
> > > - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> > > - * hash(), hash2(), hash3, and mix() are externally useful functions.
> > > - * Routines to test the hash are included if SELF_TEST is defined.
> > > - * You can use this free for any purpose. It has no warranty.
> > > + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> > > + *
> > > + * These are functions for producing 32-bit hashes for hash table lookup.
> > > + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> > > + * are externally useful functions. Routines to test the hash are included
> > > + * if SELF_TEST is defined. You can use this free for any purpose. It's in
> > > + * the public domain. It has no warranty.
> > > *
> > > * $FreeBSD$
> > > */
> > >
> > > +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
> > > +
> > > /** @internal Internal function. NOTE: Arguments are modified. */
> > > #define __rte_jhash_mix(a, b, c) do { \
> > > - a -= b; a -= c; a ^= (c>>13); \
> > > - b -= c; b -= a; b ^= (a<<8); \
> > > - c -= a; c -= b; c ^= (b>>13); \
> > > - a -= b; a -= c; a ^= (c>>12); \
> > > - b -= c; b -= a; b ^= (a<<16); \
> > > - c -= a; c -= b; c ^= (b>>5); \
> > > - a -= b; a -= c; a ^= (c>>3); \
> > > - b -= c; b -= a; b ^= (a<<10); \
> > > - c -= a; c -= b; c ^= (b>>15); \
> > > + a -= c; a ^= rot(c, 4); c += b; \
> > > + b -= a; b ^= rot(a, 6); a += c; \
> > > + c -= b; c ^= rot(b, 8); b += a; \
> > > + a -= c; a ^= rot(c, 16); c += b; \
> > > + b -= a; b ^= rot(a, 19); a += c; \
> > > + c -= b; c ^= rot(b, 4); b += a; \
> > > +} while (0)
> > > +
> > > +#define __rte_jhash_final(a, b, c) do { \
> > > + c ^= b; c -= rot(b, 14); \
> > > + a ^= c; a -= rot(c, 11); \
> > > + b ^= a; b -= rot(a, 25); \
> > > + c ^= b; c -= rot(b, 16); \
> > > + a ^= c; a -= rot(c, 4); \
> > > + b ^= a; b -= rot(a, 14); \
> > > + c ^= b; c -= rot(b, 24); \
> > > } while (0)
> > >
> > > /** The golden ratio: an arbitrary value. */
> > > -#define RTE_JHASH_GOLDEN_RATIO 0x9e3779b9
> > > +#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef
> > > +
> > > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > > +#define RTE_JHASH_BYTE0_SHIFT 0
> > > +#define RTE_JHASH_BYTE1_SHIFT 8
> > > +#define RTE_JHASH_BYTE2_SHIFT 16
> > > +#define RTE_JHASH_BYTE3_SHIFT 24
> > > +#else
> > > +#define RTE_JHASH_BYTE0_SHIFT 24
> > > +#define RTE_JHASH_BYTE1_SHIFT 16
> > > +#define RTE_JHASH_BYTE2_SHIFT 8
> > > +#define RTE_JHASH_BYTE3_SHIFT 0
> > > +#endif
> > > +
> > > +#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
> > > +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
> > > +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
> > >
> > > /**
> > > * The most generic version, hashes an arbitrary sequence
> > > @@ -95,42 +125,119 @@ extern "C" {
> > > static inline uint32_t
> > > rte_jhash(const void *key, uint32_t length, uint32_t initval)
> > > {
> > > - uint32_t a, b, c, len;
> > > - const uint8_t *k = (const uint8_t *)key;
> > > - const uint32_t *k32 = (const uint32_t *)key;
> > > + uint32_t a, b, c;
> > > + union {
> > > + const void *ptr;
> > > + size_t i;
> > > + } u;
> > >
> > > - len = length;
> > > - a = b = RTE_JHASH_GOLDEN_RATIO;
> > > - c = initval;
> > > + /* Set up the internal state */
> > > + a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
> > >
> > > - while (len >= 12) {
> > > - a += k32[0];
> > > - b += k32[1];
> > > - c += k32[2];
> > > + u.ptr = key;
> > >
> > > - __rte_jhash_mix(a,b,c);
> > > + /* Check key alignment. For x86 architecture, first case is always
> > optimal */
> > > + if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i
> > & 0x3) == 0) {
> >
> > Wonder why strcmp(), why not something like: 'if defined(RTE_ARCH_I686)
> > || defined(RTE_ARCH_X86_64)' as in all other places?
> > Another question what would be in case of RTE_ARCH="x86_x32"?
> > Konstantin
>
> Functionally is the same and using this method, I can integrate all conditions in one line, so it takes less code.
> I also checked the assembly code, and the compiler removes the check if it is Intel architecture, so performance remains the same.
Well, yes, I think most modern compilers treat strcmp() as a builtin function and are able to optimise these strcmp() calls away in that case.
But we probably can't guarantee that this will always be the case for all the different compiler/libc combinations.
Also, for some reason the user might need to use the '-fno-builtin' flag while building their code.
So I would use pre-processor macros here; that is more predictable.
And that way it is consistent with other places.
Actually, I wonder whether you really need this sort of split between the aligned and non-aligned cases.
I wonder whether something like this would work for you:
#ifndef RTE_ARCH_X86
const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3);
const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT;
#else /*X86*/
const uint32_t *k = key;
const uint32_t s = 0;
#endif
while (length > 12) {
a += k[0] >> s | (uint64_t)k[1] << (32 - s);
b += k[1] >> s | (uint64_t)k[2] << (32 - s);
c += k[2] >> s | (uint64_t)k[3] << (32 - s);
k += 3;
length -= 12;
}
switch (length) {
case 12:
a += k[0] >> s | (uint64_t)k[1] << (32 - s);
b += k[1] >> s | (uint64_t)k[2] << (32 - s);
c += k[2] >> s | (uint64_t)k[3] << (32 - s);
break;
case 11:
a += k[0] >> s | (uint64_t)k[1] << (32 - s);
b += k[1] >> s | (uint64_t)k[2] << (32 - s);
c += (k[2] >> s | (uint64_t)k[3] << (32 - s)) & LOWER24b_MASK;
break;
...
case 1:
a += (k[0] >> s | (uint64_t)k[1] << (32 - s)) & LOWER8b_MASK;
break;
...
That way, even for the non-aligned case you don't need to do 4B reads.
For x86, the compiler would do its optimisation work and strip off '>> s | (uint64_t)k[..] << (32 - s);'.
>
> Re x86_x32, you are right, probably I need to include it. Although, I just realized that it is not used in any other place.
> Wonder if we should include it somewhere else? E.g. rte_hash_crc.h
Yep, that's true, we are not doing it for hash_crc either...
It would probably be good to have some sort of 'RTE_ARCH_X86' that would be defined for all x86 targets, and to use it wherever applicable.
But I suppose that's a subject for another patch.
Konstantin
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Wednesday, May 06, 2015 5:11 PM
> To: De Lara Guarch, Pablo; dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the latest available
>
> Hi Pablo,
>
> > -----Original Message-----
> > From: De Lara Guarch, Pablo
> > Sent: Wednesday, May 06, 2015 10:36 AM
> > To: Ananyev, Konstantin; dev@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the latest available
> >
> > Hi Konstantin,
> >
> > > -----Original Message-----
> > > From: Ananyev, Konstantin
> > > Sent: Wednesday, May 06, 2015 1:36 AM
> > > To: De Lara Guarch, Pablo; dev@dpdk.org
> > > Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> > > latest available
> > >
> > >
> > > Hi Pablo,
> > >
> > > > -----Original Message-----
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Pablo de Lara
> > > > Sent: Tuesday, May 05, 2015 3:44 PM
> > > > To: dev@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> > > latest available
> > > >
> > > > Jenkins hash function was developed originally in 1996,
> > > > and was integrated in first versions of DPDK.
> > > > The function has been improved in 2006,
> > > > achieving up to 60% better performance, compared to the original one.
> > > >
> > > > This patch integrates that code into the rte_jhash library.
> > > >
> > > > Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> > > > ---
> > > > lib/librte_hash/rte_jhash.h | 261
> > > +++++++++++++++++++++++++++++++------------
> > > > 1 files changed, 188 insertions(+), 73 deletions(-)
> > > >
> > > > diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> > > > index a4bf5a1..0e96b7c 100644
> > > > --- a/lib/librte_hash/rte_jhash.h
> > > > +++ b/lib/librte_hash/rte_jhash.h
> > > > @@ -1,7 +1,7 @@
> > > > /*-
> > > > * BSD LICENSE
> > > > *
> > > > - * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> > > > + * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> > > > * All rights reserved.
> > > > *
> > > > * Redistribution and use in source and binary forms, with or without
> > > > @@ -45,38 +45,68 @@ extern "C" {
> > > > #endif
> > > >
> > > > #include <stdint.h>
> > > > +#include <string.h>
> > > > +#include <rte_byteorder.h>
> > > >
> > > > /* jhash.h: Jenkins hash support.
> > > > *
> > > > - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > > > + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > > > *
> > > > * http://burtleburtle.net/bob/hash/
> > > > *
> > > > * These are the credits from Bob's sources:
> > > > *
> > > > - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> > > > - * hash(), hash2(), hash3, and mix() are externally useful functions.
> > > > - * Routines to test the hash are included if SELF_TEST is defined.
> > > > - * You can use this free for any purpose. It has no warranty.
> > > > + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> > > > + *
> > > > + * These are functions for producing 32-bit hashes for hash table lookup.
> > > > + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> > > > + * are externally useful functions. Routines to test the hash are included
> > > > + * if SELF_TEST is defined. You can use this free for any purpose. It's in
> > > > + * the public domain. It has no warranty.
> > > > *
> > > > * $FreeBSD$
> > > > */
> > > >
> > > > +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
> > > > +
> > > > /** @internal Internal function. NOTE: Arguments are modified. */
> > > > #define __rte_jhash_mix(a, b, c) do { \
> > > > - a -= b; a -= c; a ^= (c>>13); \
> > > > - b -= c; b -= a; b ^= (a<<8); \
> > > > - c -= a; c -= b; c ^= (b>>13); \
> > > > - a -= b; a -= c; a ^= (c>>12); \
> > > > - b -= c; b -= a; b ^= (a<<16); \
> > > > - c -= a; c -= b; c ^= (b>>5); \
> > > > - a -= b; a -= c; a ^= (c>>3); \
> > > > - b -= c; b -= a; b ^= (a<<10); \
> > > > - c -= a; c -= b; c ^= (b>>15); \
> > > > + a -= c; a ^= rot(c, 4); c += b; \
> > > > + b -= a; b ^= rot(a, 6); a += c; \
> > > > + c -= b; c ^= rot(b, 8); b += a; \
> > > > + a -= c; a ^= rot(c, 16); c += b; \
> > > > + b -= a; b ^= rot(a, 19); a += c; \
> > > > + c -= b; c ^= rot(b, 4); b += a; \
> > > > +} while (0)
> > > > +
> > > > +#define __rte_jhash_final(a, b, c) do { \
> > > > + c ^= b; c -= rot(b, 14); \
> > > > + a ^= c; a -= rot(c, 11); \
> > > > + b ^= a; b -= rot(a, 25); \
> > > > + c ^= b; c -= rot(b, 16); \
> > > > + a ^= c; a -= rot(c, 4); \
> > > > + b ^= a; b -= rot(a, 14); \
> > > > + c ^= b; c -= rot(b, 24); \
> > > > } while (0)
> > > >
> > > > /** The golden ratio: an arbitrary value. */
> > > > -#define RTE_JHASH_GOLDEN_RATIO 0x9e3779b9
> > > > +#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef
> > > > +
> > > > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > > > +#define RTE_JHASH_BYTE0_SHIFT 0
> > > > +#define RTE_JHASH_BYTE1_SHIFT 8
> > > > +#define RTE_JHASH_BYTE2_SHIFT 16
> > > > +#define RTE_JHASH_BYTE3_SHIFT 24
> > > > +#else
> > > > +#define RTE_JHASH_BYTE0_SHIFT 24
> > > > +#define RTE_JHASH_BYTE1_SHIFT 16
> > > > +#define RTE_JHASH_BYTE2_SHIFT 8
> > > > +#define RTE_JHASH_BYTE3_SHIFT 0
> > > > +#endif
> > > > +
> > > > +#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
> > > > +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
> > > > +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
> > > >
> > > > /**
> > > > * The most generic version, hashes an arbitrary sequence
> > > > @@ -95,42 +125,119 @@ extern "C" {
> > > > static inline uint32_t
> > > > rte_jhash(const void *key, uint32_t length, uint32_t initval)
> > > > {
> > > > - uint32_t a, b, c, len;
> > > > - const uint8_t *k = (const uint8_t *)key;
> > > > - const uint32_t *k32 = (const uint32_t *)key;
> > > > + uint32_t a, b, c;
> > > > + union {
> > > > + const void *ptr;
> > > > + size_t i;
> > > > + } u;
> > > >
> > > > - len = length;
> > > > - a = b = RTE_JHASH_GOLDEN_RATIO;
> > > > - c = initval;
> > > > + /* Set up the internal state */
> > > > + a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
> > > >
> > > > - while (len >= 12) {
> > > > - a += k32[0];
> > > > - b += k32[1];
> > > > - c += k32[2];
> > > > + u.ptr = key;
> > > >
> > > > - __rte_jhash_mix(a,b,c);
> > > > + /* Check key alignment. For x86 architecture, first case is always
> > > optimal */
> > > > + if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i
> > > & 0x3) == 0) {
> > >
> > > Wonder why strcmp(), why not something like: 'if defined(RTE_ARCH_I686)
> > > || defined(RTE_ARCH_X86_64)' as in all other places?
> > > Another question what would be in case of RTE_ARCH="x86_x32"?
> > > Konstantin
> >
> > Functionally is the same and using this method, I can integrate all conditions in one line, so it takes less code.
> > I also checked the assembly code, and the compiler removes the check if it is Intel architecture, so performance remains the same.
>
> Well, yes I think most modern compilers treat strcmp() as a builtin function and are able to optimise these strcmp() calls off for that
> case.
> But we probably can't guarantee that it would always be the case for all different compiler/libc combinations.
> Again, by some reason user might need to use ' -fno-builtin' flag while building his stuff.
> So I would use pre-processor macros here, it is more predictable.
> Again, that way it is consistent with other places.
>
> Actually I wonder do you really need such sort of diversity for aligned/non-aligned case?
> Wonder wouldn't something like that work for you:
>
> #infdef RTE_ARCH_X86
> const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3);
> const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT;
> #else /*X86*/
> const uint32_t *k = key;
> const uint32_t s = 0;
> #endif
>
> while (len > 12) {
> a += k[0] >> s | (uint64_t)k[1] << (32 - s);
> b += k[1] >> s | (uint64_t)k[2] << (32 - s);
> c += k[2] >> s | (uint64_t)k[3] << (32 - s);
> k += 3;
> length -= 12;
> }
>
> switch (length) {
> case 12:
> a += k[0] >> s | (uint64_t)k[1] << (32 - s);
> b += k[1] >> s | (uint64_t)k[2] << (32 - s);
> c += k[2] >> s | (uint64_t)k[3] << (32 - s);
> break;
> case 11:
> a += k[0] >> s | (uint64_t)k[1] << (32 - s);
> b += k[1] >> s | (uint64_t)k[2] << (32 - s);
> c += (k[2] >> s | (uint64_t)k[3] << (32 - s)) & & LOWER24b_MASK;
> break;
> ...
> case 1:
> a += (k[0] >> s | (uint64_t)k[1] << (32 - s)) & LOWER8b_MASK;
> break;
> ...
>
> In that way, even for non-aligned you don't need do 4B reads.
> For x86, compiler would do it's optimisation work and strip off '>> s | (uint64_t)k[..] << (32 - s);'.
>
Actually, as Sergio pointed out, that approach might penalise the non-x86 4B-aligned case.
So a special path for s == 0 is probably still needed, i.e.:
if (s==0) {...; a += k[0]; ...} else {...; a += k[0] >> s | (uint64_t)k[1] << (32 - s);...}
Konstantin
> >
> > Re x86_x32, you are right, probably I need to include it. Although, I just realized that it is not used in any other place.
> > Wonder if we should include it somewhere else? E.g. rte_hash_crc.h
>
> Yep, that's true we are not doing it for hash_crc also...
> Would probably good to have some sort of ' RTE_ARCH_X86' - that would be defined for all x86 targets and use it whenever applicable.
> But I suppose, that's a subject for another patch.
>
> Konstantin
>
@@ -1,7 +1,7 @@
/*-
* BSD LICENSE
*
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -45,38 +45,68 @@ extern "C" {
#endif
#include <stdint.h>
+#include <string.h>
+#include <rte_byteorder.h>
/* jhash.h: Jenkins hash support.
*
- * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
+ * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
*
* http://burtleburtle.net/bob/hash/
*
* These are the credits from Bob's sources:
*
- * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
- * hash(), hash2(), hash3, and mix() are externally useful functions.
- * Routines to test the hash are included if SELF_TEST is defined.
- * You can use this free for any purpose. It has no warranty.
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * These are functions for producing 32-bit hashes for hash table lookup.
+ * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+ * are externally useful functions. Routines to test the hash are included
+ * if SELF_TEST is defined. You can use this free for any purpose. It's in
+ * the public domain. It has no warranty.
*
* $FreeBSD$
*/
+#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
+
/** @internal Internal function. NOTE: Arguments are modified. */
#define __rte_jhash_mix(a, b, c) do { \
- a -= b; a -= c; a ^= (c>>13); \
- b -= c; b -= a; b ^= (a<<8); \
- c -= a; c -= b; c ^= (b>>13); \
- a -= b; a -= c; a ^= (c>>12); \
- b -= c; b -= a; b ^= (a<<16); \
- c -= a; c -= b; c ^= (b>>5); \
- a -= b; a -= c; a ^= (c>>3); \
- b -= c; b -= a; b ^= (a<<10); \
- c -= a; c -= b; c ^= (b>>15); \
+ a -= c; a ^= rot(c, 4); c += b; \
+ b -= a; b ^= rot(a, 6); a += c; \
+ c -= b; c ^= rot(b, 8); b += a; \
+ a -= c; a ^= rot(c, 16); c += b; \
+ b -= a; b ^= rot(a, 19); a += c; \
+ c -= b; c ^= rot(b, 4); b += a; \
+} while (0)
+
+#define __rte_jhash_final(a, b, c) do { \
+ c ^= b; c -= rot(b, 14); \
+ a ^= c; a -= rot(c, 11); \
+ b ^= a; b -= rot(a, 25); \
+ c ^= b; c -= rot(b, 16); \
+ a ^= c; a -= rot(c, 4); \
+ b ^= a; b -= rot(a, 14); \
+ c ^= b; c -= rot(b, 24); \
} while (0)
/** The golden ratio: an arbitrary value. */
-#define RTE_JHASH_GOLDEN_RATIO 0x9e3779b9
+#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+#define RTE_JHASH_BYTE0_SHIFT 0
+#define RTE_JHASH_BYTE1_SHIFT 8
+#define RTE_JHASH_BYTE2_SHIFT 16
+#define RTE_JHASH_BYTE3_SHIFT 24
+#else
+#define RTE_JHASH_BYTE0_SHIFT 24
+#define RTE_JHASH_BYTE1_SHIFT 16
+#define RTE_JHASH_BYTE2_SHIFT 8
+#define RTE_JHASH_BYTE3_SHIFT 0
+#endif
+
+#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
+#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
+#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
/**
* The most generic version, hashes an arbitrary sequence
@@ -95,42 +125,119 @@ extern "C" {
static inline uint32_t
rte_jhash(const void *key, uint32_t length, uint32_t initval)
{
- uint32_t a, b, c, len;
- const uint8_t *k = (const uint8_t *)key;
- const uint32_t *k32 = (const uint32_t *)key;
+ uint32_t a, b, c;
+ union {
+ const void *ptr;
+ size_t i;
+ } u;
- len = length;
- a = b = RTE_JHASH_GOLDEN_RATIO;
- c = initval;
+ /* Set up the internal state */
+ a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
- while (len >= 12) {
- a += k32[0];
- b += k32[1];
- c += k32[2];
+ u.ptr = key;
- __rte_jhash_mix(a,b,c);
+ /* Check key alignment. For x86 architecture, first case is always optimal */
+ if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i & 0x3) == 0) {
+ const uint32_t *k = (const uint32_t *)key;
- k += (3 * sizeof(uint32_t)), k32 += 3;
- len -= (3 * sizeof(uint32_t));
- }
+ while (length > 12) {
+ a += k[0];
+ b += k[1];
+ c += k[2];
- c += length;
- switch (len) {
- case 11: c += ((uint32_t)k[10] << 24);
- case 10: c += ((uint32_t)k[9] << 16);
- case 9 : c += ((uint32_t)k[8] << 8);
- case 8 : b += ((uint32_t)k[7] << 24);
- case 7 : b += ((uint32_t)k[6] << 16);
- case 6 : b += ((uint32_t)k[5] << 8);
- case 5 : b += k[4];
- case 4 : a += ((uint32_t)k[3] << 24);
- case 3 : a += ((uint32_t)k[2] << 16);
- case 2 : a += ((uint32_t)k[1] << 8);
- case 1 : a += k[0];
- default: break;
- };
+ __rte_jhash_mix(a, b, c);
+
+ k += 3;
+ length -= 12;
+ }
+
+ switch (length) {
+ case 12:
+ c += k[2]; b += k[1]; a += k[0]; break;
+ case 11:
+ c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break;
+ case 10:
+ c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break;
+ case 9:
+ c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break;
+ case 8:
+ b += k[1]; a += k[0]; break;
+ case 7:
+ b += k[1] & LOWER24b_MASK; a += k[0]; break;
+ case 6:
+ b += k[1] & LOWER16b_MASK; a += k[0]; break;
+ case 5:
+ b += k[1] & LOWER8b_MASK; a += k[0]; break;
+ case 4:
+ a += k[0]; break;
+ case 3:
+ a += k[0] & LOWER24b_MASK; break;
+ case 2:
+ a += k[0] & LOWER16b_MASK; break;
+ case 1:
+ a += k[0] & LOWER8b_MASK; break;
+ /* zero length strings require no mixing */
+ case 0:
+ return c;
+ };
+ } else {
+ const uint8_t *k = (const uint8_t *)key;
+
+ /* all but the last block: affect some 32 bits of (a, b, c) */
+ while (length > 12) {
+ a += ((uint32_t)k[0]) << RTE_JHASH_BYTE0_SHIFT;
+ a += ((uint32_t)k[1]) << RTE_JHASH_BYTE1_SHIFT;
+ a += ((uint32_t)k[2]) << RTE_JHASH_BYTE2_SHIFT;
+ a += ((uint32_t)k[3]) << RTE_JHASH_BYTE3_SHIFT;
+ b += ((uint32_t)k[4]) << RTE_JHASH_BYTE0_SHIFT;
+ b += ((uint32_t)k[5]) << RTE_JHASH_BYTE1_SHIFT;
+ b += ((uint32_t)k[6]) << RTE_JHASH_BYTE2_SHIFT;
+ b += ((uint32_t)k[7]) << RTE_JHASH_BYTE3_SHIFT;
+ c += ((uint32_t)k[8]) << RTE_JHASH_BYTE0_SHIFT;
+ c += ((uint32_t)k[9]) << RTE_JHASH_BYTE1_SHIFT;
+ c += ((uint32_t)k[10]) << RTE_JHASH_BYTE2_SHIFT;
+ c += ((uint32_t)k[11]) << RTE_JHASH_BYTE3_SHIFT;
+
+ __rte_jhash_mix(a, b, c);
+
+ k += 12;
+ length -= 12;
+ }
+
+ /* last block: affect all 32 bits of (c) */
+ /* all the case statements fall through */
+ switch (length) {
+ case 12:
+ c += ((uint32_t)k[11]) << RTE_JHASH_BYTE3_SHIFT;
+ case 11:
+ c += ((uint32_t)k[10]) << RTE_JHASH_BYTE2_SHIFT;
+ case 10:
+ c += ((uint32_t)k[9]) << RTE_JHASH_BYTE1_SHIFT;
+ case 9:
+ c += ((uint32_t)k[8]) << RTE_JHASH_BYTE0_SHIFT;
+ case 8:
+ b += ((uint32_t)k[7]) << RTE_JHASH_BYTE3_SHIFT;
+ case 7:
+ b += ((uint32_t)k[6]) << RTE_JHASH_BYTE2_SHIFT;
+ case 6:
+ b += ((uint32_t)k[5]) << RTE_JHASH_BYTE1_SHIFT;
+ case 5:
+ b += ((uint32_t)k[4]) << RTE_JHASH_BYTE0_SHIFT;
+ case 4:
+ a += ((uint32_t)k[3]) << RTE_JHASH_BYTE3_SHIFT;
+ case 3:
+ a += ((uint32_t)k[2]) << RTE_JHASH_BYTE2_SHIFT;
+ case 2:
+ a += ((uint32_t)k[1]) << RTE_JHASH_BYTE1_SHIFT;
+ case 1:
+ a += ((uint32_t)k[0]) << RTE_JHASH_BYTE0_SHIFT;
+ break;
+ case 0:
+ return c;
+ }
+ }
- __rte_jhash_mix(a,b,c);
+ __rte_jhash_final(a, b, c);
return c;
}
@@ -151,33 +258,51 @@ rte_jhash(const void *key, uint32_t length, uint32_t initval)
static inline uint32_t
rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
{
- uint32_t a, b, c, len;
+ uint32_t a, b, c;
- a = b = RTE_JHASH_GOLDEN_RATIO;
- c = initval;
- len = length;
+ /* Set up the internal state */
+ a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length) << 2) + initval;
- while (len >= 3) {
+ /* Handle most of the key */
+ while (length > 3) {
a += k[0];
b += k[1];
c += k[2];
+
__rte_jhash_mix(a, b, c);
- k += 3; len -= 3;
- }
- c += length * 4;
+ k += 3;
+ length -= 3;
+ }
- switch (len) {
- case 2 : b += k[1];
- case 1 : a += k[0];
- default: break;
+ /* Handle the last 3 uint32_t's */
+ switch (length) {
+ case 3:
+ c += k[2];
+ case 2:
+ b += k[1];
+ case 1:
+ a += k[0];
+ __rte_jhash_final(a, b, c);
+ /* case 0: nothing left to add */
+ case 0:
+ break;
};
- __rte_jhash_mix(a,b,c);
-
return c;
}
+static inline uint32_t
+__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
+{
+ a += RTE_JHASH_GOLDEN_RATIO + initval;
+ b += RTE_JHASH_GOLDEN_RATIO + initval;
+ c += RTE_JHASH_GOLDEN_RATIO + initval;
+
+ __rte_jhash_final(a, b, c);
+
+ return c;
+}
/**
* A special ultra-optimized versions that knows it is hashing exactly
@@ -197,17 +322,7 @@ rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
static inline uint32_t
rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
{
- a += RTE_JHASH_GOLDEN_RATIO;
- b += RTE_JHASH_GOLDEN_RATIO;
- c += initval;
-
- __rte_jhash_mix(a, b, c);
-
- /*
- * NOTE: In particular the "c += length; __rte_jhash_mix(a,b,c);"
- * normally done at the end is not done here.
- */
- return c;
+ return __rte_jhash_3words(a + 12, b + 12, c + 12, initval);
}
/**
@@ -226,7 +341,7 @@ rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
static inline uint32_t
rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
{
- return rte_jhash_3words(a, b, 0, initval);
+ return __rte_jhash_3words(a + 8, b + 8, 8, initval);
}
/**
@@ -243,7 +358,7 @@ rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
static inline uint32_t
rte_jhash_1word(uint32_t a, uint32_t initval)
{
- return rte_jhash_3words(a, 0, 0, initval);
+ return __rte_jhash_3words(a + 4, 4, 4, initval);
}
#ifdef __cplusplus