[dpdk-dev,v3,3/6] hash: update jhash function with the latest available

Message ID 1430837034-21031-4-git-send-email-pablo.de.lara.guarch@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

De Lara Guarch, Pablo May 5, 2015, 2:43 p.m. UTC
  The Jenkins hash function was originally developed in 1996,
and was integrated into the first versions of DPDK.
The function was improved in 2006,
achieving up to 60% better performance compared to the original one.

This patch integrates that code into the rte_jhash library.

Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
---
 lib/librte_hash/rte_jhash.h |  261 +++++++++++++++++++++++++++++++------------
 1 files changed, 188 insertions(+), 73 deletions(-)
  

Comments

Ananyev, Konstantin May 6, 2015, 12:35 a.m. UTC | #1
Hi Pablo,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Pablo de Lara
> Sent: Tuesday, May 05, 2015 3:44 PM
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the latest available
> 
> Jenkins hash function was developed originally in 1996,
> and was integrated in first versions of DPDK.
> The function has been improved in 2006,
> achieving up to 60% better performance, compared to the original one.
> 
> This patch integrates that code into the rte_jhash library.
> 
> Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> ---
>  lib/librte_hash/rte_jhash.h |  261 +++++++++++++++++++++++++++++++------------
>  1 files changed, 188 insertions(+), 73 deletions(-)
> 
> diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> index a4bf5a1..0e96b7c 100644
> --- a/lib/librte_hash/rte_jhash.h
> +++ b/lib/librte_hash/rte_jhash.h
> @@ -1,7 +1,7 @@
>  /*-
>   *   BSD LICENSE
>   *
> - *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
>   *   All rights reserved.
>   *
>   *   Redistribution and use in source and binary forms, with or without
> @@ -45,38 +45,68 @@ extern "C" {
>  #endif
> 
>  #include <stdint.h>
> +#include <string.h>
> +#include <rte_byteorder.h>
> 
>  /* jhash.h: Jenkins hash support.
>   *
> - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
>   *
>   * http://burtleburtle.net/bob/hash/
>   *
>   * These are the credits from Bob's sources:
>   *
> - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> - * hash(), hash2(), hash3, and mix() are externally useful functions.
> - * Routines to test the hash are included if SELF_TEST is defined.
> - * You can use this free for any purpose.  It has no warranty.
> + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> + *
> + * These are functions for producing 32-bit hashes for hash table lookup.
> + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> + * are externally useful functions.  Routines to test the hash are included
> + * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
> + * the public domain.  It has no warranty.
>   *
>   * $FreeBSD$
>   */
> 
> +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
> +
>  /** @internal Internal function. NOTE: Arguments are modified. */
>  #define __rte_jhash_mix(a, b, c) do { \
> -	a -= b; a -= c; a ^= (c>>13); \
> -	b -= c; b -= a; b ^= (a<<8); \
> -	c -= a; c -= b; c ^= (b>>13); \
> -	a -= b; a -= c; a ^= (c>>12); \
> -	b -= c; b -= a; b ^= (a<<16); \
> -	c -= a; c -= b; c ^= (b>>5); \
> -	a -= b; a -= c; a ^= (c>>3); \
> -	b -= c; b -= a; b ^= (a<<10); \
> -	c -= a; c -= b; c ^= (b>>15); \
> +	a -= c; a ^= rot(c, 4); c += b; \
> +	b -= a; b ^= rot(a, 6); a += c; \
> +	c -= b; c ^= rot(b, 8); b += a; \
> +	a -= c; a ^= rot(c, 16); c += b; \
> +	b -= a; b ^= rot(a, 19); a += c; \
> +	c -= b; c ^= rot(b, 4); b += a; \
> +} while (0)
> +
> +#define __rte_jhash_final(a, b, c) do { \
> +	c ^= b; c -= rot(b, 14); \
> +	a ^= c; a -= rot(c, 11); \
> +	b ^= a; b -= rot(a, 25); \
> +	c ^= b; c -= rot(b, 16); \
> +	a ^= c; a -= rot(c, 4);  \
> +	b ^= a; b -= rot(a, 14); \
> +	c ^= b; c -= rot(b, 24); \
>  } while (0)
> 
>  /** The golden ratio: an arbitrary value. */
> -#define RTE_JHASH_GOLDEN_RATIO      0x9e3779b9
> +#define RTE_JHASH_GOLDEN_RATIO      0xdeadbeef
> +
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +#define RTE_JHASH_BYTE0_SHIFT 0
> +#define RTE_JHASH_BYTE1_SHIFT 8
> +#define RTE_JHASH_BYTE2_SHIFT 16
> +#define RTE_JHASH_BYTE3_SHIFT 24
> +#else
> +#define RTE_JHASH_BYTE0_SHIFT 24
> +#define RTE_JHASH_BYTE1_SHIFT 16
> +#define RTE_JHASH_BYTE2_SHIFT 8
> +#define RTE_JHASH_BYTE3_SHIFT 0
> +#endif
> +
> +#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
> +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
> +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
> 
>  /**
>   * The most generic version, hashes an arbitrary sequence
> @@ -95,42 +125,119 @@ extern "C" {
>  static inline uint32_t
>  rte_jhash(const void *key, uint32_t length, uint32_t initval)
>  {
> -	uint32_t a, b, c, len;
> -	const uint8_t *k = (const uint8_t *)key;
> -	const uint32_t *k32 = (const uint32_t *)key;
> +	uint32_t a, b, c;
> +	union {
> +		const void *ptr;
> +		size_t i;
> +	} u;
> 
> -	len = length;
> -	a = b = RTE_JHASH_GOLDEN_RATIO;
> -	c = initval;
> +	/* Set up the internal state */
> +	a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
> 
> -	while (len >= 12) {
> -		a += k32[0];
> -		b += k32[1];
> -		c += k32[2];
> +	u.ptr = key;
> 
> -		__rte_jhash_mix(a,b,c);
> +	/* Check key alignment. For x86 architecture, first case is always optimal */
> +	if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i & 0x3) == 0) {

I wonder why strcmp() is used here; why not something like 'if defined(RTE_ARCH_I686) || defined(RTE_ARCH_X86_64)', as in all other places?
Another question: what would happen in the case of RTE_ARCH="x86_x32"?
Konstantin

> +		const uint32_t *k = (const uint32_t *)key;
> 
> -		k += (3 * sizeof(uint32_t)), k32 += 3;
> -		len -= (3 * sizeof(uint32_t));
> -	}
> +		while (length > 12) {
> +			a += k[0];
> +			b += k[1];
> +			c += k[2];
> 
> -	c += length;
> -	switch (len) {
> -		case 11: c += ((uint32_t)k[10] << 24);
> -		case 10: c += ((uint32_t)k[9] << 16);
> -		case 9 : c += ((uint32_t)k[8] << 8);
> -		case 8 : b += ((uint32_t)k[7] << 24);
> -		case 7 : b += ((uint32_t)k[6] << 16);
> -		case 6 : b += ((uint32_t)k[5] << 8);
> -		case 5 : b += k[4];
> -		case 4 : a += ((uint32_t)k[3] << 24);
> -		case 3 : a += ((uint32_t)k[2] << 16);
> -		case 2 : a += ((uint32_t)k[1] << 8);
> -		case 1 : a += k[0];
> -		default: break;
> -	};
> +			__rte_jhash_mix(a, b, c);
> +
> +			k += 3;
> +			length -= 12;
> +		}
> +
> +		switch (length) {
> +		case 12:
> +			c += k[2]; b += k[1]; a += k[0]; break;
> +		case 11:
> +			c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break;
> +		case 10:
> +			c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break;
> +		case 9:
> +			c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break;
> +		case 8:
> +			b += k[1]; a += k[0]; break;
> +		case 7:
> +			b += k[1] & LOWER24b_MASK; a += k[0]; break;
> +		case 6:
> +			b += k[1] & LOWER16b_MASK; a += k[0]; break;
> +		case 5:
> +			b += k[1] & LOWER8b_MASK; a += k[0]; break;
> +		case 4:
> +			a += k[0]; break;
> +		case 3:
> +			a += k[0] & LOWER24b_MASK; break;
> +		case 2:
> +			a += k[0] & LOWER16b_MASK; break;
> +		case 1:
> +			a += k[0] & LOWER8b_MASK; break;
> +		/* zero length strings require no mixing */
> +		case 0:
> +			return c;
> +		};
> +	} else {
> +		const uint8_t *k = (const uint8_t *)key;
> +
> +		/* all but the last block: affect some 32 bits of (a, b, c) */
> +		while (length > 12) {
> +			a += ((uint32_t)k[0]) << RTE_JHASH_BYTE0_SHIFT;
> +			a += ((uint32_t)k[1]) << RTE_JHASH_BYTE1_SHIFT;
> +			a += ((uint32_t)k[2]) << RTE_JHASH_BYTE2_SHIFT;
> +			a += ((uint32_t)k[3]) << RTE_JHASH_BYTE3_SHIFT;
> +			b += ((uint32_t)k[4]) << RTE_JHASH_BYTE0_SHIFT;
> +			b += ((uint32_t)k[5]) << RTE_JHASH_BYTE1_SHIFT;
> +			b += ((uint32_t)k[6]) << RTE_JHASH_BYTE2_SHIFT;
> +			b += ((uint32_t)k[7]) << RTE_JHASH_BYTE3_SHIFT;
> +			c += ((uint32_t)k[8]) << RTE_JHASH_BYTE0_SHIFT;
> +			c += ((uint32_t)k[9]) << RTE_JHASH_BYTE1_SHIFT;
> +			c += ((uint32_t)k[10]) << RTE_JHASH_BYTE2_SHIFT;
> +			c += ((uint32_t)k[11]) << RTE_JHASH_BYTE3_SHIFT;
> +
> +			__rte_jhash_mix(a, b, c);
> +
> +			k += 12;
> +			length -= 12;
> +		}
> +
> +		/* last block: affect all 32 bits of (c) */
> +		/* all the case statements fall through */
> +		switch (length) {
> +		case 12:
> +			c += ((uint32_t)k[11]) << RTE_JHASH_BYTE3_SHIFT;
> +		case 11:
> +			c += ((uint32_t)k[10]) << RTE_JHASH_BYTE2_SHIFT;
> +		case 10:
> +			c += ((uint32_t)k[9]) << RTE_JHASH_BYTE1_SHIFT;
> +		case 9:
> +			c += ((uint32_t)k[8]) << RTE_JHASH_BYTE0_SHIFT;
> +		case 8:
> +			b += ((uint32_t)k[7]) << RTE_JHASH_BYTE3_SHIFT;
> +		case 7:
> +			b += ((uint32_t)k[6]) << RTE_JHASH_BYTE2_SHIFT;
> +		case 6:
> +			b += ((uint32_t)k[5]) << RTE_JHASH_BYTE1_SHIFT;
> +		case 5:
> +			b += ((uint32_t)k[4]) << RTE_JHASH_BYTE0_SHIFT;
> +		case 4:
> +			a += ((uint32_t)k[3]) << RTE_JHASH_BYTE3_SHIFT;
> +		case 3:
> +			a += ((uint32_t)k[2]) << RTE_JHASH_BYTE2_SHIFT;
> +		case 2:
> +			a += ((uint32_t)k[1]) << RTE_JHASH_BYTE1_SHIFT;
> +		case 1:
> +			a += ((uint32_t)k[0]) << RTE_JHASH_BYTE0_SHIFT;
> +		break;
> +		case 0:
> +			return c;
> +		}
> +	}
> 
> -	__rte_jhash_mix(a,b,c);
> +	__rte_jhash_final(a, b, c);
> 
>  	return c;
>  }
> @@ -151,33 +258,51 @@ rte_jhash(const void *key, uint32_t length, uint32_t initval)
>  static inline uint32_t
>  rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
>  {
> -	uint32_t a, b, c, len;
> +	uint32_t a, b, c;
> 
> -	a = b = RTE_JHASH_GOLDEN_RATIO;
> -	c = initval;
> -	len = length;
> +	/* Set up the internal state */
> +	a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length) << 2) + initval;
> 
> -	while (len >= 3) {
> +	/* Handle most of the key */
> +	while (length > 3) {
>  		a += k[0];
>  		b += k[1];
>  		c += k[2];
> +
>  		__rte_jhash_mix(a, b, c);
> -		k += 3; len -= 3;
> -	}
> 
> -	c += length * 4;
> +		k += 3;
> +		length -= 3;
> +	}
> 
> -	switch (len) {
> -		case 2 : b += k[1];
> -		case 1 : a += k[0];
> -		default: break;
> +	/* Handle the last 3 uint32_t's */
> +	switch (length) {
> +	case 3:
> +		c += k[2];
> +	case 2:
> +		b += k[1];
> +	case 1:
> +		a += k[0];
> +		__rte_jhash_final(a, b, c);
> +	/* case 0: nothing left to add */
> +	case 0:
> +		break;
>  	};
> 
> -	__rte_jhash_mix(a,b,c);
> -
>  	return c;
>  }
> 
> +static inline uint32_t
> +__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
> +{
> +	a += RTE_JHASH_GOLDEN_RATIO + initval;
> +	b += RTE_JHASH_GOLDEN_RATIO + initval;
> +	c += RTE_JHASH_GOLDEN_RATIO + initval;
> +
> +	__rte_jhash_final(a, b, c);
> +
> +	return c;
> +}
> 
>  /**
>   * A special ultra-optimized versions that knows it is hashing exactly
> @@ -197,17 +322,7 @@ rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
>  static inline uint32_t
>  rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
>  {
> -	a += RTE_JHASH_GOLDEN_RATIO;
> -	b += RTE_JHASH_GOLDEN_RATIO;
> -	c += initval;
> -
> -	__rte_jhash_mix(a, b, c);
> -
> -	/*
> -	 * NOTE: In particular the "c += length; __rte_jhash_mix(a,b,c);"
> -	 *       normally done at the end is not done here.
> -	 */
> -	return c;
> +	return __rte_jhash_3words(a + 12, b + 12, c + 12, initval);
>  }
> 
>  /**
> @@ -226,7 +341,7 @@ rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
>  static inline uint32_t
>  rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
>  {
> -	return rte_jhash_3words(a, b, 0, initval);
> +	return __rte_jhash_3words(a + 8, b + 8, 8, initval);
>  }
> 
>  /**
> @@ -243,7 +358,7 @@ rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
>  static inline uint32_t
>  rte_jhash_1word(uint32_t a, uint32_t initval)
>  {
> -	return rte_jhash_3words(a, 0, 0, initval);
> +	return __rte_jhash_3words(a + 4, 4, 4, initval);
>  }
> 
>  #ifdef __cplusplus
> --
> 1.7.4.1
  
De Lara Guarch, Pablo May 6, 2015, 9:36 a.m. UTC | #2
Hi Konstantin,

> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Wednesday, May 06, 2015 1:36 AM
> To: De Lara Guarch, Pablo; dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> latest available
> 
> 
> Hi Pablo,
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Pablo de Lara
> > Sent: Tuesday, May 05, 2015 3:44 PM
> > To: dev@dpdk.org
> > Subject: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> latest available
> >
> > Jenkins hash function was developed originally in 1996,
> > and was integrated in first versions of DPDK.
> > The function has been improved in 2006,
> > achieving up to 60% better performance, compared to the original one.
> >
> > This patch integrates that code into the rte_jhash library.
> >
> > Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> > ---
> >  lib/librte_hash/rte_jhash.h |  261
> +++++++++++++++++++++++++++++++------------
> >  1 files changed, 188 insertions(+), 73 deletions(-)
> >
> > diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> > index a4bf5a1..0e96b7c 100644
> > --- a/lib/librte_hash/rte_jhash.h
> > +++ b/lib/librte_hash/rte_jhash.h
> > @@ -1,7 +1,7 @@
> >  /*-
> >   *   BSD LICENSE
> >   *
> > - *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> > + *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> >   *   All rights reserved.
> >   *
> >   *   Redistribution and use in source and binary forms, with or without
> > @@ -45,38 +45,68 @@ extern "C" {
> >  #endif
> >
> >  #include <stdint.h>
> > +#include <string.h>
> > +#include <rte_byteorder.h>
> >
> >  /* jhash.h: Jenkins hash support.
> >   *
> > - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
> >   *
> >   * http://burtleburtle.net/bob/hash/
> >   *
> >   * These are the credits from Bob's sources:
> >   *
> > - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> > - * hash(), hash2(), hash3, and mix() are externally useful functions.
> > - * Routines to test the hash are included if SELF_TEST is defined.
> > - * You can use this free for any purpose.  It has no warranty.
> > + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> > + *
> > + * These are functions for producing 32-bit hashes for hash table lookup.
> > + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> > + * are externally useful functions.  Routines to test the hash are included
> > + * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
> > + * the public domain.  It has no warranty.
> >   *
> >   * $FreeBSD$
> >   */
> >
> > +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
> > +
> >  /** @internal Internal function. NOTE: Arguments are modified. */
> >  #define __rte_jhash_mix(a, b, c) do { \
> > -	a -= b; a -= c; a ^= (c>>13); \
> > -	b -= c; b -= a; b ^= (a<<8); \
> > -	c -= a; c -= b; c ^= (b>>13); \
> > -	a -= b; a -= c; a ^= (c>>12); \
> > -	b -= c; b -= a; b ^= (a<<16); \
> > -	c -= a; c -= b; c ^= (b>>5); \
> > -	a -= b; a -= c; a ^= (c>>3); \
> > -	b -= c; b -= a; b ^= (a<<10); \
> > -	c -= a; c -= b; c ^= (b>>15); \
> > +	a -= c; a ^= rot(c, 4); c += b; \
> > +	b -= a; b ^= rot(a, 6); a += c; \
> > +	c -= b; c ^= rot(b, 8); b += a; \
> > +	a -= c; a ^= rot(c, 16); c += b; \
> > +	b -= a; b ^= rot(a, 19); a += c; \
> > +	c -= b; c ^= rot(b, 4); b += a; \
> > +} while (0)
> > +
> > +#define __rte_jhash_final(a, b, c) do { \
> > +	c ^= b; c -= rot(b, 14); \
> > +	a ^= c; a -= rot(c, 11); \
> > +	b ^= a; b -= rot(a, 25); \
> > +	c ^= b; c -= rot(b, 16); \
> > +	a ^= c; a -= rot(c, 4);  \
> > +	b ^= a; b -= rot(a, 14); \
> > +	c ^= b; c -= rot(b, 24); \
> >  } while (0)
> >
> >  /** The golden ratio: an arbitrary value. */
> > -#define RTE_JHASH_GOLDEN_RATIO      0x9e3779b9
> > +#define RTE_JHASH_GOLDEN_RATIO      0xdeadbeef
> > +
> > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > +#define RTE_JHASH_BYTE0_SHIFT 0
> > +#define RTE_JHASH_BYTE1_SHIFT 8
> > +#define RTE_JHASH_BYTE2_SHIFT 16
> > +#define RTE_JHASH_BYTE3_SHIFT 24
> > +#else
> > +#define RTE_JHASH_BYTE0_SHIFT 24
> > +#define RTE_JHASH_BYTE1_SHIFT 16
> > +#define RTE_JHASH_BYTE2_SHIFT 8
> > +#define RTE_JHASH_BYTE3_SHIFT 0
> > +#endif
> > +
> > +#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
> > +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
> > +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
> >
> >  /**
> >   * The most generic version, hashes an arbitrary sequence
> > @@ -95,42 +125,119 @@ extern "C" {
> >  static inline uint32_t
> >  rte_jhash(const void *key, uint32_t length, uint32_t initval)
> >  {
> > -	uint32_t a, b, c, len;
> > -	const uint8_t *k = (const uint8_t *)key;
> > -	const uint32_t *k32 = (const uint32_t *)key;
> > +	uint32_t a, b, c;
> > +	union {
> > +		const void *ptr;
> > +		size_t i;
> > +	} u;
> >
> > -	len = length;
> > -	a = b = RTE_JHASH_GOLDEN_RATIO;
> > -	c = initval;
> > +	/* Set up the internal state */
> > +	a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
> >
> > -	while (len >= 12) {
> > -		a += k32[0];
> > -		b += k32[1];
> > -		c += k32[2];
> > +	u.ptr = key;
> >
> > -		__rte_jhash_mix(a,b,c);
> > +	/* Check key alignment. For x86 architecture, first case is always
> optimal */
> > +	if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i
> & 0x3) == 0) {
> 
> Wonder why strcmp(), why not something like: 'if defined(RTE_ARCH_I686)
> || defined(RTE_ARCH_X86_64)' as in all other places?
> Another question what would be in case of RTE_ARCH="x86_x32"?
> Konstantin

Functionally it is the same, and using this method I can integrate all the conditions in one line, so it takes less code.
I also checked the assembly code, and the compiler removes the check on Intel architecture, so performance remains the same.

Re x86_x32, you are right, I probably need to include it. However, I just realized that it is not used in any other place.
I wonder if we should include it somewhere else, e.g. in rte_hash_crc.h?
  
Ananyev, Konstantin May 6, 2015, 4:11 p.m. UTC | #3
Hi Pablo,

> -----Original Message-----
> From: De Lara Guarch, Pablo
> Sent: Wednesday, May 06, 2015 10:36 AM
> To: Ananyev, Konstantin; dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the latest available
> 
> Hi Konstantin,
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Wednesday, May 06, 2015 1:36 AM
> > To: De Lara Guarch, Pablo; dev@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> > latest available
> >
> >
> > Hi Pablo,
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Pablo de Lara
> > > Sent: Tuesday, May 05, 2015 3:44 PM
> > > To: dev@dpdk.org
> > > Subject: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> > latest available
> > >
> > > Jenkins hash function was developed originally in 1996,
> > > and was integrated in first versions of DPDK.
> > > The function has been improved in 2006,
> > > achieving up to 60% better performance, compared to the original one.
> > >
> > > This patch integrates that code into the rte_jhash library.
> > >
> > > Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> > > ---
> > >  lib/librte_hash/rte_jhash.h |  261
> > +++++++++++++++++++++++++++++++------------
> > >  1 files changed, 188 insertions(+), 73 deletions(-)
> > >
> > > diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> > > index a4bf5a1..0e96b7c 100644
> > > --- a/lib/librte_hash/rte_jhash.h
> > > +++ b/lib/librte_hash/rte_jhash.h
> > > @@ -1,7 +1,7 @@
> > >  /*-
> > >   *   BSD LICENSE
> > >   *
> > > - *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> > > + *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> > >   *   All rights reserved.
> > >   *
> > >   *   Redistribution and use in source and binary forms, with or without
> > > @@ -45,38 +45,68 @@ extern "C" {
> > >  #endif
> > >
> > >  #include <stdint.h>
> > > +#include <string.h>
> > > +#include <rte_byteorder.h>
> > >
> > >  /* jhash.h: Jenkins hash support.
> > >   *
> > > - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > > + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > >   *
> > >   * http://burtleburtle.net/bob/hash/
> > >   *
> > >   * These are the credits from Bob's sources:
> > >   *
> > > - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> > > - * hash(), hash2(), hash3, and mix() are externally useful functions.
> > > - * Routines to test the hash are included if SELF_TEST is defined.
> > > - * You can use this free for any purpose.  It has no warranty.
> > > + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> > > + *
> > > + * These are functions for producing 32-bit hashes for hash table lookup.
> > > + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> > > + * are externally useful functions.  Routines to test the hash are included
> > > + * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
> > > + * the public domain.  It has no warranty.
> > >   *
> > >   * $FreeBSD$
> > >   */
> > >
> > > +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
> > > +
> > >  /** @internal Internal function. NOTE: Arguments are modified. */
> > >  #define __rte_jhash_mix(a, b, c) do { \
> > > -	a -= b; a -= c; a ^= (c>>13); \
> > > -	b -= c; b -= a; b ^= (a<<8); \
> > > -	c -= a; c -= b; c ^= (b>>13); \
> > > -	a -= b; a -= c; a ^= (c>>12); \
> > > -	b -= c; b -= a; b ^= (a<<16); \
> > > -	c -= a; c -= b; c ^= (b>>5); \
> > > -	a -= b; a -= c; a ^= (c>>3); \
> > > -	b -= c; b -= a; b ^= (a<<10); \
> > > -	c -= a; c -= b; c ^= (b>>15); \
> > > +	a -= c; a ^= rot(c, 4); c += b; \
> > > +	b -= a; b ^= rot(a, 6); a += c; \
> > > +	c -= b; c ^= rot(b, 8); b += a; \
> > > +	a -= c; a ^= rot(c, 16); c += b; \
> > > +	b -= a; b ^= rot(a, 19); a += c; \
> > > +	c -= b; c ^= rot(b, 4); b += a; \
> > > +} while (0)
> > > +
> > > +#define __rte_jhash_final(a, b, c) do { \
> > > +	c ^= b; c -= rot(b, 14); \
> > > +	a ^= c; a -= rot(c, 11); \
> > > +	b ^= a; b -= rot(a, 25); \
> > > +	c ^= b; c -= rot(b, 16); \
> > > +	a ^= c; a -= rot(c, 4);  \
> > > +	b ^= a; b -= rot(a, 14); \
> > > +	c ^= b; c -= rot(b, 24); \
> > >  } while (0)
> > >
> > >  /** The golden ratio: an arbitrary value. */
> > > -#define RTE_JHASH_GOLDEN_RATIO      0x9e3779b9
> > > +#define RTE_JHASH_GOLDEN_RATIO      0xdeadbeef
> > > +
> > > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > > +#define RTE_JHASH_BYTE0_SHIFT 0
> > > +#define RTE_JHASH_BYTE1_SHIFT 8
> > > +#define RTE_JHASH_BYTE2_SHIFT 16
> > > +#define RTE_JHASH_BYTE3_SHIFT 24
> > > +#else
> > > +#define RTE_JHASH_BYTE0_SHIFT 24
> > > +#define RTE_JHASH_BYTE1_SHIFT 16
> > > +#define RTE_JHASH_BYTE2_SHIFT 8
> > > +#define RTE_JHASH_BYTE3_SHIFT 0
> > > +#endif
> > > +
> > > +#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
> > > +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
> > > +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
> > >
> > >  /**
> > >   * The most generic version, hashes an arbitrary sequence
> > > @@ -95,42 +125,119 @@ extern "C" {
> > >  static inline uint32_t
> > >  rte_jhash(const void *key, uint32_t length, uint32_t initval)
> > >  {
> > > -	uint32_t a, b, c, len;
> > > -	const uint8_t *k = (const uint8_t *)key;
> > > -	const uint32_t *k32 = (const uint32_t *)key;
> > > +	uint32_t a, b, c;
> > > +	union {
> > > +		const void *ptr;
> > > +		size_t i;
> > > +	} u;
> > >
> > > -	len = length;
> > > -	a = b = RTE_JHASH_GOLDEN_RATIO;
> > > -	c = initval;
> > > +	/* Set up the internal state */
> > > +	a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
> > >
> > > -	while (len >= 12) {
> > > -		a += k32[0];
> > > -		b += k32[1];
> > > -		c += k32[2];
> > > +	u.ptr = key;
> > >
> > > -		__rte_jhash_mix(a,b,c);
> > > +	/* Check key alignment. For x86 architecture, first case is always
> > optimal */
> > > +	if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i
> > & 0x3) == 0) {
> >
> > Wonder why strcmp(), why not something like: 'if defined(RTE_ARCH_I686)
> > || defined(RTE_ARCH_X86_64)' as in all other places?
> > Another question what would be in case of RTE_ARCH="x86_x32"?
> > Konstantin
> 
> Functionally is the same and using this method, I can integrate all conditions in one line, so it takes less code.
> I also checked the assembly code, and the compiler removes the check if it is Intel architecture, so performance remains the same.

Well, yes, I think most modern compilers treat strcmp() as a builtin function and are able to optimise these strcmp() calls away in that case.
But we probably can't guarantee that this will always be the case for all the different compiler/libc combinations.
Also, for some reason a user might need to use the '-fno-builtin' flag while building their code.
So I would use pre-processor macros here; it is more predictable.
Also, that way it is consistent with other places.
 
Actually, I wonder whether you really need this sort of split between the aligned and non-aligned cases.
Wouldn't something like this work for you:

#ifndef RTE_ARCH_X86
        const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3);
        const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT;
#else /*X86*/
        const uint32_t *k = key;
        const uint32_t s = 0;
#endif

  while (length > 12) {
                a += k[0] >> s | (uint64_t)k[1] << (32 - s);
                b += k[1] >> s | (uint64_t)k[2] << (32 - s);
                c += k[2] >> s | (uint64_t)k[3] << (32 - s);
                k += 3;
                length -= 12;
}

switch (length) {
case 12:
    a += k[0] >> s | (uint64_t)k[1] << (32 - s);
    b += k[1] >> s | (uint64_t)k[2] << (32 - s);
    c += k[2] >> s | (uint64_t)k[3] << (32 - s);
    break;
case 11:
    a += k[0] >> s | (uint64_t)k[1] << (32 - s);
    b += k[1] >> s | (uint64_t)k[2] << (32 - s);
    c += (k[2] >> s | (uint64_t)k[3] << (32 - s)) & LOWER24b_MASK;
    break;
...
case 1:
   a += (k[0] >> s | (uint64_t)k[1] << (32 - s)) & LOWER8b_MASK;
   break;
...

In that way, even for non-aligned you don't need do 4B reads.
For x86, the compiler would do its optimisation work and strip off '>> s | (uint64_t)k[..] << (32 - s);'.

> 
> Re x86_x32, you are right, probably I need to include it. Although, I just realized that it is not used in any other place.
> Wonder if we should include it somewhere else? E.g. rte_hash_crc.h

Yep, that's true, we are not doing it for hash_crc either...
It would probably be good to have some sort of 'RTE_ARCH_X86' that would be defined for all x86 targets, and use it wherever applicable.
But I suppose that's a subject for another patch.

Konstantin
  
Ananyev, Konstantin May 7, 2015, 11:11 a.m. UTC | #4
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Wednesday, May 06, 2015 5:11 PM
> To: De Lara Guarch, Pablo; dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the latest available
> 
> Hi Pablo,
> 
> > -----Original Message-----
> > From: De Lara Guarch, Pablo
> > Sent: Wednesday, May 06, 2015 10:36 AM
> > To: Ananyev, Konstantin; dev@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the latest available
> >
> > Hi Konstantin,
> >
> > > -----Original Message-----
> > > From: Ananyev, Konstantin
> > > Sent: Wednesday, May 06, 2015 1:36 AM
> > > To: De Lara Guarch, Pablo; dev@dpdk.org
> > > Subject: RE: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> > > latest available
> > >
> > >
> > > Hi Pablo,
> > >
> > > > -----Original Message-----
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Pablo de Lara
> > > > Sent: Tuesday, May 05, 2015 3:44 PM
> > > > To: dev@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH v3 3/6] hash: update jhash function with the
> > > latest available
> > > >
> > > > Jenkins hash function was developed originally in 1996,
> > > > and was integrated in first versions of DPDK.
> > > > The function has been improved in 2006,
> > > > achieving up to 60% better performance, compared to the original one.
> > > >
> > > > This patch integrates that code into the rte_jhash library.
> > > >
> > > > Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
> > > > ---
> > > >  lib/librte_hash/rte_jhash.h |  261
> > > +++++++++++++++++++++++++++++++------------
> > > >  1 files changed, 188 insertions(+), 73 deletions(-)
> > > >
> > > > diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> > > > index a4bf5a1..0e96b7c 100644
> > > > --- a/lib/librte_hash/rte_jhash.h
> > > > +++ b/lib/librte_hash/rte_jhash.h
> > > > @@ -1,7 +1,7 @@
> > > >  /*-
> > > >   *   BSD LICENSE
> > > >   *
> > > > - *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> > > > + *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
> > > >   *   All rights reserved.
> > > >   *
> > > >   *   Redistribution and use in source and binary forms, with or without
> > > > @@ -45,38 +45,68 @@ extern "C" {
> > > >  #endif
> > > >
> > > >  #include <stdint.h>
> > > > +#include <string.h>
> > > > +#include <rte_byteorder.h>
> > > >
> > > >  /* jhash.h: Jenkins hash support.
> > > >   *
> > > > - * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > > > + * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
> > > >   *
> > > >   * http://burtleburtle.net/bob/hash/
> > > >   *
> > > >   * These are the credits from Bob's sources:
> > > >   *
> > > > - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> > > > - * hash(), hash2(), hash3, and mix() are externally useful functions.
> > > > - * Routines to test the hash are included if SELF_TEST is defined.
> > > > - * You can use this free for any purpose.  It has no warranty.
> > > > + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> > > > + *
> > > > + * These are functions for producing 32-bit hashes for hash table lookup.
> > > > + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> > > > + * are externally useful functions.  Routines to test the hash are included
> > > > + * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
> > > > + * the public domain.  It has no warranty.
> > > >   *
> > > >   * $FreeBSD$
> > > >   */
> > > >
> > > > +#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
> > > > +
> > > >  /** @internal Internal function. NOTE: Arguments are modified. */
> > > >  #define __rte_jhash_mix(a, b, c) do { \
> > > > -	a -= b; a -= c; a ^= (c>>13); \
> > > > -	b -= c; b -= a; b ^= (a<<8); \
> > > > -	c -= a; c -= b; c ^= (b>>13); \
> > > > -	a -= b; a -= c; a ^= (c>>12); \
> > > > -	b -= c; b -= a; b ^= (a<<16); \
> > > > -	c -= a; c -= b; c ^= (b>>5); \
> > > > -	a -= b; a -= c; a ^= (c>>3); \
> > > > -	b -= c; b -= a; b ^= (a<<10); \
> > > > -	c -= a; c -= b; c ^= (b>>15); \
> > > > +	a -= c; a ^= rot(c, 4); c += b; \
> > > > +	b -= a; b ^= rot(a, 6); a += c; \
> > > > +	c -= b; c ^= rot(b, 8); b += a; \
> > > > +	a -= c; a ^= rot(c, 16); c += b; \
> > > > +	b -= a; b ^= rot(a, 19); a += c; \
> > > > +	c -= b; c ^= rot(b, 4); b += a; \
> > > > +} while (0)
> > > > +
> > > > +#define __rte_jhash_final(a, b, c) do { \
> > > > +	c ^= b; c -= rot(b, 14); \
> > > > +	a ^= c; a -= rot(c, 11); \
> > > > +	b ^= a; b -= rot(a, 25); \
> > > > +	c ^= b; c -= rot(b, 16); \
> > > > +	a ^= c; a -= rot(c, 4);  \
> > > > +	b ^= a; b -= rot(a, 14); \
> > > > +	c ^= b; c -= rot(b, 24); \
> > > >  } while (0)
> > > >
> > > >  /** The golden ratio: an arbitrary value. */
> > > > -#define RTE_JHASH_GOLDEN_RATIO      0x9e3779b9
> > > > +#define RTE_JHASH_GOLDEN_RATIO      0xdeadbeef
> > > > +
> > > > +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> > > > +#define RTE_JHASH_BYTE0_SHIFT 0
> > > > +#define RTE_JHASH_BYTE1_SHIFT 8
> > > > +#define RTE_JHASH_BYTE2_SHIFT 16
> > > > +#define RTE_JHASH_BYTE3_SHIFT 24
> > > > +#else
> > > > +#define RTE_JHASH_BYTE0_SHIFT 24
> > > > +#define RTE_JHASH_BYTE1_SHIFT 16
> > > > +#define RTE_JHASH_BYTE2_SHIFT 8
> > > > +#define RTE_JHASH_BYTE3_SHIFT 0
> > > > +#endif
> > > > +
> > > > +#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
> > > > +#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
> > > > +#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
> > > >
> > > >  /**
> > > >   * The most generic version, hashes an arbitrary sequence
> > > > @@ -95,42 +125,119 @@ extern "C" {
> > > >  static inline uint32_t
> > > >  rte_jhash(const void *key, uint32_t length, uint32_t initval)
> > > >  {
> > > > -	uint32_t a, b, c, len;
> > > > -	const uint8_t *k = (const uint8_t *)key;
> > > > -	const uint32_t *k32 = (const uint32_t *)key;
> > > > +	uint32_t a, b, c;
> > > > +	union {
> > > > +		const void *ptr;
> > > > +		size_t i;
> > > > +	} u;
> > > >
> > > > -	len = length;
> > > > -	a = b = RTE_JHASH_GOLDEN_RATIO;
> > > > -	c = initval;
> > > > +	/* Set up the internal state */
> > > > +	a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
> > > >
> > > > -	while (len >= 12) {
> > > > -		a += k32[0];
> > > > -		b += k32[1];
> > > > -		c += k32[2];
> > > > +	u.ptr = key;
> > > >
> > > > -		__rte_jhash_mix(a,b,c);
> > > > +	/* Check key alignment. For x86 architecture, first case is always
> > > optimal */
> > > > +	if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i
> > > & 0x3) == 0) {
> > >
> > > I wonder why strcmp() is used here; why not something like: 'if defined(RTE_ARCH_I686)
> > > || defined(RTE_ARCH_X86_64)', as in all other places?
> > > Another question: what would happen in the case of RTE_ARCH="x86_x32"?
> > > Konstantin
> >
> > Functionally it is the same, and using this method I can integrate all conditions in one line, so it takes less code.
> > I also checked the assembly code, and the compiler removes the check if it is Intel architecture, so performance remains the same.
> 
> Well, yes, I think most modern compilers treat strcmp() as a builtin function and are able to optimise these strcmp() calls away in that
> case.
> But we probably can't guarantee that this would always be the case for all the different compiler/libc combinations.
> Again, for some reason a user might need to use the '-fno-builtin' flag while building their code.
> So I would use pre-processor macros here; it is more predictable.
> Again, that way it is consistent with other places.
> 
> Actually, I wonder whether you really need this sort of split between the aligned and non-aligned cases.
> Wouldn't something like this work for you:
> 
> #ifndef RTE_ARCH_X86
>         const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3);
>         const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT;
> #else /*X86*/
>         const uint32_t *k = key;
>         const uint32_t s = 0;
> #endif
> 
>   while (len > 12) {
>                 a += k[0] >> s | (uint64_t)k[1] << (32 - s);
>                 b += k[1] >> s | (uint64_t)k[2] << (32 - s);
>                 c += k[2] >> s | (uint64_t)k[3] << (32 - s);
>                 k += 3;
>                 length -= 12;
> }
> 
> switch (length) {
> case 12:
>     a += k[0] >> s | (uint64_t)k[1] << (32 - s);
>     b += k[1] >> s | (uint64_t)k[2] << (32 - s);
>     c += k[2] >> s | (uint64_t)k[3] << (32 - s);
>     break;
> case 11:
>     a += k[0] >> s | (uint64_t)k[1] << (32 - s);
>     b += k[1] >> s | (uint64_t)k[2] << (32 - s);
>     c += (k[2] >> s | (uint64_t)k[3] << (32 - s)) & LOWER24b_MASK;
>     break;
> ...
> case 1:
>    a += (k[0] >> s | (uint64_t)k[1] << (32 - s)) & LOWER8b_MASK;
>    break;
> ...
> 
> In that way, even for non-aligned you don't need to do 4B reads.
> For x86, the compiler would do its optimisation work and strip off '>> s | (uint64_t)k[..] << (32 - s);'.
> 

Actually, as Sergio pointed out, that approach might penalise the non-x86 4B-aligned case.
So a special path for s == 0 is probably still needed, i.e.:
if (s==0) {...; a += k[0]; ...} else {...; a += k[0] >> s | (uint64_t)k[1] << (32 - s);...}
Konstantin

> >
> > Re x86_x32, you are right, I probably need to include it. However, I just realized that it is not used anywhere else.
> > I wonder if we should include it somewhere else as well, e.g. in rte_hash_crc.h?
> 
> Yep, that's true, we are not doing it for hash_crc either...
> It would probably be good to have some sort of 'RTE_ARCH_X86' macro that would be defined for all x86 targets, and to use it wherever applicable.
> But I suppose that's a subject for another patch.
> 
> Konstantin
>
  

Patch

diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
index a4bf5a1..0e96b7c 100644
--- a/lib/librte_hash/rte_jhash.h
+++ b/lib/librte_hash/rte_jhash.h
@@ -1,7 +1,7 @@ 
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
  *   All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
@@ -45,38 +45,68 @@  extern "C" {
 #endif
 
 #include <stdint.h>
+#include <string.h>
+#include <rte_byteorder.h>
 
 /* jhash.h: Jenkins hash support.
  *
- * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
+ * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
  *
  * http://burtleburtle.net/bob/hash/
  *
  * These are the credits from Bob's sources:
  *
- * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
- * hash(), hash2(), hash3, and mix() are externally useful functions.
- * Routines to test the hash are included if SELF_TEST is defined.
- * You can use this free for any purpose.  It has no warranty.
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * These are functions for producing 32-bit hashes for hash table lookup.
+ * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+ * are externally useful functions.  Routines to test the hash are included
+ * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
+ * the public domain.  It has no warranty.
  *
  * $FreeBSD$
  */
 
+#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
+
 /** @internal Internal function. NOTE: Arguments are modified. */
 #define __rte_jhash_mix(a, b, c) do { \
-	a -= b; a -= c; a ^= (c>>13); \
-	b -= c; b -= a; b ^= (a<<8); \
-	c -= a; c -= b; c ^= (b>>13); \
-	a -= b; a -= c; a ^= (c>>12); \
-	b -= c; b -= a; b ^= (a<<16); \
-	c -= a; c -= b; c ^= (b>>5); \
-	a -= b; a -= c; a ^= (c>>3); \
-	b -= c; b -= a; b ^= (a<<10); \
-	c -= a; c -= b; c ^= (b>>15); \
+	a -= c; a ^= rot(c, 4); c += b; \
+	b -= a; b ^= rot(a, 6); a += c; \
+	c -= b; c ^= rot(b, 8); b += a; \
+	a -= c; a ^= rot(c, 16); c += b; \
+	b -= a; b ^= rot(a, 19); a += c; \
+	c -= b; c ^= rot(b, 4); b += a; \
+} while (0)
+
+#define __rte_jhash_final(a, b, c) do { \
+	c ^= b; c -= rot(b, 14); \
+	a ^= c; a -= rot(c, 11); \
+	b ^= a; b -= rot(a, 25); \
+	c ^= b; c -= rot(b, 16); \
+	a ^= c; a -= rot(c, 4);  \
+	b ^= a; b -= rot(a, 14); \
+	c ^= b; c -= rot(b, 24); \
 } while (0)
 
 /** The golden ratio: an arbitrary value. */
-#define RTE_JHASH_GOLDEN_RATIO      0x9e3779b9
+#define RTE_JHASH_GOLDEN_RATIO      0xdeadbeef
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+#define RTE_JHASH_BYTE0_SHIFT 0
+#define RTE_JHASH_BYTE1_SHIFT 8
+#define RTE_JHASH_BYTE2_SHIFT 16
+#define RTE_JHASH_BYTE3_SHIFT 24
+#else
+#define RTE_JHASH_BYTE0_SHIFT 24
+#define RTE_JHASH_BYTE1_SHIFT 16
+#define RTE_JHASH_BYTE2_SHIFT 8
+#define RTE_JHASH_BYTE3_SHIFT 0
+#endif
+
+#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
+#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
+#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
 
 /**
  * The most generic version, hashes an arbitrary sequence
@@ -95,42 +125,119 @@  extern "C" {
 static inline uint32_t
 rte_jhash(const void *key, uint32_t length, uint32_t initval)
 {
-	uint32_t a, b, c, len;
-	const uint8_t *k = (const uint8_t *)key;
-	const uint32_t *k32 = (const uint32_t *)key;
+	uint32_t a, b, c;
+	union {
+		const void *ptr;
+		size_t i;
+	} u;
 
-	len = length;
-	a = b = RTE_JHASH_GOLDEN_RATIO;
-	c = initval;
+	/* Set up the internal state */
+	a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
 
-	while (len >= 12) {
-		a += k32[0];
-		b += k32[1];
-		c += k32[2];
+	u.ptr = key;
 
-		__rte_jhash_mix(a,b,c);
+	/* Check key alignment. For x86 architecture, first case is always optimal */
+	if (!strcmp(RTE_ARCH,"x86_64") || !strcmp(RTE_ARCH,"i686") || (u.i & 0x3) == 0) {
+		const uint32_t *k = (const uint32_t *)key;
 
-		k += (3 * sizeof(uint32_t)), k32 += 3;
-		len -= (3 * sizeof(uint32_t));
-	}
+		while (length > 12) {
+			a += k[0];
+			b += k[1];
+			c += k[2];
 
-	c += length;
-	switch (len) {
-		case 11: c += ((uint32_t)k[10] << 24);
-		case 10: c += ((uint32_t)k[9] << 16);
-		case 9 : c += ((uint32_t)k[8] << 8);
-		case 8 : b += ((uint32_t)k[7] << 24);
-		case 7 : b += ((uint32_t)k[6] << 16);
-		case 6 : b += ((uint32_t)k[5] << 8);
-		case 5 : b += k[4];
-		case 4 : a += ((uint32_t)k[3] << 24);
-		case 3 : a += ((uint32_t)k[2] << 16);
-		case 2 : a += ((uint32_t)k[1] << 8);
-		case 1 : a += k[0];
-		default: break;
-	};
+			__rte_jhash_mix(a, b, c);
+
+			k += 3;
+			length -= 12;
+		}
+
+		switch (length) {
+		case 12:
+			c += k[2]; b += k[1]; a += k[0]; break;
+		case 11:
+			c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break;
+		case 10:
+			c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break;
+		case 9:
+			c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break;
+		case 8:
+			b += k[1]; a += k[0]; break;
+		case 7:
+			b += k[1] & LOWER24b_MASK; a += k[0]; break;
+		case 6:
+			b += k[1] & LOWER16b_MASK; a += k[0]; break;
+		case 5:
+			b += k[1] & LOWER8b_MASK; a += k[0]; break;
+		case 4:
+			a += k[0]; break;
+		case 3:
+			a += k[0] & LOWER24b_MASK; break;
+		case 2:
+			a += k[0] & LOWER16b_MASK; break;
+		case 1:
+			a += k[0] & LOWER8b_MASK; break;
+		/* zero length strings require no mixing */
+		case 0:
+			return c;
+		};
+	} else {
+		const uint8_t *k = (const uint8_t *)key;
+
+		/* all but the last block: affect some 32 bits of (a, b, c) */
+		while (length > 12) {
+			a += ((uint32_t)k[0]) << RTE_JHASH_BYTE0_SHIFT;
+			a += ((uint32_t)k[1]) << RTE_JHASH_BYTE1_SHIFT;
+			a += ((uint32_t)k[2]) << RTE_JHASH_BYTE2_SHIFT;
+			a += ((uint32_t)k[3]) << RTE_JHASH_BYTE3_SHIFT;
+			b += ((uint32_t)k[4]) << RTE_JHASH_BYTE0_SHIFT;
+			b += ((uint32_t)k[5]) << RTE_JHASH_BYTE1_SHIFT;
+			b += ((uint32_t)k[6]) << RTE_JHASH_BYTE2_SHIFT;
+			b += ((uint32_t)k[7]) << RTE_JHASH_BYTE3_SHIFT;
+			c += ((uint32_t)k[8]) << RTE_JHASH_BYTE0_SHIFT;
+			c += ((uint32_t)k[9]) << RTE_JHASH_BYTE1_SHIFT;
+			c += ((uint32_t)k[10]) << RTE_JHASH_BYTE2_SHIFT;
+			c += ((uint32_t)k[11]) << RTE_JHASH_BYTE3_SHIFT;
+
+			__rte_jhash_mix(a, b, c);
+
+			k += 12;
+			length -= 12;
+		}
+
+		/* last block: affect all 32 bits of (c) */
+		/* all the case statements fall through */
+		switch (length) {
+		case 12:
+			c += ((uint32_t)k[11]) << RTE_JHASH_BYTE3_SHIFT;
+		case 11:
+			c += ((uint32_t)k[10]) << RTE_JHASH_BYTE2_SHIFT;
+		case 10:
+			c += ((uint32_t)k[9]) << RTE_JHASH_BYTE1_SHIFT;
+		case 9:
+			c += ((uint32_t)k[8]) << RTE_JHASH_BYTE0_SHIFT;
+		case 8:
+			b += ((uint32_t)k[7]) << RTE_JHASH_BYTE3_SHIFT;
+		case 7:
+			b += ((uint32_t)k[6]) << RTE_JHASH_BYTE2_SHIFT;
+		case 6:
+			b += ((uint32_t)k[5]) << RTE_JHASH_BYTE1_SHIFT;
+		case 5:
+			b += ((uint32_t)k[4]) << RTE_JHASH_BYTE0_SHIFT;
+		case 4:
+			a += ((uint32_t)k[3]) << RTE_JHASH_BYTE3_SHIFT;
+		case 3:
+			a += ((uint32_t)k[2]) << RTE_JHASH_BYTE2_SHIFT;
+		case 2:
+			a += ((uint32_t)k[1]) << RTE_JHASH_BYTE1_SHIFT;
+		case 1:
+			a += ((uint32_t)k[0]) << RTE_JHASH_BYTE0_SHIFT;
+		break;
+		case 0:
+			return c;
+		}
+	}
 
-	__rte_jhash_mix(a,b,c);
+	__rte_jhash_final(a, b, c);
 
 	return c;
 }
@@ -151,33 +258,51 @@  rte_jhash(const void *key, uint32_t length, uint32_t initval)
 static inline uint32_t
 rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
 {
-	uint32_t a, b, c, len;
+	uint32_t a, b, c;
 
-	a = b = RTE_JHASH_GOLDEN_RATIO;
-	c = initval;
-	len = length;
+	/* Set up the internal state */
+	a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length) << 2) + initval;
 
-	while (len >= 3) {
+	/* Handle most of the key */
+	while (length > 3) {
 		a += k[0];
 		b += k[1];
 		c += k[2];
+
 		__rte_jhash_mix(a, b, c);
-		k += 3; len -= 3;
-	}
 
-	c += length * 4;
+		k += 3;
+		length -= 3;
+	}
 
-	switch (len) {
-		case 2 : b += k[1];
-		case 1 : a += k[0];
-		default: break;
+	/* Handle the last 3 uint32_t's */
+	switch (length) {
+	case 3:
+		c += k[2];
+	case 2:
+		b += k[1];
+	case 1:
+		a += k[0];
+		__rte_jhash_final(a, b, c);
+	/* case 0: nothing left to add */
+	case 0:
+		break;
 	};
 
-	__rte_jhash_mix(a,b,c);
-
 	return c;
 }
 
+static inline uint32_t
+__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
+{
+	a += RTE_JHASH_GOLDEN_RATIO + initval;
+	b += RTE_JHASH_GOLDEN_RATIO + initval;
+	c += RTE_JHASH_GOLDEN_RATIO + initval;
+
+	__rte_jhash_final(a, b, c);
+
+	return c;
+}
 
 /**
  * A special ultra-optimized versions that knows it is hashing exactly
@@ -197,17 +322,7 @@  rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
 static inline uint32_t
 rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
 {
-	a += RTE_JHASH_GOLDEN_RATIO;
-	b += RTE_JHASH_GOLDEN_RATIO;
-	c += initval;
-
-	__rte_jhash_mix(a, b, c);
-
-	/*
-	 * NOTE: In particular the "c += length; __rte_jhash_mix(a,b,c);"
-	 *       normally done at the end is not done here.
-	 */
-	return c;
+	return __rte_jhash_3words(a + 12, b + 12, c + 12, initval);
 }
 
 /**
@@ -226,7 +341,7 @@  rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
 static inline uint32_t
 rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
 {
-	return rte_jhash_3words(a, b, 0, initval);
+	return __rte_jhash_3words(a + 8, b + 8, 8, initval);
 }
 
 /**
@@ -243,7 +358,7 @@  rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
 static inline uint32_t
 rte_jhash_1word(uint32_t a, uint32_t initval)
 {
-	return rte_jhash_3words(a, 0, 0, initval);
+	return __rte_jhash_3words(a + 4, 4, 4, initval);
 }
 
 #ifdef __cplusplus