Benchmark: 10,000,000 runs, times in ns, on x86/32 under KVM.

Lookup2: jhash = 528 bytes, jhash_3words = 128 bytes
  jhash 1024 bytes = 1694ns
  jhash_3words = 18ns
  jhash2 256 words = 1646ns

Lookup3: jhash = 736 bytes, jhash_3words = 160 bytes
  jhash 1024 bytes = 1010ns
  jhash_3words = 23ns
  jhash2 256 words = 1039ns

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/jhash.h |  220 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 156 insertions(+), 64 deletions(-)

diff -r 4ac7c2f5d31a include/linux/jhash.h
--- a/include/linux/jhash.h	Thu Jun 19 13:03:40 2008 +1000
+++ b/include/linux/jhash.h	Thu Jun 19 13:16:24 2008 +1000
@@ -1,41 +1,50 @@
 #ifndef _LINUX_JHASH_H
 #define _LINUX_JHASH_H
+#include <asm/byteorder.h>
 
 /* jhash.h: Jenkins hash support.
  *
- * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
+ * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
  *
  * http://burtleburtle.net/bob/hash/
  *
  * These are the credits from Bob's sources:
  *
- * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
  * hash(), hash2(), hash3, and mix() are externally useful functions.
  * Routines to test the hash are included if SELF_TEST is defined.
  * You can use this free for any purpose.  It has no warranty.
  *
  * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2008 Rusty Russell IBM Corporation <rusty@rustcorp.com.au>
  *
  * I've modified Bob's hash to be useful in the Linux kernel, and
  * any bugs present are surely my fault.  -DaveM
  */
 
+#define __jrot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
 /* NOTE: Arguments are modified. */
-#define __jhash_mix(a, b, c) \
-{ \
-  a -= b; a -= c; a ^= (c>>13); \
-  b -= c; b -= a; b ^= (a<<8); \
-  c -= a; c -= b; c ^= (b>>13); \
-  a -= b; a -= c; a ^= (c>>12);  \
-  b -= c; b -= a; b ^= (a<<16); \
-  c -= a; c -= b; c ^= (b>>5); \
-  a -= b; a -= c; a ^= (c>>3);  \
-  b -= c; b -= a; b ^= (a<<10); \
-  c -= a; c -= b; c ^= (b>>15); \
+#define __jhash_mix(a,b,c)			\
+{						\
+	a -= c;  a ^= __jrot(c, 4);  c += b;	\
+	b -= a;  b ^= __jrot(a, 6);  a += c;	\
+	c -= b;  c ^= __jrot(b, 8);  b += a;	\
+	a -= c;  a ^= __jrot(c, 16); c += b;	\
+	b -= a;  b ^= __jrot(a, 19); a += c;	\
+	c -= b;  c ^= __jrot(b, 4);  b += a;	\
 }
 
-/* The golden ration: an arbitrary value */
-#define JHASH_GOLDEN_RATIO	0x9e3779b9
+#define __jhash_final(a,b,c)			\
+{						\
+	c ^= b; c -= __jrot(b, 14);		\
+	a ^= c; a -= __jrot(c, 11);		\
+	b ^= a; b -= __jrot(a, 25);		\
+	c ^= b; c -= __jrot(b, 16);		\
+	a ^= c; a -= __jrot(c, 4);		\
+	b ^= a; b -= __jrot(a, 14);		\
+	c ^= b; c -= __jrot(b, 24);		\
+}
 
 /* The most generic version, hashes an arbitrary sequence
  * of bytes.  No alignment or length assumptions are made about
@@ -43,41 +52,124 @@
  */
 static inline u32 jhash(const void *key, u32 length, u32 initval)
 {
-	u32 a, b, c, len;
-	const u8 *k = key;
+	u32 a, b, c;
+	union { const void *ptr; unsigned long i; } u;
 
-	len = length;
-	a = b = JHASH_GOLDEN_RATIO;
-	c = initval;
+	/* Set up the internal state */
+	a = b = c = 0xdeadbeef + length + initval;
 
-	while (len >= 12) {
-		a += (k[0] +((u32)k[1]<<8) +((u32)k[2]<<16) +((u32)k[3]<<24));
-		b += (k[4] +((u32)k[5]<<8) +((u32)k[6]<<16) +((u32)k[7]<<24));
-		c += (k[8] +((u32)k[9]<<8) +((u32)k[10]<<16)+((u32)k[11]<<24));
+	u.ptr = key;
+	if ((u.i & 0x3) == 0) {
+		const u32 *k = (const u32 *)key;	/* read 32-bit chunks */
 
-		__jhash_mix(a,b,c);
+		/* all but last block: aligned reads and affect 32
+		 * bits of (a,b,c) */
+		while (length > 12) {
+			a += k[0];
+			b += k[1];
+			c += k[2];
+			__jhash_mix(a, b, c);
+			length -= 12;
+			k += 3;
+		}
 
-		k += 12;
-		len -= 12;
+		/* handle the last (probably partial) block */
+		/*
+		 * "k[2]&0xffffff" actually reads beyond the end of the string,
+		 * but then masks off the part it's not allowed to read.
+		 * Because the string is aligned, the masked-off tail is in the
+		 * same word as the rest of the string.  Every machine with
+		 * memory protection I've seen does it on word boundaries, so
+		 * is OK with this.  But VALGRIND will still catch it and
+		 * complain.  The masking trick does make the hash noticeably
+		 * faster for short strings (like English words).
+		 */
+		switch (length) {
+		case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+		case 11: c+=k[2]&le32_to_cpu(0xffffff); b+=k[1]; a+=k[0]; break;
+		case 10: c+=k[2]&le32_to_cpu(0xffff); b+=k[1]; a+=k[0]; break;
+		case 9 : c+=k[2]&le32_to_cpu(0xff); b+=k[1]; a+=k[0]; break;
+		case 8 : b+=k[1]; a+=k[0]; break;
+		case 7 : b+=k[1]&le32_to_cpu(0xffffff); a+=k[0]; break;
+		case 6 : b+=k[1]&le32_to_cpu(0xffff); a+=k[0]; break;
+		case 5 : b+=k[1]&le32_to_cpu(0xff); a+=k[0]; break;
+		case 4 : a+=k[0]; break;
+		case 3 : a+=k[0]&le32_to_cpu(0xffffff); break;
+		case 2 : a+=k[0]&le32_to_cpu(0xffff); break;
+		case 1 : a+=k[0]&le32_to_cpu(0xff); break;
+		case 0 : return c; /* zero length strings require no mixing */
+		}
+	} else {	/* need to read the key one byte at a time */
+		const u8 *k = (const u8 *)key;
+
+		/* all but the last block: affect some 32 bits of (a,b,c) */
+		while (length > 12) {
+#ifdef __LITTLE_ENDIAN
+			a += k[0];
+			a += ((u32)k[1])<<8;
+			a += ((u32)k[2])<<16;
+			a += ((u32)k[3])<<24;
+			b += k[4];
+			b += ((u32)k[5])<<8;
+			b += ((u32)k[6])<<16;
+			b += ((u32)k[7])<<24;
+			c += k[8];
+			c += ((u32)k[9])<<8;
+			c += ((u32)k[10])<<16;
+			c += ((u32)k[11])<<24;
+#else
+			a += ((u32)k[0])<<24;
+			a += ((u32)k[1])<<16;
+			a += ((u32)k[2])<<8;
+			a += ((u32)k[3]);
+			b += ((u32)k[4])<<24;
+			b += ((u32)k[5])<<16;
+			b += ((u32)k[6])<<8;
+			b += ((u32)k[7]);
+			c += ((u32)k[8])<<24;
+			c += ((u32)k[9])<<16;
+			c += ((u32)k[10])<<8;
+			c += ((u32)k[11]);
+#endif
+			__jhash_mix(a, b, c);
+			length -= 12;
+			k += 12;
+		}
+
+		/* last block: affect all 32 bits of (c) */
+		switch (length) { /* all the case statements fall through */
+#ifdef __LITTLE_ENDIAN
+		case 12: c+=((u32)k[11])<<24;
+		case 11: c+=((u32)k[10])<<16;
+		case 10: c+=((u32)k[9])<<8;
+		case 9 : c+=k[8];
+		case 8 : b+=((u32)k[7])<<24;
+		case 7 : b+=((u32)k[6])<<16;
+		case 6 : b+=((u32)k[5])<<8;
+		case 5 : b+=k[4];
+		case 4 : a+=((u32)k[3])<<24;
+		case 3 : a+=((u32)k[2])<<16;
+		case 2 : a+=((u32)k[1])<<8;
+		case 1 : a+=k[0];
+#else
+		case 12: c+=k[11];
+		case 11: c+=((u32)k[10])<<8;
+		case 10: c+=((u32)k[9])<<16;
+		case 9 : c+=((u32)k[8])<<24;
+		case 8 : b+=k[7];
+		case 7 : b+=((u32)k[6])<<8;
+		case 6 : b+=((u32)k[5])<<16;
+		case 5 : b+=((u32)k[4])<<24;
+		case 4 : a+=k[3];
+		case 3 : a+=((u32)k[2])<<8;
+		case 2 : a+=((u32)k[1])<<16;
+		case 1 : a+=((u32)k[0])<<24;
+#endif
+			break;
+		case 0 : return c;
+		}
 	}
-
-	c += length;
-	switch (len) {
-	case 11: c += ((u32)k[10]<<24);
-	case 10: c += ((u32)k[9]<<16);
-	case 9 : c += ((u32)k[8]<<8);
-	case 8 : b += ((u32)k[7]<<24);
-	case 7 : b += ((u32)k[6]<<16);
-	case 6 : b += ((u32)k[5]<<8);
-	case 5 : b += k[4];
-	case 4 : a += ((u32)k[3]<<24);
-	case 3 : a += ((u32)k[2]<<16);
-	case 2 : a += ((u32)k[1]<<8);
-	case 1 : a += k[0];
-	};
-
-	__jhash_mix(a,b,c);
-
+	__jhash_final(a, b, c);
 	return c;
 }
 
@@ -86,47 +178,47 @@ static inline u32 jhash(const void *key,
  */
 static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
 {
-	u32 a, b, c, len;
+	u32 a, b, c;
 
-	a = b = JHASH_GOLDEN_RATIO;
-	c = initval;
-	len = length;
+	/* Set up the internal state */
+	a = b = c = 0xdeadbeef + (length<<2) + initval;
 
-	while (len >= 3) {
+	/* handle most of the key */
+	while (length > 3) {
 		a += k[0];
 		b += k[1];
 		c += k[2];
 		__jhash_mix(a, b, c);
-		k += 3; len -= 3;
+		length -= 3;
+		k += 3;
 	}
 
-	c += length * 4;
-
-	switch (len) {
+	/* handle the last 3 u32 words */
+	switch(length) { /* all the case statements fall through */
+	case 3 : c += k[2];
 	case 2 : b += k[1];
 	case 1 : a += k[0];
-	};
+		__jhash_final(a,b,c);
+	case 0:     /* case 0: nothing left to add */
+		break;
+	}
 
-	__jhash_mix(a,b,c);
-
+	/* report the result */
 	return c;
 }
-
 
 /* A special ultra-optimized versions that knows they are hashing exactly
  * 3, 2 or 1 word(s).
  *
- * NOTE: In partilar the "c += length; __jhash_mix(a,b,c);" normally
- *       done at the end is not done here.
+ * NOTE: In particular 0xdeadbeef and length << 2 are not added to each value.
  */
 static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
 {
-	a += JHASH_GOLDEN_RATIO;
-	b += JHASH_GOLDEN_RATIO;
+	a += initval;
+	b += initval;
 	c += initval;
-
 	__jhash_mix(a, b, c);
-
+	__jhash_final(a, b, c);
 	return c;
 }
 
