Commit 2d6759d4 authored by Rusty Russell's avatar Rusty Russell

First cut of hashing routines.

parent b27117c6
/**
* hash - routines for hashing bytes
*
* When creating a hash table it's important to have a hash function
* which mixes well and is fast. This package supplies such functions.
*
* The hash functions come in two flavors: the normal ones and the
* stable ones. The normal ones can vary from machine-to-machine and
* may change if we find better or faster hash algorithms in future.
* The stable ones will always give the same results on any computer,
* and on any version of this package.
*/
/*
 * Module-info entry point.
 *
 * Expects exactly one command-line argument naming a query.  The only
 * query recognised is "depends", which prints nothing and exits with
 * status 0.  A wrong argument count or any other query exits with 1.
 */
int main(int argc, char *argv[])
{
	/* Single guard: right arity and the one query we know about. */
	if (argc == 2 && strcmp(argv[1], "depends") == 0)
		return 0;

	return 1;
}
This diff is collapsed.
#ifndef CCAN_HASH_H
#define CCAN_HASH_H
#include <stdint.h>
#include <stdlib.h>
#include "config.h"
/* Stolen mostly from: lookup3.c, by Bob Jenkins, May 2006, Public Domain.
*
* http://burtleburtle.net/bob/c/lookup3.c
*/
/**
* hash - fast hash of an array for internal use
* @p: the array or pointer to first element
* @num: the number of elements to hash (elements, not bytes: the macro
*	multiplies this by sizeof(*p) itself)
* @base: the base number to roll into the hash (usually 0)
*
* The memory region pointed to by p is combined with the base to form
* a 32-bit hash.
*
* This hash will have different results on different machines, so is
* only useful for internal hashes (ie. not hashes sent across the
* network or saved to disk).
*
* It may also change with future versions: it could even detect at runtime
* what the fastest hash to use is.
*
* See also: hash_stable.
*
* Example:
* #include "hash/hash.h"
* #include <err.h>
* #include <stdio.h>
* #include <string.h>
*
* // Simple demonstration: identical strings will have the same hash, but
* // two different strings will probably not.
* int main(int argc, char *argv[])
* {
* uint32_t hash1, hash2;
*
* if (argc != 3)
* err(1, "Usage: %s <string1> <string2>", argv[0]);
*
* hash1 = hash(argv[1], strlen(argv[1]), 0);
* hash2 = hash(argv[2], strlen(argv[2]), 0);
* printf("Hash is %s\n", hash1 == hash2 ? "same" : "different");
* return 0;
* }
*/
#define hash(p, num, base) hash_any((p), (num)*sizeof(*(p)), (base))
/**
* hash_stable - hash of an array for external use
* @p: the array or pointer to first element
* @num: the number of elements to hash (elements, not bytes: the macro
*	multiplies this by sizeof(*p) itself)
* @base: the base number to roll into the hash (usually 0)
*
* The memory region pointed to by p is combined with the base to form
* a 32-bit hash.
*
* This hash will have the same results on different machines, so can
* be used for external hashes (ie. hashes sent across the network
* or saved to disk). The results will not change in future versions
* of this package.
*
* Example:
* #include "hash/hash.h"
* #include <err.h>
* #include <stdio.h>
* #include <string.h>
*
* int main(int argc, char *argv[])
* {
* if (argc != 2)
* err(1, "Usage: %s <string-to-hash>", argv[0]);
*
* printf("Hash stable result is %u\n",
* hash_stable(argv[1], strlen(argv[1]), 0));
* return 0;
* }
*/
#define hash_stable(p, num, base) \
hash_any_stable((p), (num)*sizeof(*(p)), (base))
/**
* hash_u32 - fast hash an array of 32-bit values for internal use
* @key: the array of uint32_t
* @num: the number of elements to hash
* @base: the base number to roll into the hash (usually 0)
*
* The array of uint32_t pointed to by @key is combined with the base
* to form a 32-bit hash. This is 2-3 times faster than hash() on small
* arrays, but the advantage vanishes over large hashes.
*
* This hash will have different results on different machines, so is
* only useful for internal hashes (ie. not hashes sent across the
* network or saved to disk).
*/
uint32_t hash_u32(const uint32_t *key, size_t num, uint32_t base);
/* Our underlying operations.
 *
 * The hash() and hash_stable() macros expand to calls of hash_any() and
 * hash_any_stable() respectively, passing the element count multiplied by
 * the element size as @length, so @length here is a size in bytes.
 * @base is the caller's base value, forwarded unchanged.
 */
uint32_t hash_any(const void *key, size_t length, uint32_t base);
uint32_t hash_any_stable(const void *key, size_t length, uint32_t base);
/**
 * hash_pointer - hash a pointer for internal use
 * @p: the pointer value to hash
 * @base: the base number to roll into the hash (usually 0)
 *
 * The pointer p (not what p points to!) is combined with the base to form
 * a 32-bit hash.
 *
 * This hash will have different results on different machines, so is
 * only useful for internal hashes (ie. not hashes sent across the
 * network or saved to disk).
 *
 * Example:
 *	#include "hash/hash.h"
 *
 *	// Code to keep track of memory regions.
 *	struct region {
 *		struct region *chain;
 *		void *start;
 *		unsigned int size;
 *	};
 *	// We keep a simple hash table.
 *	static struct region *region_hash[128];
 *
 *	static void add_region(struct region *r)
 *	{
 *		unsigned int h = hash_pointer(r->start, 0) % 128;
 *
 *		r->chain = region_hash[h];
 *		region_hash[h] = r;
 *	}
 *
 *	static struct region *find_region(const void *start)
 *	{
 *		struct region *r;
 *
 *		for (r = region_hash[hash_pointer(start, 0) % 128];
 *		     r;
 *		     r = r->chain)
 *			if (r->start == start)
 *				return r;
 *		return NULL;
 *	}
 */
static inline uint32_t hash_pointer(const void *p, uint32_t base)
{
	/* A union is the sanctioned way to view the pointer's bytes as
	 * 32-bit words without breaking aliasing rules. */
	union {
		uint32_t u32[sizeof(p) / sizeof(uint32_t)];
		const void *p;
	} words;

	/* Pointer is not a whole number of 32-bit words: fall back to
	 * hashing the pointer object itself as one element. */
	if (sizeof(p) % sizeof(uint32_t) != 0)
		return hash(&p, 1, base);

	/* Common case: feed the pointer to the word-at-a-time hash. */
	words.p = p;
	return hash_u32(words.u32, sizeof(p) / sizeof(uint32_t), base);
}
#endif /* CCAN_HASH_H */
#include "hash/hash.h"
#include "tap/tap.h"
#include "hash/hash.c"
#include <stdbool.h>
#include <string.h>
#define ARRAY_WORDS 5
/*
 * Test driver for the hash module.  Runs 53 TAP checks:
 *
 *   - 33 checks that hash_stable() over the words 0..ARRAY_WORDS-1 yields
 *     fixed, known values for base 0 and every power-of-two base;
 *   - 4 checks that hash() of the same bytes gives the same value at each
 *     byte offset within a word (alignment independence);
 *   - 8 checks (lowest/highest bucket per output byte) that hash() of
 *     random data spreads evenly across each byte of the result;
 *   - 8 equivalent checks for hash_pointer() over consecutive addresses.
 *
 * Returns the TAP exit status, or 1 if the scratch allocation fails.
 */
int main(int argc, char *argv[])
{
	unsigned int i, j, k;
	uint32_t array[ARRAY_WORDS], val;
	/* Room for a copy of array at any byte offset within one word. */
	char array2[sizeof(array) + sizeof(uint32_t)];
	uint32_t results[256];

	/* Initialize array. */
	for (i = 0; i < ARRAY_WORDS; i++)
		array[i] = i;

	plan_tests(53);

	/* hash_stable is guaranteed. */
	ok1(hash_stable(array, ARRAY_WORDS, 0) == 0x13305f8c);
	ok1(hash_stable(array, ARRAY_WORDS, 1) == 0x171abf74);
	ok1(hash_stable(array, ARRAY_WORDS, 2) == 0x7646fcc7);
	ok1(hash_stable(array, ARRAY_WORDS, 4) == 0xa758ed5);
	ok1(hash_stable(array, ARRAY_WORDS, 8) == 0x2dedc2e4);
	ok1(hash_stable(array, ARRAY_WORDS, 16) == 0x28e2076b);
	ok1(hash_stable(array, ARRAY_WORDS, 32) == 0xb73091c5);
	ok1(hash_stable(array, ARRAY_WORDS, 64) == 0x87daf5db);
	ok1(hash_stable(array, ARRAY_WORDS, 128) == 0xa16dfe20);
	ok1(hash_stable(array, ARRAY_WORDS, 256) == 0x300c63c3);
	ok1(hash_stable(array, ARRAY_WORDS, 512) == 0x255c91fc);
	ok1(hash_stable(array, ARRAY_WORDS, 1024) == 0x6357b26);
	ok1(hash_stable(array, ARRAY_WORDS, 2048) == 0x4bc5f339);
	ok1(hash_stable(array, ARRAY_WORDS, 4096) == 0x1301617c);
	ok1(hash_stable(array, ARRAY_WORDS, 8192) == 0x506792c9);
	ok1(hash_stable(array, ARRAY_WORDS, 16384) == 0xcd596705);
	ok1(hash_stable(array, ARRAY_WORDS, 32768) == 0xa8713cac);
	ok1(hash_stable(array, ARRAY_WORDS, 65536) == 0x94d9794);
	ok1(hash_stable(array, ARRAY_WORDS, 131072) == 0xac753e8);
	ok1(hash_stable(array, ARRAY_WORDS, 262144) == 0xcd8bdd20);
	ok1(hash_stable(array, ARRAY_WORDS, 524288) == 0xd44faf80);
	ok1(hash_stable(array, ARRAY_WORDS, 1048576) == 0x2547ccbe);
	ok1(hash_stable(array, ARRAY_WORDS, 2097152) == 0xbab06dbc);
	ok1(hash_stable(array, ARRAY_WORDS, 4194304) == 0xaac0e882);
	ok1(hash_stable(array, ARRAY_WORDS, 8388608) == 0x443f48d0);
	ok1(hash_stable(array, ARRAY_WORDS, 16777216) == 0xdff49fcc);
	ok1(hash_stable(array, ARRAY_WORDS, 33554432) == 0x9ce0fd65);
	ok1(hash_stable(array, ARRAY_WORDS, 67108864) == 0x9ddb1def);
	ok1(hash_stable(array, ARRAY_WORDS, 134217728) == 0x86096f25);
	ok1(hash_stable(array, ARRAY_WORDS, 268435456) == 0xe713b7b5);
	ok1(hash_stable(array, ARRAY_WORDS, 536870912) == 0x5baeffc5);
	ok1(hash_stable(array, ARRAY_WORDS, 1073741824) == 0xde874f52);
	ok1(hash_stable(array, ARRAY_WORDS, 2147483648U) == 0xeca13b4e);

	/* Hash should be the same, indep of memory alignment.
	 *
	 * hash() takes an element count and scales it by the element size,
	 * so the uint32_t array is ARRAY_WORDS elements while the char view
	 * of the same bytes is sizeof(array) elements. */
	val = hash(array, ARRAY_WORDS, 0);
	for (i = 0; i < sizeof(uint32_t); i++) {
		memcpy(array2 + i, array, sizeof(array));
		ok(hash(array2 + i, sizeof(array), 0) == val,
		   "hash matched at offset %i", i);
	}

	/* Hash of random values should have random distribution:
	 * check one byte at a time. */
	for (i = 0; i < sizeof(uint32_t); i++) {
		unsigned int lowest = -1U, highest = 0;

		memset(results, 0, sizeof(results));

		for (j = 0; j < 256000; j++) {
			for (k = 0; k < ARRAY_WORDS; k++)
				array[k] = random();
			/* Bucket on byte i of the hash of the whole array. */
			results[(hash(array, ARRAY_WORDS, 0) >> i*8)&0xFF]++;
		}

		for (j = 0; j < 256; j++) {
			if (results[j] < lowest)
				lowest = results[j];
			if (results[j] > highest)
				highest = results[j];
		}
		/* 256000 samples over 256 buckets: 1000 each on average.
		 * Expect within 20% */
		ok(lowest > 800, "Byte %i lowest %i", i, lowest);
		ok(highest < 1200, "Byte %i highest %i", i, highest);
		diag("Byte %i, range %u-%u", i, lowest, highest);
	}

	/* Hash of pointer values should also have random distribution. */
	for (i = 0; i < sizeof(uint32_t); i++) {
		unsigned int lowest = -1U, highest = 0;
		char *p = malloc(256000);

		/* Only the addresses are hashed, but arithmetic on a null
		 * pointer is undefined, so bail out if allocation failed. */
		if (!p) {
			diag("malloc failed");
			return 1;
		}

		memset(results, 0, sizeof(results));

		for (j = 0; j < 256000; j++)
			results[(hash_pointer(p + j, 0) >> i*8)&0xFF]++;
		free(p);

		for (j = 0; j < 256; j++) {
			if (results[j] < lowest)
				lowest = results[j];
			if (results[j] > highest)
				highest = results[j];
		}
		/* Expect within 20% */
		ok(lowest > 800, "hash_pointer byte %i lowest %i", i, lowest);
		ok(highest < 1200, "hash_pointer byte %i highest %i",
		   i, highest);
		diag("hash_pointer byte %i, range %u-%u", i, lowest, highest);
	}

	return exit_status();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment