Commit ab83de95 authored by Rusty Russell's avatar Rusty Russell

strset: new module using critbit trees.

Not as fast as using htable, but simple and provides order and prefix
ops.
parent 0845e796
#include <string.h>
#include "config.h"
/**
* strset - an ordered set of strings
*
* This code implements an ordered set of string as a critbit tree. See:
*
* http://cr.yp.to/critbit.html
* http://github.com/agl/critbit (which this code is based on)
*
* Note that ccan/htable is faster and uses less memory, but doesn't provide
* ordered or prefix operations.
*
* Example:
* // Print all words in order.
* #include <ccan/strset/strset.h>
* #include <ccan/grab_file/grab_file.h>
* #include <err.h>
* #include <string.h>
*
* static bool dump(const char *member, void *unused)
* {
* printf("%s ", member);
* return false;
* }
*
* int main(void)
* {
* struct strset words;
* char *file, *word;
*
* strset_init(&words);
* file = grab_fd(NULL, 0, NULL);
* if (!file)
* err(1, "Reading stdin");
*
* for (word = strtok(file, " \t\r\n");
* word;
* word = strtok(NULL, " \t\r\n")) {
* strset_set(&words, word);
* }
* strset_iterate(&words, dump, NULL);
* printf("\n");
* return 0;
* }
* // Given "foo bar" outputs "bar foo "
* // Given "foo foo bar" outputs "bar foo "
*
* License: Public domain (but some dependencies are LGPL!)
* Author: Rusty Russell <rusty@rustcorp.com.au>
* Ccanlint:
* license_depends_compat FAIL
*/
int main(int argc, char *argv[])
{
/* Expect exactly one argument */
if (argc != 2)
return 1;
if (strcmp(argv[1], "depends") == 0) {
printf("ccan/ilog\n"
"ccan/likely\n"
"ccan/short_types\n"
"ccan/str\n"
"ccan/typesafe_cb\n");
return 0;
}
return 1;
}
/* This code is based on the public domain code at
* http://github.com/agl/critbit writtem by Adam Langley
* <agl@imperialviolet.org>.
*
* Here are the main implementation differences:
* (1) We don't strdup the string on insert; we use the pointer we're given.
* (2) We use a straight bit number rather than a mask; it's simpler.
* (3) We don't use the bottom bit of the pointer, but instead use a leading
* zero to distinguish nodes from strings.
* (4) The empty string (which would look like a node) is handled
* using a special "empty node".
* (5) Delete returns the string, so you can free it if you want to.
* (6) Unions instead of void *, bool instead of int.
*/
#include <ccan/strset/strset.h>
#include <ccan/short_types/short_types.h>
#include <ccan/likely/likely.h>
#include <ccan/str/str.h>
#include <ccan/ilog/ilog.h>
#include <assert.h>
#include <stdlib.h>
struct node {
/* To differentiate us from strings. */
char nul_byte;
/* The bit where these children differ. */
u8 bit_num;
/* The byte number where first bit differs (-1 == empty string node). */
size_t byte_num;
/* These point to strings or nodes. */
struct strset child[2];
};
/* Closest member to this in a non-empty set. */
static const char *closest(struct strset n, const char *member)
{
size_t len = strlen(member);
const u8 *bytes = (const u8 *)member;
/* Anything with first byte 0 is a node. */
while (!n.u.s[0]) {
u8 direction = 0;
/* Special node which represents the empty string. */
if (unlikely(n.u.n->byte_num == (size_t)-1)) {
n = n.u.n->child[0];
break;
}
if (n.u.n->byte_num < len) {
u8 c = bytes[n.u.n->byte_num];
direction = (c >> n.u.n->bit_num) & 1;
}
n = n.u.n->child[direction];
}
return n.u.s;
}
char *strset_test(const struct strset *set, const char *member)
{
const char *str;
/* Empty set? */
if (!set->u.n)
return NULL;
str = closest(*set, member);
if (streq(member, str))
return (char *)str;
return NULL;
}
static bool set_string(struct strset *set,
struct strset *n, const char *member)
{
/* Substitute magic empty node if this is the empty string */
if (unlikely(!member[0])) {
n->u.n = malloc(sizeof(*n->u.n));
if (unlikely(!n->u.n))
return false;
n->u.n->nul_byte = '\0';
n->u.n->byte_num = (size_t)-1;
/* Attach the string to child[0] */
n = &n->u.n->child[0];
}
n->u.s = member;
return true;
}
bool strset_set(struct strset *set, const char *member)
{
size_t len = strlen(member);
const u8 *bytes = (const u8 *)member;
struct strset *np;
const char *str;
struct node *newn;
size_t byte_num;
u8 bit_num, new_dir;
/* Empty set? */
if (!set->u.n) {
return set_string(set, set, member);
}
/* Find closest existing member. */
str = closest(*set, member);
/* Find where they differ. */
for (byte_num = 0; str[byte_num] == member[byte_num]; byte_num++) {
if (member[byte_num] == '\0') {
/* All identical! */
return false;
}
}
/* Find which bit differs (if we had ilog8, we'd use it) */
bit_num = ilog32_nz((u8)str[byte_num] ^ bytes[byte_num]) - 1;
assert(bit_num < CHAR_BIT);
/* Which direction do we go at this bit? */
new_dir = ((bytes[byte_num]) >> bit_num) & 1;
/* Allocate new node. */
newn = malloc(sizeof(*newn));
if (!newn) {
/* FIXME */
return false;
}
newn->nul_byte = '\0';
newn->byte_num = byte_num;
newn->bit_num = bit_num;
if (unlikely(!set_string(set, &newn->child[new_dir], member))) {
free(newn);
return false;
}
/* Find where to insert: not closest, but first which differs! */
np = set;
while (!np->u.s[0]) {
u8 direction = 0;
/* Special node which represents the empty string will
* break here too! */
if (np->u.n->byte_num > byte_num)
break;
/* Subtle: bit numbers are "backwards" for comparison */
if (np->u.n->byte_num == byte_num && np->u.n->bit_num < bit_num)
break;
if (np->u.n->byte_num < len) {
u8 c = bytes[np->u.n->byte_num];
direction = (c >> np->u.n->bit_num) & 1;
}
np = &np->u.n->child[direction];
}
newn->child[!new_dir]= *np;
np->u.n = newn;
return true;
}
char *strset_clear(struct strset *set, const char *member)
{
size_t len = strlen(member);
const u8 *bytes = (const u8 *)member;
struct strset *parent = NULL, *n;
const char *ret = NULL;
u8 direction = 0; /* prevent bogus gcc warning. */
/* Empty set? */
if (!set->u.n)
return NULL;
/* Find closest, but keep track of parent. */
n = set;
/* Anything with first byte 0 is a node. */
while (!n->u.s[0]) {
u8 c = 0;
/* Special node which represents the empty string. */
if (unlikely(n->u.n->byte_num == (size_t)-1)) {
const char *empty_str = n->u.n->child[0].u.s;
if (member[0])
return NULL;
/* Sew empty string back so remaining logic works */
free(n->u.n);
n->u.s = empty_str;
break;
}
parent = n;
if (n->u.n->byte_num < len) {
c = bytes[n->u.n->byte_num];
direction = (c >> n->u.n->bit_num) & 1;
} else
direction = 0;
n = &n->u.n->child[direction];
}
/* Did we find it? */
if (!streq(member, n->u.s))
return NULL;
ret = n->u.s;
if (!parent) {
/* We deleted last node. */
set->u.n = NULL;
} else {
struct node *old = parent->u.n;
/* Raise other node to parent. */
*parent = old->child[!direction];
free(old);
}
return (char *)ret;
}
static bool iterate(struct strset n,
bool (*handle)(const char *, void *), void *data)
{
if (n.u.s[0])
return handle(n.u.s, data);
if (unlikely(n.u.n->byte_num == (size_t)-1))
return handle(n.u.n->child[0].u.s, data);
return iterate(n.u.n->child[0], handle, data)
|| iterate(n.u.n->child[1], handle, data);
}
void strset_iterate_(const struct strset *set,
bool (*handle)(const char *, void *), void *data)
{
/* Empty set? */
if (!set->u.n)
return;
iterate(*set, handle, data);
}
const struct strset *strset_prefix(const struct strset *set, const char *prefix)
{
const struct strset *n, *top;
size_t len = strlen(prefix);
const u8 *bytes = (const u8 *)prefix;
/* Empty set -> return empty set. */
if (!set->u.n)
return set;
top = n = set;
/* We walk to find the top, but keep going to check prefix matches. */
while (!n->u.s[0]) {
u8 c = 0, direction;
/* Special node which represents the empty string. */
if (unlikely(n->u.n->byte_num == (size_t)-1)) {
n = &n->u.n->child[0];
break;
}
if (n->u.n->byte_num < len)
c = bytes[n->u.n->byte_num];
direction = (c >> n->u.n->bit_num) & 1;
n = &n->u.n->child[direction];
if (c)
top = n;
}
if (!strstarts(n->u.s, prefix)) {
/* Convenient return for prefixes which do not appear in set. */
static const struct strset empty_set;
return &empty_set;
}
return top;
}
static void destroy(struct strset n)
{
if (!n.u.s[0]) {
if (likely(n.u.n->byte_num != (size_t)-1)) {
destroy(n.u.n->child[0]);
destroy(n.u.n->child[1]);
}
free(n.u.n);
}
}
void strset_destroy(struct strset *set)
{
if (set->u.n)
destroy(*set);
set->u.n = NULL;
}
#ifndef CCAN_STRSET_H
#define CCAN_STRSET_H
#include "config.h"
#include <ccan/typesafe_cb/typesafe_cb.h>
#include <stdlib.h>
#include <stdbool.h>
/**
* struct strset - representation of a string set
*
* It's exposed here to allow you to embed it and so we can inline the
* trivial functions.
*/
struct strset {
union {
struct node *n;
const char *s;
} u;
};
/**
* strset_init - initialize a string set (empty)
*
* For completeness; if you've arranged for it to be NULL already you don't
* need this.
*
* Example:
* struct strset set;
*
* strset_init(&set);
*/
static inline void strset_init(struct strset *set)
{
set->u.n = NULL;
}
/**
* strset_empty - is this string set empty?
* @set: the set.
*
* Example:
* if (!strset_empty(&set))
* abort();
*/
static inline bool strset_empty(const struct strset *set)
{
return set->u.n == NULL;
}
/**
* strset_test - is this a member of this string set?
* @set: the set.
* @member: the string to search for.
*
* Returns the member, or NULL if it isn't in the set.
*
* Example:
* if (strset_test(&set, "hello"))
* printf("hello is in the set\n");
*/
char *strset_test(const struct strset *set, const char *member);
/**
* strset_set - place a member in the string set.
* @set: the set.
* @member: the string to place in the set.
*
* This returns false if we run out of memory, or (more normally) if that
* string already appears in the set.
*
* Note that the pointer is placed in the set, the string is not copied. If
* you want a copy in the set, use strdup().
*
* Example:
* if (!strset_set(&set, "goodbye"))
* printf("goodbye was already in the set\n");
*/
bool strset_set(struct strset *set, const char *member);
/**
* strset_clear - remove a member from the string set.
* @set: the set.
* @member: the string to remove from the set.
*
* This returns the string which was passed to strset_set(), or NULL.
* This means that if you allocated a string (eg. using strdup()), you can
* free it here.
*
* Example:
* if (!strset_clear(&set, "goodbye"))
* printf("goodbye was not in the set?\n");
*/
char *strset_clear(struct strset *set, const char *member);
/**
* strset_destroy - remove every member from the set.
* @set: the set.
*
* The set will be empty after this.
*
* Example:
* strset_destroy(&set);
*/
void strset_destroy(struct strset *set);
/**
* strset_iterate - ordered iteration over a set
* @set: the set.
* @handle: the function to call.
* @arg: the argument for the function (types should match).
*
* You should not alter the set within the @handle function! If it returns
* true, the iteration will stop.
*
* Example:
* static bool dump_some(const char *member, int *num)
* {
* // Only dump out num nodes.
* if (*(num--) == 0)
* return true;
* printf("%s\n", member);
* return false;
* }
*
* static void dump_set(const struct strset *set)
* {
* int max = 100;
* strset_iterate(set, dump_some, &max);
* if (max < 0)
* printf("... (truncated to 100 entries)\n");
* }
*/
#define strset_iterate(set, handle, arg) \
strset_iterate_((set), typesafe_cb_preargs(bool, void *, \
(handle), (arg), \
const char *), \
(arg))
void strset_iterate_(const struct strset *set,
bool (*handle)(const char *, void *), void *data);
/**
* strset_prefix - return a subset matching a prefix
* @set: the set.
* @prefix: the prefix.
*
* This returns a pointer into @set, so don't alter @set while using
* the return value. You can use strset_iterate(), strset_test() or
* strset_empty() on the returned pointer.
*
* Example:
* static void dump_prefix(const struct strset *set, const char *prefix)
* {
* int max = 100;
* printf("Nodes with prefix %s:\n", prefix);
* strset_iterate(strset_prefix(set, prefix), dump_some, &max);
* if (max < 0)
* printf("... (truncated to 100 entries)\n");
* }
*/
const struct strset *strset_prefix(const struct strset *set,
const char *prefix);
#endif /* CCAN_STRSET_H */
/* Test high bit handling. */
#include <ccan/strset/strset.h>
#include <ccan/strset/strset.c>
#include <ccan/tap/tap.h>
#define NUM 1000
static void encode(char template[3], unsigned int val)
{
assert(val < 255 * 255);
template[0] = (val / 255) + 1;
template[1] = (val % 255) + 1;
template[2] = '\0';
}
static bool in_order(const char *value, unsigned int *count)
{
char template[3];
encode(template, *count);
ok1(streq(value, template));
(*count)++;
return false;
}
static bool in_order_by_2(const char *value, unsigned int *count)
{
char template[3];
encode(template, *count);
ok1(streq(value, template));
(*count) += 2;
return false;
}
static bool dump(const char *value, void *unused)
{
diag("%s", value);
return false;
}
int main(void)
{
struct strset set;
unsigned int i;
char *str[NUM];
plan_tests(NUM + 3 * NUM / 2);
strset_init(&set);
for (i = 0; i < NUM; i++) {
char template[3];
encode(template, i);
str[i] = strdup(template);
}
for (i = 0; i < NUM; i++)
strset_set(&set, str[i]);
strset_iterate(&set, dump, NULL);
/* Iterate. */
i = 0;
strset_iterate(&set, in_order, &i);
/* Preserve order after deletion. */
for (i = 0; i < NUM; i += 2)
ok1(strset_clear(&set, str[i]) == str[i]);
i = 1;
strset_iterate(&set, in_order_by_2, &i);
for (i = 1; i < NUM; i += 2)
ok1(strset_clear(&set, str[i]) == str[i]);
/* empty traverse. */
strset_iterate(&set, in_order_by_2, (unsigned int *)NULL);
for (i = 0; i < NUM; i++)
free(str[i]);
/* This exits depending on whether all tests passed */
return exit_status();
}
#include <ccan/strset/strset.h>
#include <ccan/strset/strset.c>
#include <ccan/tap/tap.h>
#include <stdio.h>
#define NUM 1000
static bool in_order(const char *value, unsigned int *count)
{
int i = atoi(value);
ok1(*count == i);
(*count)++;
return false;
}
static bool in_order_by_2(const char *value, unsigned int *count)
{
int i = atoi(value);
ok1(*count == i);
(*count) += 2;
return false;
}
static bool dump(const char *value, void *unused)
{
diag("%s", value);
return false;
}
int main(void)
{
struct strset set;
unsigned int i;
char *str[NUM];
plan_tests(NUM * 2 + 3 * NUM / 2);
strset_init(&set);
for (i = 0; i < NUM; i++) {
char template[10];
sprintf(template, "%08u", i);
str[i] = strdup(template);
}
for (i = 0; i < NUM; i++)
strset_set(&set, str[i]);
strset_iterate(&set, dump, NULL);
/* Iterate. */
i = 0;
strset_iterate(&set, in_order, &i);
/* Preserve order after deletion. */
for (i = 0; i < NUM; i += 2)
ok1(strset_clear(&set, str[i]) == str[i]);
i = 1;
strset_iterate(&set, in_order_by_2, &i);
for (i = 1; i < NUM; i += 2)
ok1(strset_clear(&set, str[i]) == str[i]);
/* empty traverse. */
strset_iterate(&set, in_order_by_2, (unsigned int *)NULL);
/* insert backwards, should be fine. */
for (i = 0; i < NUM; i++)
strset_set(&set, str[NUM-1-i]);
i = 0;
strset_iterate(&set, in_order, &i);
strset_destroy(&set);
for (i = 0; i < NUM; i++)
free(str[i]);
/* This exits depending on whether all tests passed */
return exit_status();
}
#include <ccan/strset/strset.h>
#include <ccan/strset/strset.c>
#include <ccan/tap/tap.h>
#include <stdio.h>
/* Must be > 100, see below. */
#define NUM 200
static bool in_order(const char *value, unsigned int *count)
{
int i = atoi(value);
ok1(*count == i);
(*count)++;
return false;
}
static bool find_empty(const char *value, char *empty)
{
if (value == empty)
pass("Found empty entry!");
return false;
}
int main(void)
{
struct strset set;
const struct strset *sub;
unsigned int i;
char *str[NUM], *empty;
plan_tests(7 + 1 + 10 + 100);
strset_init(&set);
for (i = 0; i < NUM; i++) {
char template[10];
sprintf(template, "%08u", i);
str[i] = strdup(template);
}
for (i = 0; i < NUM; i++)
strset_set(&set, str[i]);
/* Nothing */
sub = strset_prefix(&set, "a");
ok1(strset_empty(sub));
/* Everything */
sub = strset_prefix(&set, "0");
ok1(sub->u.n == set.u.n);
sub = strset_prefix(&set, "");
ok1(sub->u.n == set.u.n);
/* Singleton. */
sub = strset_prefix(&set, "00000000");
i = 0;
strset_iterate(sub, in_order, &i);
ok1(i == 1);
/* First 10. */
sub = strset_prefix(&set, "0000000");
i = 0;
strset_iterate(sub, in_order, &i);
ok1(i == 10);
/* First 100. */
sub = strset_prefix(&set, "000000");
i = 0;
strset_iterate(sub, in_order, &i);
ok1(i == 100);
/* Everything, *plus* empty string. */
empty = strdup("");
strset_set(&set, empty);
sub = strset_prefix(&set, "");
/* Check we get *our* empty string back! */
strset_iterate(sub, find_empty, empty);
strset_destroy(&set);
for (i = 0; i < NUM; i++)
free(str[i]);
free(empty);
/* This exits depending on whether all tests passed */
return exit_status();
}
#include <ccan/strset/strset.h>
#include <ccan/strset/strset.c>
#include <ccan/tap/tap.h>
int main(void)
{
struct strset set;
const char str[] = "hello";
const char none[] = "";
char *dup = strdup(str);
/* This is how many tests you plan to run */
plan_tests(24);
strset_init(&set);
ok1(!strset_test(&set, str));
ok1(!strset_test(&set, none));
ok1(!strset_clear(&set, str));
ok1(!strset_clear(&set, none));
ok1(strset_set(&set, str));
ok1(strset_test(&set, str));
/* We compare the string, not the pointer. */
ok1(strset_test(&set, dup));
ok1(!strset_test(&set, none));
/* Delete should return original string. */
ok1(strset_clear(&set, dup) == str);
ok1(!strset_test(&set, str));
ok1(!strset_test(&set, none));
/* Try insert and delete of empty string. */
ok1(strset_set(&set, none));
ok1(strset_test(&set, none));
ok1(!strset_test(&set, str));
/* Delete should return original string. */
ok1(strset_clear(&set, "") == none);
ok1(!strset_test(&set, str));
ok1(!strset_test(&set, none));
/* Both at once... */
ok1(strset_set(&set, none));
ok1(strset_set(&set, str));
ok1(strset_test(&set, str));
ok1(strset_test(&set, none));
ok1(strset_clear(&set, "") == none);
ok1(strset_clear(&set, dup) == str);
ok1(set.u.n == NULL);
free(dup);
/* This exits depending on whether all tests passed */
return exit_status();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment