Commit 1a24a870 authored by Rusty Russell

tdb2: use immobile free buckets, rename tests to show some ordering.

We put the free lists at the beginning of a zone; this means no record
can be larger than a zone, but means they cannot move.  Once we change
hashes to be expanding, they won't move either and the result should be
simpler.
parent 95458baf
This diff is collapsed.
This diff is collapsed.
......@@ -70,6 +70,8 @@ void tdb_mmap(struct tdb_context *tdb)
static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
{
struct stat st;
int ret;
if (len <= tdb->map_size)
return 0;
if (tdb->flags & TDB_INTERNAL) {
......@@ -85,7 +87,14 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
return -1;
}
if (fstat(tdb->fd, &st) == -1) {
if (tdb_lock_expand(tdb, F_RDLCK) != 0)
return -1;
ret = fstat(tdb->fd, &st);
tdb_unlock_expand(tdb, F_RDLCK);
if (ret == -1) {
tdb->ecode = TDB_ERR_IO;
return -1;
}
......@@ -103,6 +112,7 @@ static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, bool probe)
/* Unmap, update size, remap */
tdb_munmap(tdb);
tdb->map_size = st.st_size;
tdb_mmap(tdb);
return 0;
......
......@@ -26,6 +26,8 @@
*/
#include "private.h"
#include <assert.h>
#include <ccan/build_assert/build_assert.h>
static int fcntl_lock(struct tdb_context *tdb,
int rw, off_t off, off_t len, bool waitflag)
......@@ -255,19 +257,14 @@ static int tdb_nest_lock(struct tdb_context *tdb, tdb_off_t offset, int ltype,
{
struct tdb_lock_type *new_lck;
/* Header is not valid for open lock; valgrind complains. */
if (offset >= TDB_HASH_LOCK_START) {
if (offset > TDB_HASH_LOCK_START
+ (1ULL << tdb->header.v.hash_bits)
+ (tdb->header.v.num_zones
* (tdb->header.v.free_buckets+1))) {
tdb->ecode = TDB_ERR_LOCK;
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"tdb_lock: invalid offset %llu ltype=%d\n",
(long long)offset, ltype);
return -1;
}
if (offset >= TDB_HASH_LOCK_START + (1 << 30) + tdb->map_size / 8) {
tdb->ecode = TDB_ERR_LOCK;
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"tdb_lock: invalid offset %llu ltype=%d\n",
(long long)offset, ltype);
return -1;
}
if (tdb->flags & TDB_NOLOCK)
return 0;
......@@ -534,6 +531,16 @@ void tdb_unlock_open(struct tdb_context *tdb)
tdb_nest_unlock(tdb, TDB_OPEN_LOCK, F_WRLCK);
}
/* Serialize file expansion: take the expansion lock.
 *
 * @ltype is handed straight to tdb_nest_lock() (tdb_oob() uses F_RDLCK
 * around its fstat()).  The lock is always requested with TDB_LOCK_WAIT.
 * The result of tdb_nest_lock() is returned unchanged; tdb_oob() treats
 * any non-zero value as failure.
 */
int tdb_lock_expand(struct tdb_context *tdb, int ltype)
{
	int rc;

	rc = tdb_nest_lock(tdb, TDB_EXPANSION_LOCK, ltype, TDB_LOCK_WAIT);
	return rc;
}
/* Release the expansion lock taken by tdb_lock_expand().
 *
 * @ltype is forwarded to tdb_nest_unlock(); tdb_oob() passes the same
 * lock type it used when locking.
 */
void tdb_unlock_expand(struct tdb_context *tdb, int ltype)
{
	tdb_nest_unlock(tdb,
			TDB_EXPANSION_LOCK,
			ltype);
}
/* unlock entire db */
int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype)
{
......@@ -687,10 +694,21 @@ int tdb_unlock_list(struct tdb_context *tdb, tdb_off_t list, int ltype)
}
}
/* Free list locks come after hash locks */
int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist,
enum tdb_lock_flags waitflag)
/* Translate the file offset of a free bucket into its lock offset.
 *
 * Hash chain locks own TDB_HASH_LOCK_START and the 30 bits that follow,
 * so free bucket locks begin immediately after that range.  Buckets are
 * tdb_off_t entries, hence the division by sizeof(tdb_off_t): it keeps
 * the lock values compact, so that on 32 bit systems we never use a lock
 * value above 2^31 for files smaller than 4GB.
 */
static tdb_off_t free_lock_off(tdb_off_t b_off)
{
	const tdb_off_t bucket_index = b_off / sizeof(tdb_off_t);

	return TDB_HASH_LOCK_START + (1 << 30) + bucket_index;
}
int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
enum tdb_lock_flags waitflag)
{
assert(b_off >= sizeof(struct tdb_header));
/* You're supposed to have a hash lock first! */
if (!(tdb->flags & TDB_NOLOCK) && !tdb_has_locks(tdb)) {
tdb->ecode = TDB_ERR_LOCK;
......@@ -709,19 +727,15 @@ int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist,
return -1;
}
return tdb_nest_lock(tdb, TDB_HASH_LOCK_START
+ (1ULL << tdb->header.v.hash_bits)
+ flist, F_WRLCK, waitflag);
return tdb_nest_lock(tdb, free_lock_off(b_off), F_WRLCK, waitflag);
}
void tdb_unlock_free_list(struct tdb_context *tdb, tdb_off_t flist)
void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off)
{
if (tdb->allrecord_lock.count)
return;
tdb_nest_unlock(tdb, TDB_HASH_LOCK_START
+ (1ULL << tdb->header.v.hash_bits)
+ flist, F_WRLCK);
tdb_nest_unlock(tdb, free_lock_off(b_off), F_WRLCK);
}
/* Even if the entry isn't in this hash bucket, you'd have to lock this
......
......@@ -67,12 +67,10 @@ typedef uint64_t tdb_off_t;
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION ((uint64_t)(0x26011967 + 7))
#define TDB_MAGIC ((uint64_t)0x1999)
#define TDB_FREE_MAGIC (~(uint64_t)TDB_MAGIC)
#define TDB_FREE_MAGIC ((~(uint64_t)TDB_MAGIC) << 6)
#define TDB_HASH_MAGIC (0xA1ABE11A01092008ULL)
#define TDB_RECOVERY_MAGIC (0xf53bc0e7U)
#define TDB_RECOVERY_INVALID_MAGIC (0x0)
#define TDB_EXTRA_HASHBITS (11) /* We steal 11 bits to stash hash info. */
#define TDB_EXTRA_HASHBITS_NUM (3)
#define TDB_OFF_ERR ((tdb_off_t)-1)
......@@ -80,13 +78,21 @@ typedef uint64_t tdb_off_t;
#define TDB_OPEN_LOCK 0
/* Doing a transaction. */
#define TDB_TRANSACTION_LOCK 1
/* Expanding file. */
#define TDB_EXPANSION_LOCK 2
/* Hash chain locks. */
#define TDB_HASH_LOCK_START 2
#define TDB_HASH_LOCK_START 3
/* We start with 256 hash buckets, 10 free buckets. A 4k-sized zone. */
/* We start with 256 hash buckets, and a 64k-sized zone. */
#define INITIAL_HASH_BITS 8
#define INITIAL_FREE_BUCKETS 10
#define INITIAL_ZONE_BITS 12
#define INITIAL_ZONE_BITS 16
/* Try to create zones at least 32 times larger than allocations. */
#define TDB_COMFORT_FACTOR_BITS 5
/* We ensure buckets up to size 1 << (zone_bits - TDB_COMFORT_FACTOR_BITS). */
/* FIXME: test this matches size_to_bucket! */
#define BUCKETS_FOR_ZONE(zone_bits) ((zone_bits) + 2 - TDB_COMFORT_FACTOR_BITS)
#if !HAVE_BSWAP_64
static inline uint64_t bswap_64(uint64_t x)
......@@ -106,8 +112,9 @@ struct tdb_used_record {
/* For on-disk compatibility, we avoid bitfields:
magic: 16, (highest)
key_len_bits: 5,
hash:11,
extra_padding: 32 (lowest)
extra_padding: 32
hash_bits: 5,
zone_bits: 6 (lowest)
*/
uint64_t magic_and_meta;
/* The bottom key_len_bits*2 are key length, rest is data length. */
......@@ -131,12 +138,17 @@ static inline uint64_t rec_data_length(const struct tdb_used_record *r)
static inline uint64_t rec_extra_padding(const struct tdb_used_record *r)
{
return r->magic_and_meta & 0xFFFFFFFF;
return (r->magic_and_meta >> 11) & 0xFFFFFFFF;
}
/* Extract the zone bits of a used record: the lowest 6 bits of
 * magic_and_meta. */
static inline unsigned int rec_zone_bits(const struct tdb_used_record *r)
{
	const uint64_t zone_mask = (1 << 6) - 1;

	return r->magic_and_meta & zone_mask;
}
static inline uint64_t rec_hash(const struct tdb_used_record *r)
static inline uint32_t rec_hash(const struct tdb_used_record *r)
{
return ((r->magic_and_meta >> 32) & ((1ULL << 11) - 1));
return (r->magic_and_meta >> 6) & ((1 << 5) - 1);
}
static inline uint16_t rec_magic(const struct tdb_used_record *r)
......@@ -145,26 +157,33 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
}
struct tdb_free_record {
uint64_t magic;
uint64_t magic_and_meta; /* Bottom 6 bits are zone bits. */
uint64_t data_len; /* Not counting these two fields. */
/* This is why the minimum record size is 16 bytes. */
uint64_t next, prev;
};
/* Extract the zone bits of a free record: the bottom 6 bits of
 * magic_and_meta. */
static inline unsigned int frec_zone_bits(const struct tdb_free_record *f)
{
	const uint64_t zone_mask = (1 << 6) - 1;

	return f->magic_and_meta & zone_mask;
}
/* Return the magic part of a free record: magic_and_meta with the bottom
 * 6 bits (the zone bits) cleared. */
static inline uint64_t frec_magic(const struct tdb_free_record *f)
{
	const uint64_t zone_mask = (1ULL << 6) - 1;

	return f->magic_and_meta & ~zone_mask;
}
/* These parts can change while we have db open. */
struct tdb_header_volatile {
uint64_t generation; /* Makes sure it changes on every update. */
uint64_t hash_bits; /* Entries in hash table. */
uint64_t hash_off; /* Offset of hash table. */
uint64_t num_zones; /* How many zones in the file. */
uint64_t zone_bits; /* Size of zones. */
uint64_t free_buckets; /* How many buckets in each zone. */
uint64_t free_off; /* Arrays of free entries. */
};
/* this is stored at the front of every database */
struct tdb_header {
char magic_food[32]; /* for /etc/magic */
/* FIXME: Make me 32 bit? */
uint64_t version; /* version of the code */
uint64_t hash_test; /* result of hashing HASH_MAGIC. */
uint64_t hash_seed; /* "random" seed written at creation time. */
......@@ -174,6 +193,16 @@ struct tdb_header {
tdb_off_t reserved[19];
};
/* Each zone has its set of free lists at the head.
 *
 * For each zone we have a series of per-size buckets, and a final bucket for
 * "too big".
 *
 * Only the fixed part is declared here; the bucket array is laid out by the
 * users of this struct directly after it (see the "free" array that follows
 * the zone header in struct new_db_head). */
struct free_zone_header {
/* How much does this zone cover?  Stored as a bit count: the layout code
 * treats a zone as spanning 1ULL << zone_bits bytes. */
uint64_t zone_bits;
/* tdb_off_t buckets[free_buckets + 1] */
};
enum tdb_lock_flags {
/* WAIT == F_SETLKW, NOWAIT == F_SETLK */
TDB_LOCK_NOWAIT = 0,
......@@ -227,7 +256,9 @@ struct tdb_context {
struct tdb_transaction *transaction;
/* What zone of the tdb to use, for spreading load. */
uint64_t last_zone;
uint64_t zone_off;
/* Cached copy of zone header. */
struct free_zone_header zhdr;
/* IO methods: changes for transactions. */
const struct tdb_methods *methods;
......@@ -268,25 +299,26 @@ tdb_off_t hash_off(struct tdb_context *tdb, uint64_t list);
/* free.c: */
void tdb_zone_init(struct tdb_context *tdb);
int tdb_zone_init(struct tdb_context *tdb);
/* If this fails, try tdb_expand. */
tdb_off_t alloc(struct tdb_context *tdb, size_t keylen, size_t datalen,
uint64_t hash, bool growing);
/* Put this record in a free list. */
int add_free_record(struct tdb_context *tdb,
int add_free_record(struct tdb_context *tdb, unsigned int zone_bits,
tdb_off_t off, tdb_len_t len_with_header);
/* Set up header for a used record. */
int set_header(struct tdb_context *tdb,
struct tdb_used_record *rec,
uint64_t keylen, uint64_t datalen,
uint64_t actuallen, uint64_t hash);
uint64_t actuallen, uint64_t hash,
unsigned int zone_bits);
/* Used by tdb_check to verify. */
unsigned int size_to_bucket(struct tdb_context *tdb, tdb_len_t data_len);
tdb_off_t zone_of(struct tdb_context *tdb, tdb_off_t off);
unsigned int size_to_bucket(unsigned int free_buckets, tdb_len_t data_len);
tdb_off_t bucket_off(tdb_off_t zone_off, tdb_off_t bucket);
/* io.c: */
/* Initialize tdb->methods. */
......@@ -352,10 +384,10 @@ tdb_off_t tdb_lock_list(struct tdb_context *tdb, uint64_t hash,
int ltype, enum tdb_lock_flags waitflag);
int tdb_unlock_list(struct tdb_context *tdb, tdb_off_t list, int ltype);
/* Lock/unlock a particular free list. */
int tdb_lock_free_list(struct tdb_context *tdb, tdb_off_t flist,
enum tdb_lock_flags waitflag);
void tdb_unlock_free_list(struct tdb_context *tdb, tdb_off_t flist);
/* Lock/unlock a particular free bucket. */
int tdb_lock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off,
enum tdb_lock_flags waitflag);
void tdb_unlock_free_bucket(struct tdb_context *tdb, tdb_off_t b_off);
/* Do we have any locks? */
bool tdb_has_locks(struct tdb_context *tdb);
......@@ -368,6 +400,11 @@ int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype);
/* Serialize db open. */
int tdb_lock_open(struct tdb_context *tdb);
void tdb_unlock_open(struct tdb_context *tdb);
/* Serialize db expand. */
int tdb_lock_expand(struct tdb_context *tdb, int ltype);
void tdb_unlock_expand(struct tdb_context *tdb, int ltype);
/* Expand the file. */
int tdb_expand(struct tdb_context *tdb, tdb_len_t klen, tdb_len_t dlen,
bool growing);
......
......@@ -124,12 +124,22 @@ static uint64_t random_number(struct tdb_context *tdb)
return ret;
}
struct new_database {
struct new_db_head {
struct tdb_header hdr;
struct free_zone_header zhdr;
tdb_off_t free[BUCKETS_FOR_ZONE(INITIAL_ZONE_BITS) + 1];
struct tdb_used_record hrec;
tdb_off_t hash[1ULL << INITIAL_HASH_BITS];
struct tdb_used_record frec;
tdb_off_t free[INITIAL_FREE_BUCKETS + 1]; /* One overflow bucket */
struct tdb_free_record frec;
};
/* In-memory image of a brand new database: the fixed head, the remainder of
 * the initial zone, and the trailing tailer byte. */
struct new_database {
struct new_db_head h;
/* Rest up to 1 << INITIAL_ZONE_BITS is empty. */
/* Sized so that the part of the head after the tdb_header, plus this array,
 * adds up to exactly 1 << INITIAL_ZONE_BITS bytes. */
char space[(1 << INITIAL_ZONE_BITS)
- (sizeof(struct new_db_head) - sizeof(struct tdb_header))];
/* One byte after the zone; tdb_new_database() stores INITIAL_ZONE_BITS here. */
uint8_t tailer;
/* Don't count final padding! */
/* (tdb_new_database() computes the size as offsetof(tailer) + sizeof(tailer)
 * rather than sizeof(struct new_database) for this reason.) */
};
/* initialise a new database */
......@@ -137,51 +147,61 @@ static int tdb_new_database(struct tdb_context *tdb)
{
/* We make it up in memory, then write it out if not internal */
struct new_database newdb;
unsigned int magic_off = offsetof(struct tdb_header, magic_food);
unsigned int bucket, magic_off, dbsize;
/* Fill in the header */
newdb.hdr.version = TDB_VERSION;
newdb.hdr.hash_seed = random_number(tdb);
newdb.hdr.hash_test = TDB_HASH_MAGIC;
newdb.hdr.hash_test = tdb->khash(&newdb.hdr.hash_test,
sizeof(newdb.hdr.hash_test),
newdb.hdr.hash_seed,
tdb->hash_priv);
memset(newdb.hdr.reserved, 0, sizeof(newdb.hdr.reserved));
newdb.hdr.v.generation = 0;
/* The initial zone must cover the initial database size! */
BUILD_ASSERT((1ULL << INITIAL_ZONE_BITS) >= sizeof(newdb));
/* Free array has 1 zone, 10 buckets. All buckets empty. */
newdb.hdr.v.num_zones = 1;
newdb.hdr.v.zone_bits = INITIAL_ZONE_BITS;
newdb.hdr.v.free_buckets = INITIAL_FREE_BUCKETS;
newdb.hdr.v.free_off = offsetof(struct new_database, free);
set_header(tdb, &newdb.frec, 0,
sizeof(newdb.free), sizeof(newdb.free), 0);
memset(newdb.free, 0, sizeof(newdb.free));
/* Don't want any extra padding! */
dbsize = offsetof(struct new_database, tailer) + sizeof(newdb.tailer);
/* Fill in the header */
newdb.h.hdr.version = TDB_VERSION;
newdb.h.hdr.hash_seed = random_number(tdb);
newdb.h.hdr.hash_test = TDB_HASH_MAGIC;
newdb.h.hdr.hash_test = tdb->khash(&newdb.h.hdr.hash_test,
sizeof(newdb.h.hdr.hash_test),
newdb.h.hdr.hash_seed,
tdb->hash_priv);
memset(newdb.h.hdr.reserved, 0, sizeof(newdb.h.hdr.reserved));
newdb.h.hdr.v.generation = 0;
/* Initial hashes are empty. */
newdb.hdr.v.hash_bits = INITIAL_HASH_BITS;
newdb.hdr.v.hash_off = offsetof(struct new_database, hash);
set_header(tdb, &newdb.hrec, 0,
sizeof(newdb.hash), sizeof(newdb.hash), 0);
memset(newdb.hash, 0, sizeof(newdb.hash));
newdb.h.hdr.v.hash_bits = INITIAL_HASH_BITS;
newdb.h.hdr.v.hash_off = offsetof(struct new_database, h.hash);
set_header(tdb, &newdb.h.hrec, 0,
sizeof(newdb.h.hash), sizeof(newdb.h.hash), 0,
INITIAL_ZONE_BITS);
memset(newdb.h.hash, 0, sizeof(newdb.h.hash));
/* Create the single free entry. */
newdb.h.frec.magic_and_meta = TDB_FREE_MAGIC | INITIAL_ZONE_BITS;
newdb.h.frec.data_len = (sizeof(newdb.h.frec)
- sizeof(struct tdb_used_record)
+ sizeof(newdb.space));
/* Free is mostly empty... */
newdb.h.zhdr.zone_bits = INITIAL_ZONE_BITS;
memset(newdb.h.free, 0, sizeof(newdb.h.free));
/* ... except for this one bucket. */
bucket = size_to_bucket(INITIAL_ZONE_BITS, newdb.h.frec.data_len);
newdb.h.free[bucket] = offsetof(struct new_database, h.frec);
newdb.h.frec.next = newdb.h.frec.prev = 0;
/* Tailer contains maximum number of free_zone bits. */
newdb.tailer = INITIAL_ZONE_BITS;
/* Magic food */
memset(newdb.hdr.magic_food, 0, sizeof(newdb.hdr.magic_food));
strcpy(newdb.hdr.magic_food, TDB_MAGIC_FOOD);
memset(newdb.h.hdr.magic_food, 0, sizeof(newdb.h.hdr.magic_food));
strcpy(newdb.h.hdr.magic_food, TDB_MAGIC_FOOD);
/* This creates an endian-converted database, as if read from disk */
magic_off = offsetof(struct tdb_header, magic_food);
tdb_convert(tdb,
(char *)&newdb.hdr + magic_off,
sizeof(newdb) - magic_off);
(char *)&newdb.h.hdr + magic_off,
dbsize - 1 - magic_off);
tdb->header = newdb.hdr;
tdb->header = newdb.h.hdr;
if (tdb->flags & TDB_INTERNAL) {
tdb->map_size = sizeof(newdb);
tdb->map_size = dbsize;
tdb->map_ptr = malloc(tdb->map_size);
if (!tdb->map_ptr) {
tdb->ecode = TDB_ERR_OOM;
......@@ -196,7 +216,7 @@ static int tdb_new_database(struct tdb_context *tdb)
if (ftruncate(tdb->fd, 0) == -1)
return -1;
if (!tdb_pwrite_all(tdb->fd, &newdb, sizeof(newdb), 0)) {
if (!tdb_pwrite_all(tdb->fd, &newdb, dbsize, 0)) {
tdb->ecode = TDB_ERR_IO;
return -1;
}
......@@ -222,7 +242,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
tdb->name = NULL;
tdb->map_ptr = NULL;
tdb->fd = -1;
/* map_size will be set below. */
tdb->map_size = sizeof(struct tdb_header);
tdb->ecode = TDB_SUCCESS;
/* header will be read in below. */
tdb->header_uptodate = false;
......@@ -280,8 +300,7 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
}
TEST_IT(tdb->flags & TDB_CONVERT);
tdb_convert(tdb, &tdb->header, sizeof(tdb->header));
/* Zones don't matter for internal db. */
tdb->last_zone = 0;
tdb_zone_init(tdb);
return tdb;
}
......@@ -357,12 +376,16 @@ struct tdb_context *tdb_open(const char *name, int tdb_flags,
goto fail;
}
tdb->map_size = st.st_size;
tdb->device = st.st_dev;
tdb->inode = st.st_ino;
tdb_mmap(tdb);
tdb_unlock_open(tdb);
tdb_zone_init(tdb);
/* This makes sure we have current map_size and mmap. */
tdb->methods->oob(tdb, tdb->map_size + 1, true);
/* Now we can pick a random free zone to start from. */
if (tdb_zone_init(tdb) == -1)
goto fail;
tdb->next = tdbs;
tdbs = tdb;
......@@ -543,7 +566,8 @@ static int update_rec_hdr(struct tdb_context *tdb,
{
uint64_t dataroom = rec_data_length(rec) + rec_extra_padding(rec);
if (set_header(tdb, rec, keylen, datalen, keylen + dataroom, h))
if (set_header(tdb, rec, keylen, datalen, keylen + dataroom, h,
rec_zone_bits(rec)))
return -1;
return tdb_write_convert(tdb, off, rec, sizeof(*rec));
......@@ -645,7 +669,7 @@ again:
r = tdb_get(tdb, oldoff - sizeof(*r), &pad, sizeof(*r));
if (!r)
goto oldheader;
add_free_record(tdb, oldoff - sizeof(*r),
add_free_record(tdb, rec_zone_bits(r), oldoff - sizeof(*r),
sizeof(*r)+rec_data_length(r)+rec_extra_padding(r));
/* Now we write the modified header. */
......@@ -736,6 +760,7 @@ static int replace_data(struct tdb_context *tdb,
uint64_t h, struct tdb_data key, struct tdb_data dbuf,
tdb_off_t bucket,
tdb_off_t old_off, tdb_len_t old_room,
unsigned old_zone,
bool growing)
{
tdb_off_t new_off;
......@@ -750,7 +775,7 @@ static int replace_data(struct tdb_context *tdb,
/* We didn't like the existing one: remove it. */
if (old_off)
add_free_record(tdb, old_off,
add_free_record(tdb, old_zone, old_off,
sizeof(struct tdb_used_record)
+ key.dsize + old_room);
......@@ -820,7 +845,8 @@ int tdb_store(struct tdb_context *tdb,
}
/* If we didn't use the old record, this implies we're growing. */
ret = replace_data(tdb, h, key, dbuf, bucket, off, old_room, off != 0);
ret = replace_data(tdb, h, key, dbuf, bucket, off, old_room,
rec_zone_bits(&rec), off != 0);
unlock_lists(tdb, start, num, F_WRLCK);
if (unlikely(ret == 1)) {
......@@ -902,7 +928,8 @@ int tdb_append(struct tdb_context *tdb,
}
/* If they're using tdb_append(), it implies they're growing record. */
ret = replace_data(tdb, h, key, new_dbuf, bucket, off, old_room, true);
ret = replace_data(tdb, h, key, new_dbuf, bucket, off, old_room,
rec_zone_bits(&rec), true);
unlock_lists(tdb, start, num, F_WRLCK);
free(newdata);
......@@ -1012,7 +1039,7 @@ int tdb_delete(struct tdb_context *tdb, struct tdb_data key)
}
/* Free the deleted entry. */
if (add_free_record(tdb, off,
if (add_free_record(tdb, rec_zone_bits(&rec), off,
sizeof(struct tdb_used_record)
+ rec_key_length(&rec)
+ rec_data_length(&rec)
......
......@@ -10,7 +10,7 @@ struct tdb_layout *new_tdb_layout(void)
struct tdb_layout *layout = malloc(sizeof(*layout));
layout->num_elems = 0;
layout->elem = NULL;
layout->ftable = layout->htable = -1;
layout->htable = -1;
return layout;
}
......@@ -22,6 +22,18 @@ static void add(struct tdb_layout *layout, union tdb_layout_elem elem)
layout->elem[layout->num_elems++] = elem;
}
void tdb_layout_add_zone(struct tdb_layout *layout,
unsigned int zone_bits,
bool fill_prev)
{
union tdb_layout_elem elem;
if (fill_prev)
tdb_layout_add_free(layout, 0);
elem.base.type = ZONE;
elem.zone.zone_bits = zone_bits;
add(layout, elem);
}
void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len)
{
union tdb_layout_elem elem;
......@@ -64,23 +76,6 @@ void tdb_layout_add_hashtable(struct tdb_layout *layout,
add(layout, elem);
}
void tdb_layout_add_freetable(struct tdb_layout *layout,
unsigned int num_zones,
unsigned int zone_bits,
unsigned int num_buckets,
tdb_len_t extra)
{
union tdb_layout_elem elem;
elem.base.type = FREETABLE;
elem.freetable.num_zones = num_zones;
elem.freetable.zone_bits = zone_bits;
elem.freetable.num_buckets = num_buckets;
elem.freetable.extra = extra;
assert(layout->ftable == -1U);
layout->ftable = layout->num_elems;
add(layout, elem);
}
static tdb_len_t free_record_len(tdb_len_t len)
{
return sizeof(struct tdb_used_record) + len;
......@@ -101,11 +96,10 @@ static tdb_len_t hashtable_len(struct tle_hashtable *htable)
+ (sizeof(tdb_off_t) << htable->hash_bits);
}
static tdb_len_t freetable_len(struct tle_freetable *ftable)
static tdb_len_t zone_header_len(struct tle_zone *zone)
{
return sizeof(struct tdb_used_record)
+ (sizeof(tdb_off_t) * ftable->num_zones
* (ftable->num_buckets + 1));
return sizeof(struct free_zone_header)
+ sizeof(tdb_off_t) * (BUCKETS_FOR_ZONE(zone->zone_bits)+1);
}
static void set_free_record(void *mem, tdb_len_t len)
......@@ -114,43 +108,47 @@ static void set_free_record(void *mem, tdb_len_t len)
}
static void set_data_record(void *mem, struct tdb_context *tdb,
struct tle_zone *last_zone,
struct tle_used *used)
{
struct tdb_used_record *u = mem;
set_header(tdb, u, used->key.dsize, used->data.dsize,
used->key.dsize + used->data.dsize + used->extra,
tdb_hash(tdb, used->key.dptr, used->key.dsize));
tdb_hash(tdb, used->key.dptr, used->key.dsize),
last_zone->zone_bits);
memcpy(u + 1, used->key.dptr, used->key.dsize);
memcpy((char *)(u + 1) + used->key.dsize,
used->data.dptr, used->data.dsize);
}
static void set_hashtable(void *mem, struct tdb_context *tdb,
struct tle_zone *last_zone,
struct tle_hashtable *htable)
{
struct tdb_used_record *u = mem;
tdb_len_t len = sizeof(tdb_off_t) << htable->hash_bits;
set_header(tdb, u, 0, len, len + htable->extra, 0);
set_header(tdb, u, 0, len, len + htable->extra, 0,
last_zone->zone_bits);
memset(u + 1, 0, len);
}
static void set_freetable(void *mem, struct tdb_context *tdb,
struct tle_freetable *ftable)
static void set_zone(void *mem, struct tdb_context *tdb,
struct tle_zone *zone)
{
struct tdb_used_record *u = mem;
tdb_len_t len = sizeof(tdb_off_t) * ftable->num_zones
* (ftable->num_buckets + 1);
set_header(tdb, u, 0, len, len + ftable->extra, 0);
memset(u + 1, 0, len);
struct free_zone_header *fz = mem;
memset(fz, 0, zone_header_len(zone));
fz->zone_bits = zone->zone_bits;
}
static void add_to_freetable(struct tdb_context *tdb,
struct tle_zone *last_zone,
tdb_off_t eoff,
tdb_off_t elen)
{
add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
add_free_record(tdb, last_zone->zone_bits, eoff,
sizeof(struct tdb_used_record) + elen);
}
static void add_to_hashtable(struct tdb_context *tdb,
......@@ -170,48 +168,62 @@ static void add_to_hashtable(struct tdb_context *tdb,
struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
{
unsigned int i;
tdb_off_t len;
tdb_off_t off, len;
tdb_len_t zone_left;
struct tdb_header *hdr;
char *mem;
struct tdb_context *tdb;
struct tle_zone *last_zone = NULL;
assert(layout->ftable != -1U);
assert(layout->htable != -1U);
assert(layout->elem[0].base.type == ZONE);
len = sizeof(struct tdb_header);
zone_left = 0;
off = sizeof(struct tdb_header);
/* First pass of layout: calc lengths */
for (i = 0; i < layout->num_elems; i++) {
union tdb_layout_elem *e = &layout->elem[i];
e->base.off = len;
e->base.off = off;
switch (e->base.type) {
case ZONE:
assert(zone_left == 0);
len = zone_header_len(&e->zone);
zone_left = 1ULL << e->zone.zone_bits;
break;
case FREE:
len += free_record_len(e->free.len);
if (e->free.len == 0)
e->free.len = zone_left
- sizeof(struct tdb_used_record);
len = free_record_len(e->free.len);
break;
case DATA:
len += data_record_len(&e->used);
len = data_record_len(&e->used);
break;
case HASHTABLE:
len += hashtable_len(&e->hashtable);
break;
case FREETABLE:
len += freetable_len(&e->freetable);
len = hashtable_len(&e->hashtable);
break;
}
off += len;
assert(zone_left >= len);
zone_left -= len;
}
mem = malloc(len);
/* Fill final zone with free record. */
if (zone_left != 0) {
tdb_layout_add_free(layout,
zone_left
- sizeof(struct tdb_used_record));
layout->elem[layout->num_elems-1].base.off = off;
off += zone_left;
}
mem = malloc(off+1);
/* Now populate our header, cribbing from a real TDB header. */
tdb = tdb_open(NULL, TDB_INTERNAL, O_RDWR, 0, &tap_log_attr);
hdr = (void *)mem;
*hdr = tdb->header;
hdr->v.generation++;
hdr->v.num_zones = layout->elem[layout->ftable].freetable.num_zones;
hdr->v.zone_bits = layout->elem[layout->ftable].freetable.zone_bits;
hdr->v.free_buckets
= layout->elem[layout->ftable].freetable.num_buckets;
hdr->v.free_off = layout->elem[layout->ftable].base.off
+ sizeof(struct tdb_used_record);
hdr->v.hash_bits = layout->elem[layout->htable].hashtable.hash_bits;
hdr->v.hash_off = layout->elem[layout->htable].base.off
+ sizeof(struct tdb_used_record);
......@@ -219,23 +231,26 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
/* Mug the tdb we have to make it use this. */
free(tdb->map_ptr);
tdb->map_ptr = mem;
tdb->map_size = len;
tdb->map_size = off+1;
header_changed(tdb);
for (i = 0; i < layout->num_elems; i++) {
union tdb_layout_elem *e = &layout->elem[i];
switch (e->base.type) {
case ZONE:
set_zone(mem + e->base.off, tdb, &e->zone);
last_zone = &e->zone;
break;
case FREE:
set_free_record(mem + e->base.off, e->free.len);
break;
case DATA:
set_data_record(mem + e->base.off, tdb, &e->used);
set_data_record(mem + e->base.off, tdb, last_zone,
&e->used);
break;
case HASHTABLE:
set_hashtable(mem + e->base.off, tdb, &e->hashtable);
break;
case FREETABLE:
set_freetable(mem + e->base.off, tdb, &e->freetable);
set_hashtable(mem + e->base.off, tdb, last_zone,
&e->hashtable);
break;
}
}
......@@ -244,8 +259,12 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
for (i = 0; i < layout->num_elems; i++) {
union tdb_layout_elem *e = &layout->elem[i];
switch (e->base.type) {
case ZONE:
last_zone = &e->zone;
break;
case FREE:
add_to_freetable(tdb, e->base.off, e->free.len);
add_to_freetable(tdb, last_zone,
e->base.off, e->free.len);
break;
case DATA:
add_to_hashtable(tdb, e->base.off, e->used.key);
......@@ -255,5 +274,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
}
}
/* Write tailer. */
((uint8_t *)tdb->map_ptr)[tdb->map_size-1] = last_zone->zone_bits;
return tdb;
}
......@@ -3,6 +3,9 @@
#include <ccan/tdb2/private.h>
struct tdb_layout *new_tdb_layout(void);
void tdb_layout_add_zone(struct tdb_layout *layout,
unsigned int zone_bits,
bool fill_prev);
void tdb_layout_add_free(struct tdb_layout *layout, tdb_len_t len);
void tdb_layout_add_used(struct tdb_layout *layout,
TDB_DATA key, TDB_DATA data,
......@@ -10,15 +13,10 @@ void tdb_layout_add_used(struct tdb_layout *layout,
void tdb_layout_add_hashtable(struct tdb_layout *layout,
unsigned int hash_bits,
tdb_len_t extra);
void tdb_layout_add_freetable(struct tdb_layout *layout,
unsigned int num_zones,
unsigned int zone_bits,
unsigned int num_buckets,
tdb_len_t extra);
struct tdb_context *tdb_layout_get(struct tdb_layout *layout);
enum layout_type {
FREE, DATA, HASHTABLE, FREETABLE,
ZONE, FREE, DATA, HASHTABLE,
};
/* Shared by all union members. */
......@@ -27,6 +25,11 @@ struct tle_base {
tdb_off_t off;
};
/* Layout element that starts a new zone. */
struct tle_zone {
/* Common prefix shared by every member of union tdb_layout_elem. */
struct tle_base base;
/* Zone size as a bit count; tdb_layout_get() uses 1ULL << zone_bits bytes. */
unsigned int zone_bits;
};
struct tle_free {
struct tle_base base;
tdb_len_t len;
......@@ -45,25 +48,17 @@ struct tle_hashtable {
tdb_len_t extra;
};
struct tle_freetable {
struct tle_base base;
unsigned int num_zones;
unsigned int zone_bits;
unsigned int num_buckets;
tdb_len_t extra;
};
union tdb_layout_elem {
struct tle_base base;
struct tle_zone zone;
struct tle_free free;
struct tle_used used;
struct tle_freetable freetable;
struct tle_hashtable hashtable;
};
struct tdb_layout {
unsigned int num_elems;
union tdb_layout_elem *elem;
unsigned int ftable, htable;
unsigned int htable;
};
#endif /* TDB2_TEST_LAYOUT_H */
......@@ -11,30 +11,35 @@ int main(int argc, char *argv[])
struct tdb_used_record rec;
struct tdb_context tdb = { .log = tap_log_fn, .log_priv = NULL };
plan_tests(64 + 32 + 48*6 + 1);
plan_tests(64 + 32 + 48*7 + 1);
/* We should be able to encode any data value. */
for (i = 0; i < 64; i++)
ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0) == 0);
ok1(set_header(&tdb, &rec, 0, 1ULL << i, 1ULL << i, 0, 0)
== 0);
/* And any key and data with < 64 bits between them. */
for (i = 0; i < 32; i++) {
tdb_len_t dlen = 1ULL >> (63 - i), klen = 1ULL << i;
ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0) == 0);
ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen, 0, 0)
== 0);
}
/* We should neatly encode all values. */
for (i = 0; i < 48; i++) {
uint64_t h = 1ULL << (i < 11 ? 63 - i : 63 - 10);
uint64_t h = 1ULL << (i < 5 ? 63 - i : 63 - 4);
uint64_t klen = 1ULL << (i < 16 ? i : 15);
uint64_t dlen = 1ULL << i;
uint64_t xlen = 1ULL << (i < 32 ? i : 31);
ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h)
uint64_t zbits = 1ULL << (i < 6 ? i : 5);
ok1(set_header(&tdb, &rec, klen, dlen, klen + dlen + xlen, h,
zbits)
== 0);
ok1(rec_key_length(&rec) == klen);
ok1(rec_data_length(&rec) == dlen);
ok1(rec_extra_padding(&rec) == xlen);
ok1(rec_hash(&rec) << (64 - 11) == h);
ok1((uint64_t)rec_hash(&rec) << (64 - 5) == h);
ok1(rec_zone_bits(&rec) == zbits);
ok1(rec_magic(&rec) == TDB_MAGIC);
}
ok1(tap_log_messages == 0);
......
......@@ -16,7 +16,7 @@ int main(int argc, char *argv[])
plan_tests(sizeof(flags) / sizeof(flags[0]) * 2 + 1);
for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
tdb = tdb_open("run-new_database", flags[i],
tdb = tdb_open("run-new_database.tdb", flags[i],
O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
ok1(tdb);
if (tdb) {
......
#include <ccan/tdb2/tdb.c>
#include <ccan/tdb2/free.c>
#include <ccan/tdb2/lock.c>
#include <ccan/tdb2/io.c>
#include <ccan/tdb2/check.c>
#include <ccan/tap/tap.h>
#include "logging.h"
/* Exercise tdb_expand() on a freshly created database, once for every flag
 * combination in flags[].  After each expansion the test checks the new
 * tdb->map_size and, for all but the last step, runs tdb_check(). */
int main(int argc, char *argv[])
{
unsigned int i;
uint64_t val;
struct tdb_context *tdb;
int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
TDB_NOMMAP|TDB_CONVERT };

/* 18 ok1() checks per flag combination, plus the final log check. */
plan_tests(sizeof(flags) / sizeof(flags[0]) * 18 + 1);

for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
tdb = tdb_open("run-expand.tdb", flags[i],
O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
ok1(tdb);
if (!tdb)
continue;

/* First expand. Should add a zone, doubling file size. */
/* val is the initial map size without the header and one extra byte
 * (presumably the one-byte tailer -- TODO confirm); every expected size
 * below is N * val + 1 + sizeof(struct tdb_header). */
val = tdb->map_size - 1 - sizeof(struct tdb_header);
ok1(tdb_expand(tdb, 1, 1, false) == 0);
ok1(tdb->map_size == 2 * val + 1 + sizeof(struct tdb_header));
ok1(tdb_check(tdb, NULL, NULL) == 0);

/* Second expand, add another zone of same size. */
ok1(tdb_expand(tdb, 1, 1, false) == 0);
ok1(tdb->map_size == 3 * val + 1 + sizeof(struct tdb_header));
ok1(tdb_check(tdb, NULL, NULL) == 0);

/* Large expand, but can only add 4th zone of same size. */
ok1(tdb_expand(tdb, 0, 4*val, false) == 0);
ok1(tdb->map_size == 4 * val + 1 + sizeof(struct tdb_header));
ok1(tdb_check(tdb, NULL, NULL) == 0);

/* Large expand now will double file. */
ok1(tdb_expand(tdb, 0, 4*val, false) == 0);
ok1(tdb->map_size == 8 * val + 1 + sizeof(struct tdb_header));
ok1(tdb_check(tdb, NULL, NULL) == 0);

/* And again? */
ok1(tdb_expand(tdb, 0, 4*val, false) == 0);
ok1(tdb->map_size == 16 * val + 1 + sizeof(struct tdb_header));
ok1(tdb_check(tdb, NULL, NULL) == 0);

/* Below comfort level, will add a single 8*val zone. */
/* Note: unlike the earlier steps, no tdb_check() follows this one. */
ok1(tdb_expand(tdb, 0, ((8*val) >> TDB_COMFORT_FACTOR_BITS)
- sizeof(struct tdb_used_record), false) == 0);
ok1(tdb->map_size == 24 * val + 1 + sizeof(struct tdb_header));
tdb_close(tdb);
}

/* The whole run must not have produced any log messages. */
ok1(tap_log_messages == 0);
return exit_status();
}
#include <ccan/tdb2/tdb.c>
#include <ccan/tdb2/free.c>
#include <ccan/tdb2/lock.c>
#include <ccan/tdb2/io.c>
#include <ccan/tdb2/check.c>
#include <ccan/tap/tap.h>
#include "logging.h"
/* Release lock to check db.
 *
 * Drops the all-record write lock, runs tdb_check(), then takes the lock
 * again.  Each call contributes two ok1() results to the test plan. */
static void check(struct tdb_context *tdb)
{
tdb_allrecord_unlock(tdb, F_WRLCK);
ok1(tdb_check(tdb, NULL, NULL) == 0);
ok1(tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false) == 0);
}
/* Two-part expansion test, run once per flag combination in flags[]:
 * first the lower level helpers expand_to_fill_zones() and update_zones()
 * are driven directly under an all-record write lock, then tdb_expand() is
 * called on a fresh database. */
int main(int argc, char *argv[])
{
unsigned int i;
tdb_off_t off;
uint64_t val, buckets;
struct tdb_context *tdb;
int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
TDB_NOMMAP|TDB_CONVERT };

/* 30 checks in the first loop + 10 in the second per flag combination,
 * plus the final log check. */
plan_tests(sizeof(flags) / sizeof(flags[0]) * 40 + 1);

/* First, lower level expansion tests. */
for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
tdb = tdb_open("run-expand.tdb", flags[i],
O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
ok1(tdb);
if (!tdb)
continue;

/* Hold the all-record write lock; check() drops and retakes it. */
ok1(tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false)
== 0);

/* Expanding file is pretty easy. */
off = expand_to_fill_zones(tdb);
ok1(off > 0 && off != TDB_OFF_ERR);
check(tdb);

/* Second expand should do nothing. */
ok1(expand_to_fill_zones(tdb) == 0);
check(tdb);

/* Now, try adding a zone. */
val = tdb->header.v.num_zones + 1;
ok1(update_zones(tdb, val,
tdb->header.v.zone_bits,
tdb->header.v.free_buckets,
1ULL << tdb->header.v.zone_bits) == 0);
ok1(tdb->header.v.num_zones == val);
check(tdb);

/* Now, try doubling zone size. */
val = tdb->header.v.zone_bits + 1;
ok1(update_zones(tdb, tdb->header.v.num_zones,
val,
tdb->header.v.free_buckets,
1ULL << val) == 0);
ok1(tdb->header.v.zone_bits == val);
check(tdb);

/* Now, try adding a zone, and a bucket. */
val = tdb->header.v.num_zones + 1;
buckets = tdb->header.v.free_buckets + 1;
ok1(update_zones(tdb, val,
tdb->header.v.zone_bits,
buckets,
1ULL << tdb->header.v.zone_bits) == 0);
ok1(tdb->header.v.num_zones == val);
ok1(tdb->header.v.free_buckets == buckets);
check(tdb);

/* Now, try doubling zone size, and adding a bucket. */
val = tdb->header.v.zone_bits + 1;
buckets = tdb->header.v.free_buckets + 1;
ok1(update_zones(tdb, tdb->header.v.num_zones,
val,
buckets,
1ULL << val) == 0);
ok1(tdb->header.v.zone_bits == val);
ok1(tdb->header.v.free_buckets == buckets);
check(tdb);

/* Now, try massive zone increase. */
val = tdb->header.v.zone_bits + 4;
ok1(update_zones(tdb, tdb->header.v.num_zones,
val,
tdb->header.v.free_buckets,
1ULL << val) == 0);
ok1(tdb->header.v.zone_bits == val);
check(tdb);

tdb_allrecord_unlock(tdb, F_WRLCK);
tdb_close(tdb);
}

/* Now using tdb_expand. */
for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
tdb = tdb_open("run-expand.tdb", flags[i],
O_RDWR|O_CREAT|O_TRUNC, 0600, &tap_log_attr);
ok1(tdb);
if (!tdb)
continue;

/* First expand (expand file to fill zone). */
ok1(tdb_expand(tdb, 1, 1, false) == 0);
ok1(tdb->header.v.num_zones == 1);
ok1(tdb_check(tdb, NULL, NULL) == 0);

/* Little expand (extra zone). */
ok1(tdb_expand(tdb, 1, 1, false) == 0);
ok1(tdb->header.v.num_zones == 2);
ok1(tdb_check(tdb, NULL, NULL) == 0);

/* Big expand (enlarge zones) */
ok1(tdb_expand(tdb, 1, 4096, false) == 0);
ok1(tdb->header.v.num_zones == 2);
ok1(tdb_check(tdb, NULL, NULL) == 0);
tdb_close(tdb);
}

/* The whole run must not have produced any log messages. */
ok1(tap_log_messages == 0);
return exit_status();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment