Commit 5e30abc6 authored by Rusty Russell's avatar Rusty Russell

tdb2: shrink free header from 32 to 24 bytes.

This reduces our minimum key+data length to 8 bytes; we do this by packing
the prev pointer where we used to put the flist pointer, and storing the
flist as an 8 bit index (meaning we can only have 256 free tables).

Note that this has a perverse result on the size of the database, as our
4-byte key and 4-byte data now fit perfectly in a minimal record, so
appending causes us to allocate new records which are 50% larger,
since we detect growing.

Current results of speed test:
$ ./speed 1000000
Adding 1000000 records:  23210 ns (59193360 bytes)
Finding 1000000 records:  2387 ns (59193360 bytes)
Traversing 1000000 records:  2150 ns (59193360 bytes)
Deleting 1000000 records:  13392 ns (59193360 bytes)
Re-adding 1000000 records:  11546 ns (59193360 bytes)
Appending 1000000 records:  29327 ns (91193360 bytes)
Churning 1000000 records:  33026 ns (91193360 bytes)

Previous:
$ ./speed 1000000
Adding 1000000 records:  28324 ns (67232528 bytes)
Finding 1000000 records:  2468 ns (67232528 bytes)
Traversing 1000000 records:  2200 ns (67232528 bytes)
Deleting 1000000 records:  13083 ns (67232528 bytes)
Re-adding 1000000 records:  16433 ns (67232528 bytes)
Appending 1000000 records:  2511 ns (67232528 bytes)
Churning 1000000 records:  31068 ns (67570448 bytes)
parent dfae76fd
......@@ -315,37 +315,37 @@ static bool check_hash(struct tdb_context *tdb,
static bool check_free(struct tdb_context *tdb,
tdb_off_t off,
const struct tdb_free_record *frec,
tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket)
tdb_off_t prev, unsigned int flist, unsigned int bucket)
{
if (frec_magic(frec) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad magic 0x%llx\n",
(long long)off, (long long)frec->magic_and_meta);
(long long)off, (long long)frec->magic_and_prev);
return false;
}
if (frec_flist(frec) != flist_off) {
if (frec_flist(frec) != flist) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad freelist 0x%llx\n",
(long long)off, (long long)frec_flist(frec));
"tdb_check: offset %llu bad freelist %u\n",
(long long)off, frec_flist(frec));
return false;
}
if (tdb->methods->oob(tdb, off
+ frec->data_len+sizeof(struct tdb_used_record),
+ frec_len(frec) + sizeof(struct tdb_used_record),
false))
return false;
if (size_to_bucket(frec->data_len) != bucket) {
if (size_to_bucket(frec_len(frec)) != bucket) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu in wrong bucket %u vs %u\n",
(long long)off,
bucket, size_to_bucket(frec->data_len));
bucket, size_to_bucket(frec_len(frec)));
return false;
}
if (prev != frec->prev) {
if (prev != frec_prev(frec)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad prev %llu vs %llu\n",
(long long)off,
(long long)prev, (long long)frec->prev);
(long long)prev, (long long)frec_len(frec));
return false;
}
return true;
......@@ -353,6 +353,7 @@ static bool check_free(struct tdb_context *tdb,
static bool check_free_list(struct tdb_context *tdb,
tdb_off_t flist_off,
unsigned flist_num,
tdb_off_t free[],
size_t num_free,
size_t *num_found)
......@@ -384,7 +385,7 @@ static bool check_free_list(struct tdb_context *tdb,
return false;
if (tdb_read_convert(tdb, off, &f, sizeof(f)))
return false;
if (!check_free(tdb, off, &f, prev, flist_off, i))
if (!check_free(tdb, off, &f, prev, flist_num, i))
return false;
/* FIXME: Check hash bits */
......@@ -436,13 +437,17 @@ static bool check_linear(struct tdb_context *tdb,
struct tdb_free_record f;
struct tdb_recovery_record r;
} pad, *p;
p = tdb_get(tdb, off, &pad, sizeof(pad));
/* r is larger: only get that if we need to. */
p = tdb_get(tdb, off, &pad, sizeof(pad.f));
if (!p)
return false;
/* If we crash after ftruncate, we can get zeroes or fill. */
if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
|| p->r.magic == 0x4343434343434343ULL) {
p = tdb_get(tdb, off, &pad, sizeof(pad.r));
if (!p)
return false;
if (recovery == off) {
found_recovery = true;
len = sizeof(p->r) + p->r.max_len;
......@@ -462,6 +467,9 @@ static bool check_linear(struct tdb_context *tdb,
(size_t)tdb->map_size);
}
} else if (p->r.magic == TDB_RECOVERY_MAGIC) {
p = tdb_get(tdb, off, &pad, sizeof(pad.r));
if (!p)
return false;
if (recovery != off) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: unexpected recovery"
......@@ -469,11 +477,23 @@ static bool check_linear(struct tdb_context *tdb,
(size_t)off);
return false;
}
if (p->r.len > p->r.max_len) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: invalid recovery length"
" %zu\n", (size_t)p->r.len);
return false;
}
if (p->r.eof > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: invalid old EOF"
" %zu\n", (size_t)p->r.eof);
return false;
}
found_recovery = true;
len = sizeof(p->r) + p->r.max_len;
} else if (frec_magic(&p->f) == TDB_FREE_MAGIC
|| frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
len = sizeof(p->u) + p->f.data_len;
len = sizeof(p->u) + frec_len(&p->f);
if (off + len > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: free overlength %llu"
......@@ -560,7 +580,8 @@ int tdb_check(struct tdb_context *tdb,
for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
if (flist == TDB_OFF_ERR)
goto fail;
if (!check_free_list(tdb, flist, free, num_free, &num_found))
if (!check_free_list(tdb, flist, num_flists, free, num_free,
&num_found))
goto fail;
num_flists++;
}
......
......@@ -62,10 +62,11 @@ tdb_off_t next_flist(struct tdb_context *tdb, tdb_off_t flist)
int tdb_flist_init(struct tdb_context *tdb)
{
/* Use reservoir sampling algorithm to select a free list at random. */
unsigned int rnd, max = 0;
unsigned int rnd, max = 0, count = 0;
tdb_off_t off;
tdb->flist_off = off = first_flist(tdb);
tdb->flist = 0;
while (off) {
if (off == TDB_OFF_ERR)
......@@ -74,10 +75,12 @@ int tdb_flist_init(struct tdb_context *tdb)
rnd = random();
if (rnd >= max) {
tdb->flist_off = off;
tdb->flist = count;
max = rnd;
}
off = next_flist(tdb, off);
count++;
}
return 0;
}
......@@ -107,10 +110,10 @@ static int remove_from_list(struct tdb_context *tdb,
tdb_off_t off;
/* Front of list? */
if (r->prev == 0) {
if (frec_prev(r) == 0) {
off = b_off;
} else {
off = r->prev + offsetof(struct tdb_free_record, next);
off = frec_prev(r) + offsetof(struct tdb_free_record, next);
}
#ifdef DEBUG
......@@ -128,11 +131,11 @@ static int remove_from_list(struct tdb_context *tdb,
}
if (r->next != 0) {
off = r->next + offsetof(struct tdb_free_record, prev);
off = r->next + offsetof(struct tdb_free_record,magic_and_prev);
/* r->next->prev = r->prev */
#ifdef DEBUG
if (tdb_read_off(tdb, off) != r_off) {
if (tdb_read_off(tdb, off) & TDB_OFF_MASK != r_off) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"remove_from_list: %llu bad list %llu\n",
(long long)r_off, (long long)b_off);
......@@ -140,7 +143,7 @@ static int remove_from_list(struct tdb_context *tdb,
}
#endif
if (tdb_write_off(tdb, off, r->prev)) {
if (tdb_write_off(tdb, off, r->magic_and_prev)) {
return -1;
}
}
......@@ -151,58 +154,65 @@ static int remove_from_list(struct tdb_context *tdb,
static int enqueue_in_free(struct tdb_context *tdb,
tdb_off_t b_off,
tdb_off_t off,
struct tdb_free_record *new)
tdb_len_t len)
{
new->prev = 0;
struct tdb_free_record new;
uint64_t magic = (TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL));
/* We only need to set flist_and_len; rest is set in enqueue_in_free */
new.flist_and_len = ((uint64_t)tdb->flist << (64 - TDB_OFF_UPPER_STEAL))
| len;
/* prev = 0. */
new.magic_and_prev = magic;
/* new->next = head. */
new->next = tdb_read_off(tdb, b_off);
if (new->next == TDB_OFF_ERR)
new.next = tdb_read_off(tdb, b_off);
if (new.next == TDB_OFF_ERR)
return -1;
if (new->next) {
if (new.next) {
#ifdef DEBUG
if (tdb_read_off(tdb,
new->next
+ offsetof(struct tdb_free_record, prev))
!= 0) {
new.next + offsetof(struct tdb_free_record,
magic_and_prev))
!= magic) {
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"enqueue_in_free: %llu bad head prev %llu\n",
(long long)new->next, (long long)b_off);
(long long)new.next, (long long)b_off);
return -1;
}
#endif
/* next->prev = new. */
if (tdb_write_off(tdb, new->next
+ offsetof(struct tdb_free_record, prev),
off) != 0)
if (tdb_write_off(tdb, new.next
+ offsetof(struct tdb_free_record,
magic_and_prev),
off | magic) != 0)
return -1;
}
/* head = new */
if (tdb_write_off(tdb, b_off, off) != 0)
return -1;
return tdb_write_convert(tdb, off, new, sizeof(*new));
return tdb_write_convert(tdb, off, &new, sizeof(new));
}
/* List need not be locked. */
int add_free_record(struct tdb_context *tdb,
tdb_off_t off, tdb_len_t len_with_header)
{
struct tdb_free_record new;
tdb_off_t b_off;
tdb_len_t len;
int ret;
assert(len_with_header >= sizeof(new));
assert(len_with_header >= sizeof(struct tdb_free_record));
new.magic_and_meta = TDB_FREE_MAGIC << (64 - TDB_OFF_UPPER_STEAL)
| tdb->flist_off;
new.data_len = len_with_header - sizeof(struct tdb_used_record);
len = len_with_header - sizeof(struct tdb_used_record);
b_off = bucket_off(tdb->flist_off, size_to_bucket(new.data_len));
b_off = bucket_off(tdb->flist_off, size_to_bucket(len));
if (tdb_lock_free_bucket(tdb, b_off, TDB_LOCK_WAIT) != 0)
return -1;
ret = enqueue_in_free(tdb, b_off, off, &new);
ret = enqueue_in_free(tdb, b_off, off, len);
tdb_unlock_free_bucket(tdb, b_off);
return ret;
}
......@@ -234,6 +244,17 @@ static size_t record_leftover(size_t keylen, size_t datalen,
return leftover;
}
/* FIXME: Shortcut common case where tdb->flist == flist */
static tdb_off_t flist_offset(struct tdb_context *tdb, unsigned int flist)
{
tdb_off_t off = first_flist(tdb);
unsigned int i;
for (i = 0; i < flist; i++)
off = next_flist(tdb, off);
return off;
}
/* Note: we unlock the current bucket if we coalesce or fail. */
static int coalesce(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t b_off, tdb_len_t data_len)
......@@ -245,6 +266,7 @@ static int coalesce(struct tdb_context *tdb,
while (end < tdb->map_size) {
tdb_off_t nb_off;
unsigned flist, bucket;
/* FIXME: do tdb_get here and below really win? */
r = tdb_get(tdb, end, &pad, sizeof(pad));
......@@ -254,7 +276,9 @@ static int coalesce(struct tdb_context *tdb,
if (frec_magic(r) != TDB_FREE_MAGIC)
break;
nb_off = bucket_off(frec_flist(r), size_to_bucket(r->data_len));
flist = frec_flist(r);
bucket = size_to_bucket(frec_len(r));
nb_off = bucket_off(flist_offset(tdb, flist), bucket);
/* We may be violating lock order here, so best effort. */
if (tdb_lock_free_bucket(tdb, nb_off, TDB_LOCK_NOWAIT) == -1)
......@@ -272,9 +296,8 @@ static int coalesce(struct tdb_context *tdb,
break;
}
if (unlikely(bucket_off(frec_flist(r),
size_to_bucket(r->data_len))
!= nb_off)) {
if (unlikely(frec_flist(r) != flist)
|| unlikely(size_to_bucket(frec_len(r)) != bucket)) {
tdb_unlock_free_bucket(tdb, nb_off);
break;
}
......@@ -284,7 +307,7 @@ static int coalesce(struct tdb_context *tdb,
goto err;
}
end += sizeof(struct tdb_used_record) + r->data_len;
end += sizeof(struct tdb_used_record) + frec_len(r);
tdb_unlock_free_bucket(tdb, nb_off);
}
......@@ -297,11 +320,11 @@ static int coalesce(struct tdb_context *tdb,
if (!r)
goto err;
if (r->data_len != data_len) {
if (frec_len(r) != data_len) {
tdb->ecode = TDB_ERR_CORRUPT;
tdb->log(tdb, TDB_DEBUG_FATAL, tdb->log_priv,
"coalesce: expected data len %llu not %llu\n",
(long long)data_len, (long long)r->data_len);
(long long)data_len, (long long)frec_len(r));
goto err;
}
......@@ -314,8 +337,9 @@ static int coalesce(struct tdb_context *tdb,
/* We have to drop this to avoid deadlocks, so make sure record
* doesn't get coalesced by someone else! */
r->magic_and_meta = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL);
r->data_len = end - off - sizeof(struct tdb_used_record);
r->magic_and_prev = TDB_COALESCING_MAGIC << (64 - TDB_OFF_UPPER_STEAL);
/* FIXME: Use 255 as invalid free list? */
r->flist_and_len = end - off - sizeof(struct tdb_used_record);
if (tdb_access_commit(tdb, r) != 0)
goto err;
......@@ -353,7 +377,7 @@ again:
return TDB_OFF_ERR;
}
best.data_len = -1ULL;
best.flist_and_len = -1ULL;
best_off = 0;
/* Get slack if we're after extra. */
......@@ -377,22 +401,22 @@ again:
if (frec_magic(r) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"lock_and_alloc: %llu non-free 0x%llx\n",
(long long)off, (long long)r->magic_and_meta);
(long long)off, (long long)r->magic_and_prev);
goto unlock_err;
}
if (r->data_len >= size && r->data_len < best.data_len) {
if (frec_len(r) >= size && frec_len(r) < frec_len(&best)) {
best_off = off;
best = *r;
}
if (best.data_len < size * multiplier && best_off)
if (frec_len(&best) < size * multiplier && best_off)
break;
multiplier *= 1.01;
/* Since we're going slow anyway, try coalescing here. */
switch (coalesce(tdb, off, b_off, r->data_len)) {
switch (coalesce(tdb, off, b_off, frec_len(r))) {
case -1:
/* This has already unlocked on error. */
return -1;
......@@ -413,13 +437,13 @@ again:
goto unlock_err;
leftover = record_leftover(keylen, datalen, want_extra,
best.data_len);
frec_len(&best));
assert(keylen + datalen + leftover <= best.data_len);
assert(keylen + datalen + leftover <= frec_len(&best));
/* We need to mark non-free before we drop lock, otherwise
* coalesce() could try to merge it! */
if (set_used_header(tdb, &rec, keylen, datalen,
best.data_len - leftover,
frec_len(&best) - leftover,
hashlow) != 0)
goto unlock_err;
......@@ -431,7 +455,7 @@ again:
if (leftover) {
if (add_free_record(tdb,
best_off + sizeof(rec)
+ best.data_len - leftover,
+ frec_len(&best) - leftover,
leftover))
return TDB_OFF_ERR;
}
......@@ -451,8 +475,8 @@ static tdb_off_t get_free(struct tdb_context *tdb,
size_t keylen, size_t datalen, bool want_extra,
unsigned hashlow)
{
tdb_off_t off, flist;
unsigned start_b, b;
tdb_off_t off, flist_off;
unsigned start_b, b, flist;
bool wrapped = false;
/* If they are growing, add 50% to get to higher bucket. */
......@@ -462,31 +486,35 @@ static tdb_off_t get_free(struct tdb_context *tdb,
else
start_b = size_to_bucket(adjust_size(keylen, datalen));
flist = tdb->flist_off;
while (!wrapped || flist != tdb->flist_off) {
flist_off = tdb->flist_off;
flist = tdb->flist;
while (!wrapped || flist_off != tdb->flist_off) {
/* Start at exact size bucket, and search up... */
for (b = find_free_head(tdb, flist, start_b);
for (b = find_free_head(tdb, flist_off, start_b);
b < TDB_FREE_BUCKETS;
b = find_free_head(tdb, flist, b + 1)) {
b = find_free_head(tdb, flist_off, b + 1)) {
/* Try getting one from list. */
off = lock_and_alloc(tdb, flist,
off = lock_and_alloc(tdb, flist_off,
b, keylen, datalen, want_extra,
hashlow);
if (off == TDB_OFF_ERR)
return TDB_OFF_ERR;
if (off != 0) {
/* Worked? Stay using this list. */
tdb->flist_off = flist;
tdb->flist_off = flist_off;
tdb->flist = flist;
return off;
}
/* Didn't work. Try next bucket. */
}
/* Hmm, try next list. */
flist = next_flist(tdb, flist);
if (flist == 0) {
flist_off = next_flist(tdb, flist_off);
flist++;
if (flist_off == 0) {
wrapped = true;
flist = first_flist(tdb);
flist_off = first_flist(tdb);
flist = 0;
}
}
......
......@@ -173,20 +173,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
}
struct tdb_free_record {
uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */
uint64_t data_len; /* Not counting these two fields. */
/* This is why the minimum record size is 16 bytes. */
uint64_t next, prev;
uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
uint64_t flist_and_len; /* Len not counting these two fields. */
/* This is why the minimum record size is 8 bytes. */
uint64_t next;
};
static inline uint64_t frec_prev(const struct tdb_free_record *f)
{
return f->magic_and_prev & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
}
static inline uint64_t frec_magic(const struct tdb_free_record *f)
{
return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL);
return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
}
static inline uint64_t frec_len(const struct tdb_free_record *f)
{
return f->flist_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
}
static inline uint64_t frec_flist(const struct tdb_free_record *f)
static inline unsigned frec_flist(const struct tdb_free_record *f)
{
return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
return f->flist_and_len >> (64 - TDB_OFF_UPPER_STEAL);
}
struct tdb_recovery_record {
......@@ -311,6 +321,7 @@ struct tdb_context {
/* What freelist are we using? */
uint64_t flist_off;
unsigned int flist;
/* IO methods: changes for transactions. */
const struct tdb_methods *methods;
......
......@@ -63,7 +63,7 @@ static bool summarize(struct tdb_context *tdb,
|| p->r.magic == TDB_RECOVERY_MAGIC) {
len = sizeof(p->r) + p->r.max_len;
} else if (rec_magic(&p->u) != TDB_MAGIC) {
len = p->f.data_len;
len = frec_len(&p->f);
tally_add(free, len);
tally_add(buckets, size_to_bucket(len));
len += sizeof(p->u);
......
......@@ -136,9 +136,11 @@ static void set_freelist(void *mem, struct tdb_context *tdb,
static void add_to_freetable(struct tdb_context *tdb,
tdb_off_t eoff,
tdb_off_t elen,
unsigned flist,
struct tle_freelist *freelist)
{
tdb->flist_off = freelist->base.off;
tdb->flist = flist;
add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
}
......@@ -288,6 +290,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
switch (e->base.type) {
case FREE:
add_to_freetable(tdb, e->base.off, e->free.len,
e->free.flist_num,
find_flist(layout, e->free.flist_num));
break;
case DATA:
......
......@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
return TDB_OFF_ERR;
if (frec_magic(&f) != TDB_FREE_MAGIC)
return TDB_OFF_ERR;
return f.data_len;
return frec_len(&f);
}
int main(int argc, char *argv[])
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment