Commit 5e30abc6 authored by Rusty Russell

tdb2: shrink free header from 32 to 24 bytes.

This reduces our minimum key+data length to 8 bytes; we do this by packing
the prev pointer where we used to put the flist pointer, and storing the
flist as an 8-bit index (meaning we can only have 256 free tables).

Note that this has a perverse result on the size of the database, as our
4-byte key and 4-byte data now fit perfectly in a minimal record, so
appending causes us to allocate new records which are 50% larger,
since we detect growing.

Current results of speed test:
$ ./speed 1000000
Adding 1000000 records:  23210 ns (59193360 bytes)
Finding 1000000 records:  2387 ns (59193360 bytes)
Traversing 1000000 records:  2150 ns (59193360 bytes)
Deleting 1000000 records:  13392 ns (59193360 bytes)
Re-adding 1000000 records:  11546 ns (59193360 bytes)
Appending 1000000 records:  29327 ns (91193360 bytes)
Churning 1000000 records:  33026 ns (91193360 bytes)

Previous:
$ ./speed 1000000
Adding 1000000 records:  28324 ns (67232528 bytes)
Finding 1000000 records:  2468 ns (67232528 bytes)
Traversing 1000000 records:  2200 ns (67232528 bytes)
Deleting 1000000 records:  13083 ns (67232528 bytes)
Re-adding 1000000 records:  16433 ns (67232528 bytes)
Appending 1000000 records:  2511 ns (67232528 bytes)
Churning 1000000 records:  31068 ns (67570448 bytes)
parent dfae76fd
...@@ -315,37 +315,37 @@ static bool check_hash(struct tdb_context *tdb, ...@@ -315,37 +315,37 @@ static bool check_hash(struct tdb_context *tdb,
static bool check_free(struct tdb_context *tdb, static bool check_free(struct tdb_context *tdb,
tdb_off_t off, tdb_off_t off,
const struct tdb_free_record *frec, const struct tdb_free_record *frec,
tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket) tdb_off_t prev, unsigned int flist, unsigned int bucket)
{ {
if (frec_magic(frec) != TDB_FREE_MAGIC) { if (frec_magic(frec) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad magic 0x%llx\n", "tdb_check: offset %llu bad magic 0x%llx\n",
(long long)off, (long long)frec->magic_and_meta); (long long)off, (long long)frec->magic_and_prev);
return false; return false;
} }
if (frec_flist(frec) != flist_off) { if (frec_flist(frec) != flist) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad freelist 0x%llx\n", "tdb_check: offset %llu bad freelist %u\n",
(long long)off, (long long)frec_flist(frec)); (long long)off, frec_flist(frec));
return false; return false;
} }
if (tdb->methods->oob(tdb, off if (tdb->methods->oob(tdb, off
+ frec->data_len+sizeof(struct tdb_used_record), + frec_len(frec) + sizeof(struct tdb_used_record),
false)) false))
return false; return false;
if (size_to_bucket(frec->data_len) != bucket) { if (size_to_bucket(frec_len(frec)) != bucket) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu in wrong bucket %u vs %u\n", "tdb_check: offset %llu in wrong bucket %u vs %u\n",
(long long)off, (long long)off,
bucket, size_to_bucket(frec->data_len)); bucket, size_to_bucket(frec_len(frec)));
return false; return false;
} }
if (prev != frec->prev) { if (prev != frec_prev(frec)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad prev %llu vs %llu\n", "tdb_check: offset %llu bad prev %llu vs %llu\n",
(long long)off, (long long)off,
(long long)prev, (long long)frec->prev); (long long)prev, (long long)frec_len(frec));
return false; return false;
} }
return true; return true;
...@@ -353,6 +353,7 @@ static bool check_free(struct tdb_context *tdb, ...@@ -353,6 +353,7 @@ static bool check_free(struct tdb_context *tdb,
static bool check_free_list(struct tdb_context *tdb, static bool check_free_list(struct tdb_context *tdb,
tdb_off_t flist_off, tdb_off_t flist_off,
unsigned flist_num,
tdb_off_t free[], tdb_off_t free[],
size_t num_free, size_t num_free,
size_t *num_found) size_t *num_found)
...@@ -384,7 +385,7 @@ static bool check_free_list(struct tdb_context *tdb, ...@@ -384,7 +385,7 @@ static bool check_free_list(struct tdb_context *tdb,
return false; return false;
if (tdb_read_convert(tdb, off, &f, sizeof(f))) if (tdb_read_convert(tdb, off, &f, sizeof(f)))
return false; return false;
if (!check_free(tdb, off, &f, prev, flist_off, i)) if (!check_free(tdb, off, &f, prev, flist_num, i))
return false; return false;
/* FIXME: Check hash bits */ /* FIXME: Check hash bits */
...@@ -436,13 +437,17 @@ static bool check_linear(struct tdb_context *tdb, ...@@ -436,13 +437,17 @@ static bool check_linear(struct tdb_context *tdb,
struct tdb_free_record f; struct tdb_free_record f;
struct tdb_recovery_record r; struct tdb_recovery_record r;
} pad, *p; } pad, *p;
p = tdb_get(tdb, off, &pad, sizeof(pad)); /* r is larger: only get that if we need to. */
p = tdb_get(tdb, off, &pad, sizeof(pad.f));
if (!p) if (!p)
return false; return false;
/* If we crash after ftruncate, we can get zeroes or fill. */ /* If we crash after ftruncate, we can get zeroes or fill. */
if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
|| p->r.magic == 0x4343434343434343ULL) { || p->r.magic == 0x4343434343434343ULL) {
p = tdb_get(tdb, off, &pad, sizeof(pad.r));
if (!p)
return false;
if (recovery == off) { if (recovery == off) {
found_recovery = true; found_recovery = true;
len = sizeof(p->r) + p->r.max_len; len = sizeof(p->r) + p->r.max_len;
...@@ -462,6 +467,9 @@ static bool check_linear(struct tdb_context *tdb, ...@@ -462,6 +467,9 @@ static bool check_linear(struct tdb_context *tdb,
(size_t)tdb->map_size); (size_t)tdb->map_size);
} }
} else if (p->r.magic == TDB_RECOVERY_MAGIC) { } else if (p->r.magic == TDB_RECOVERY_MAGIC) {
p = tdb_get(tdb, off, &pad, sizeof(pad.r));
if (!p)
return false;
if (recovery != off) { if (recovery != off) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: unexpected recovery" "tdb_check: unexpected recovery"
...@@ -469,11 +477,23 @@ static bool check_linear(struct tdb_context *tdb, ...@@ -469,11 +477,23 @@ static bool check_linear(struct tdb_context *tdb,
(size_t)off); (size_t)off);
return false; return false;
} }
if (p->r.len > p->r.max_len) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: invalid recovery length"
" %zu\n", (size_t)p->r.len);
return false;
}
if (p->r.eof > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: invalid old EOF"
" %zu\n", (size_t)p->r.eof);
return false;
}
found_recovery = true; found_recovery = true;
len = sizeof(p->r) + p->r.max_len; len = sizeof(p->r) + p->r.max_len;
} else if (frec_magic(&p->f) == TDB_FREE_MAGIC } else if (frec_magic(&p->f) == TDB_FREE_MAGIC
|| frec_magic(&p->f) == TDB_COALESCING_MAGIC) { || frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
len = sizeof(p->u) + p->f.data_len; len = sizeof(p->u) + frec_len(&p->f);
if (off + len > tdb->map_size) { if (off + len > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv, tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: free overlength %llu" "tdb_check: free overlength %llu"
...@@ -560,7 +580,8 @@ int tdb_check(struct tdb_context *tdb, ...@@ -560,7 +580,8 @@ int tdb_check(struct tdb_context *tdb,
for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) { for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
if (flist == TDB_OFF_ERR) if (flist == TDB_OFF_ERR)
goto fail; goto fail;
if (!check_free_list(tdb, flist, free, num_free, &num_found)) if (!check_free_list(tdb, flist, num_flists, free, num_free,
&num_found))
goto fail; goto fail;
num_flists++; num_flists++;
} }
......
This diff is collapsed.
...@@ -173,20 +173,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r) ...@@ -173,20 +173,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
} }
struct tdb_free_record { struct tdb_free_record {
uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */ uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
uint64_t data_len; /* Not counting these two fields. */ uint64_t flist_and_len; /* Len not counting these two fields. */
/* This is why the minimum record size is 16 bytes. */ /* This is why the minimum record size is 8 bytes. */
uint64_t next, prev; uint64_t next;
}; };
/* Extract the prev offset packed into the low bits of magic_and_prev;
 * the top TDB_OFF_UPPER_STEAL bits of that field hold the magic. */
static inline uint64_t frec_prev(const struct tdb_free_record *f)
{
	const uint64_t off_mask = (1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1;

	return f->magic_and_prev & off_mask;
}
static inline uint64_t frec_magic(const struct tdb_free_record *f) static inline uint64_t frec_magic(const struct tdb_free_record *f)
{ {
return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL); return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
}
static inline uint64_t frec_len(const struct tdb_free_record *f)
{
return f->flist_and_len & ((1ULL << (64 - TDB_OFF_UPPER_STEAL))-1);
} }
static inline uint64_t frec_flist(const struct tdb_free_record *f) static inline unsigned frec_flist(const struct tdb_free_record *f)
{ {
return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1); return f->flist_and_len >> (64 - TDB_OFF_UPPER_STEAL);
} }
struct tdb_recovery_record { struct tdb_recovery_record {
...@@ -311,6 +321,7 @@ struct tdb_context { ...@@ -311,6 +321,7 @@ struct tdb_context {
/* What freelist are we using? */ /* What freelist are we using? */
uint64_t flist_off; uint64_t flist_off;
unsigned int flist;
/* IO methods: changes for transactions. */ /* IO methods: changes for transactions. */
const struct tdb_methods *methods; const struct tdb_methods *methods;
......
...@@ -63,7 +63,7 @@ static bool summarize(struct tdb_context *tdb, ...@@ -63,7 +63,7 @@ static bool summarize(struct tdb_context *tdb,
|| p->r.magic == TDB_RECOVERY_MAGIC) { || p->r.magic == TDB_RECOVERY_MAGIC) {
len = sizeof(p->r) + p->r.max_len; len = sizeof(p->r) + p->r.max_len;
} else if (rec_magic(&p->u) != TDB_MAGIC) { } else if (rec_magic(&p->u) != TDB_MAGIC) {
len = p->f.data_len; len = frec_len(&p->f);
tally_add(free, len); tally_add(free, len);
tally_add(buckets, size_to_bucket(len)); tally_add(buckets, size_to_bucket(len));
len += sizeof(p->u); len += sizeof(p->u);
......
...@@ -136,9 +136,11 @@ static void set_freelist(void *mem, struct tdb_context *tdb, ...@@ -136,9 +136,11 @@ static void set_freelist(void *mem, struct tdb_context *tdb,
static void add_to_freetable(struct tdb_context *tdb, static void add_to_freetable(struct tdb_context *tdb,
tdb_off_t eoff, tdb_off_t eoff,
tdb_off_t elen, tdb_off_t elen,
unsigned flist,
struct tle_freelist *freelist) struct tle_freelist *freelist)
{ {
tdb->flist_off = freelist->base.off; tdb->flist_off = freelist->base.off;
tdb->flist = flist;
add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen); add_free_record(tdb, eoff, sizeof(struct tdb_used_record) + elen);
} }
...@@ -288,6 +290,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout) ...@@ -288,6 +290,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
switch (e->base.type) { switch (e->base.type) {
case FREE: case FREE:
add_to_freetable(tdb, e->base.off, e->free.len, add_to_freetable(tdb, e->base.off, e->free.len,
e->free.flist_num,
find_flist(layout, e->free.flist_num)); find_flist(layout, e->free.flist_num));
break; break;
case DATA: case DATA:
......
...@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off) ...@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
return TDB_OFF_ERR; return TDB_OFF_ERR;
if (frec_magic(&f) != TDB_FREE_MAGIC) if (frec_magic(&f) != TDB_FREE_MAGIC)
return TDB_OFF_ERR; return TDB_OFF_ERR;
return f.data_len; return frec_len(&f);
} }
int main(int argc, char *argv[]) int main(int argc, char *argv[])
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment