Commit 5e30abc6 authored by Rusty Russell

tdb2: shrink free header from 32 to 24 bytes.

This reduces our minimum key+data length to 8 bytes; we do this by packing
the prev pointer where we used to put the flist pointer, and storing the
flist as an 8 bit index (meaning we can only have 256 free tables).

Note that this has a perverse result on the size of the database, as our
4-byte key and 4-byte data now fit perfectly in a minimal record, so
appending causes us to allocate new records which are 50% larger,
since we detect growing.

Current results of speed test:
$ ./speed 1000000
Adding 1000000 records:  23210 ns (59193360 bytes)
Finding 1000000 records:  2387 ns (59193360 bytes)
Traversing 1000000 records:  2150 ns (59193360 bytes)
Deleting 1000000 records:  13392 ns (59193360 bytes)
Re-adding 1000000 records:  11546 ns (59193360 bytes)
Appending 1000000 records:  29327 ns (91193360 bytes)
Churning 1000000 records:  33026 ns (91193360 bytes)

Previous:
$ ./speed 1000000
Adding 1000000 records:  28324 ns (67232528 bytes)
Finding 1000000 records:  2468 ns (67232528 bytes)
Traversing 1000000 records:  2200 ns (67232528 bytes)
Deleting 1000000 records:  13083 ns (67232528 bytes)
Re-adding 1000000 records:  16433 ns (67232528 bytes)
Appending 1000000 records:  2511 ns (67232528 bytes)
Churning 1000000 records:  31068 ns (67570448 bytes)
parent dfae76fd
......@@ -315,37 +315,37 @@ static bool check_hash(struct tdb_context *tdb,
static bool check_free(struct tdb_context *tdb,
tdb_off_t off,
const struct tdb_free_record *frec,
tdb_off_t prev, tdb_off_t flist_off, unsigned int bucket)
tdb_off_t prev, unsigned int flist, unsigned int bucket)
{
if (frec_magic(frec) != TDB_FREE_MAGIC) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad magic 0x%llx\n",
(long long)off, (long long)frec->magic_and_meta);
(long long)off, (long long)frec->magic_and_prev);
return false;
}
if (frec_flist(frec) != flist_off) {
if (frec_flist(frec) != flist) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad freelist 0x%llx\n",
(long long)off, (long long)frec_flist(frec));
"tdb_check: offset %llu bad freelist %u\n",
(long long)off, frec_flist(frec));
return false;
}
if (tdb->methods->oob(tdb, off
+ frec->data_len+sizeof(struct tdb_used_record),
+ frec_len(frec) + sizeof(struct tdb_used_record),
false))
return false;
if (size_to_bucket(frec->data_len) != bucket) {
if (size_to_bucket(frec_len(frec)) != bucket) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu in wrong bucket %u vs %u\n",
(long long)off,
bucket, size_to_bucket(frec->data_len));
bucket, size_to_bucket(frec_len(frec)));
return false;
}
if (prev != frec->prev) {
if (prev != frec_prev(frec)) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: offset %llu bad prev %llu vs %llu\n",
(long long)off,
(long long)prev, (long long)frec->prev);
(long long)prev, (long long)frec_prev(frec));
return false;
}
return true;
......@@ -353,6 +353,7 @@ static bool check_free(struct tdb_context *tdb,
static bool check_free_list(struct tdb_context *tdb,
tdb_off_t flist_off,
unsigned flist_num,
tdb_off_t free[],
size_t num_free,
size_t *num_found)
......@@ -384,7 +385,7 @@ static bool check_free_list(struct tdb_context *tdb,
return false;
if (tdb_read_convert(tdb, off, &f, sizeof(f)))
return false;
if (!check_free(tdb, off, &f, prev, flist_off, i))
if (!check_free(tdb, off, &f, prev, flist_num, i))
return false;
/* FIXME: Check hash bits */
......@@ -436,13 +437,17 @@ static bool check_linear(struct tdb_context *tdb,
struct tdb_free_record f;
struct tdb_recovery_record r;
} pad, *p;
p = tdb_get(tdb, off, &pad, sizeof(pad));
/* r is larger: only get that if we need to. */
p = tdb_get(tdb, off, &pad, sizeof(pad.f));
if (!p)
return false;
/* If we crash after ftruncate, we can get zeroes or fill. */
if (p->r.magic == TDB_RECOVERY_INVALID_MAGIC
|| p->r.magic == 0x4343434343434343ULL) {
p = tdb_get(tdb, off, &pad, sizeof(pad.r));
if (!p)
return false;
if (recovery == off) {
found_recovery = true;
len = sizeof(p->r) + p->r.max_len;
......@@ -462,6 +467,9 @@ static bool check_linear(struct tdb_context *tdb,
(size_t)tdb->map_size);
}
} else if (p->r.magic == TDB_RECOVERY_MAGIC) {
p = tdb_get(tdb, off, &pad, sizeof(pad.r));
if (!p)
return false;
if (recovery != off) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: unexpected recovery"
......@@ -469,11 +477,23 @@ static bool check_linear(struct tdb_context *tdb,
(size_t)off);
return false;
}
if (p->r.len > p->r.max_len) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: invalid recovery length"
" %zu\n", (size_t)p->r.len);
return false;
}
if (p->r.eof > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: invalid old EOF"
" %zu\n", (size_t)p->r.eof);
return false;
}
found_recovery = true;
len = sizeof(p->r) + p->r.max_len;
} else if (frec_magic(&p->f) == TDB_FREE_MAGIC
|| frec_magic(&p->f) == TDB_COALESCING_MAGIC) {
len = sizeof(p->u) + p->f.data_len;
len = sizeof(p->u) + frec_len(&p->f);
if (off + len > tdb->map_size) {
tdb->log(tdb, TDB_DEBUG_ERROR, tdb->log_priv,
"tdb_check: free overlength %llu"
......@@ -560,7 +580,8 @@ int tdb_check(struct tdb_context *tdb,
for (flist = first_flist(tdb); flist; flist = next_flist(tdb, flist)) {
if (flist == TDB_OFF_ERR)
goto fail;
if (!check_free_list(tdb, flist, free, num_free, &num_found))
if (!check_free_list(tdb, flist, num_flists, free, num_free,
&num_found))
goto fail;
num_flists++;
}
......
This diff is collapsed.
......@@ -173,20 +173,30 @@ static inline uint16_t rec_magic(const struct tdb_used_record *r)
}
/*
 * Header of a free record.
 *
 * This listing is a diff view and shows both layouts: the old one
 * (magic_and_meta, data_len, next, prev: four 64-bit words, 32 bytes)
 * followed by the new one (magic_and_prev, flist_and_len, next: three
 * 64-bit words, 24 bytes).  In the new layout the packed words are read
 * through frec_magic()/frec_prev() and frec_flist()/frec_len().
 */
struct tdb_free_record {
uint64_t magic_and_meta; /* TDB_OFF_UPPER_STEAL bits of magic */
uint64_t data_len; /* Not counting these two fields. */
/* This is why the minimum record size is 16 bytes. */
uint64_t next, prev;
uint64_t magic_and_prev; /* TDB_OFF_UPPER_STEAL bits magic, then prev */
uint64_t flist_and_len; /* Len not counting these two fields. */
/* This is why the minimum record size is 8 bytes. */
uint64_t next;
};
/*
 * Return the "prev" value packed into a free record header: the low
 * (64 - TDB_OFF_UPPER_STEAL) bits of magic_and_prev.  The upper
 * TDB_OFF_UPPER_STEAL bits of that word hold the magic and are masked off.
 */
static inline uint64_t frec_prev(const struct tdb_free_record *f)
{
	const uint64_t prev_mask = (1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1;

	return f->magic_and_prev & prev_mask;
}
/*
 * Return the magic stored in the top TDB_OFF_UPPER_STEAL bits of the
 * first header word.  Diff view: the first return is the old line
 * (field named magic_and_meta), the second is its replacement (field
 * renamed to magic_and_prev); the shift is the same in both.
 */
static inline uint64_t frec_magic(const struct tdb_free_record *f)
{
return f->magic_and_meta >> (64 - TDB_OFF_UPPER_STEAL);
return f->magic_and_prev >> (64 - TDB_OFF_UPPER_STEAL);
}
/*
 * Return the length packed into a free record header: the low
 * (64 - TDB_OFF_UPPER_STEAL) bits of flist_and_len.  The upper
 * TDB_OFF_UPPER_STEAL bits of that word carry the free-list index and
 * are masked off here.
 */
static inline uint64_t frec_len(const struct tdb_free_record *f)
{
	const uint64_t len_mask = (1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1;

	return f->flist_and_len & len_mask;
}
/*
 * Return the free-list value recorded in a free record header.
 * Diff view: the first signature/return pair is the old version, which
 * took the low (64 - TDB_OFF_UPPER_STEAL) bits of magic_and_meta and
 * returned them as uint64_t; the second pair is the new version, which
 * takes the top TDB_OFF_UPPER_STEAL bits of flist_and_len and returns
 * them as unsigned.
 */
static inline uint64_t frec_flist(const struct tdb_free_record *f)
static inline unsigned frec_flist(const struct tdb_free_record *f)
{
return f->magic_and_meta & ((1ULL << (64 - TDB_OFF_UPPER_STEAL)) - 1);
return f->flist_and_len >> (64 - TDB_OFF_UPPER_STEAL);
}
struct tdb_recovery_record {
......@@ -311,6 +321,7 @@ struct tdb_context {
/* What freelist are we using? */
uint64_t flist_off;
unsigned int flist;
/* IO methods: changes for transactions. */
const struct tdb_methods *methods;
......
......@@ -63,7 +63,7 @@ static bool summarize(struct tdb_context *tdb,
|| p->r.magic == TDB_RECOVERY_MAGIC) {
len = sizeof(p->r) + p->r.max_len;
} else if (rec_magic(&p->u) != TDB_MAGIC) {
len = p->f.data_len;
len = frec_len(&p->f);
tally_add(free, len);
tally_add(buckets, size_to_bucket(len));
len += sizeof(p->u);
......
......@@ -136,9 +136,11 @@ static void set_freelist(void *mem, struct tdb_context *tdb,
/*
 * Add a free record at offset eoff to the given free table.
 *
 * The context is first pointed at the target table, both by index
 * (tdb->flist) and by offset (tdb->flist_off, taken from
 * freelist->base.off).  add_free_record() is then called with elen plus
 * the size of a struct tdb_used_record as the length.
 */
static void add_to_freetable(struct tdb_context *tdb,
			     tdb_off_t eoff,
			     tdb_off_t elen,
			     unsigned flist,
			     struct tle_freelist *freelist)
{
	/* Select the free table before inserting the record. */
	tdb->flist = flist;
	tdb->flist_off = freelist->base.off;

	add_free_record(tdb, eoff, elen + sizeof(struct tdb_used_record));
}
......@@ -288,6 +290,7 @@ struct tdb_context *tdb_layout_get(struct tdb_layout *layout)
switch (e->base.type) {
case FREE:
add_to_freetable(tdb, e->base.off, e->free.len,
e->free.flist_num,
find_flist(layout, e->free.flist_num));
break;
case DATA:
......
......@@ -17,7 +17,7 @@ static tdb_len_t free_record_length(struct tdb_context *tdb, tdb_off_t off)
return TDB_OFF_ERR;
if (frec_magic(&f) != TDB_FREE_MAGIC)
return TDB_OFF_ERR;
return f.data_len;
return frec_len(&f);
}
int main(int argc, char *argv[])
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment