Commit cfc7d301 authored by Rusty Russell

tdb2: reduce transaction before writing to recovery area.

We don't need to write the whole page to the recovery area if it
hasn't all changed.  Simply skipping the start and end of the pages
which are similar saves us about 20% on growtdb-bench 250000, and 45%
on tdbtorture.  The more thorough examination of page differences
gives us a saving of 90% on growtdb-bench and 98% on tdbtorture!

And we do win a bit on timings for transaction commit:

Before:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000
real	1m4.844s
user	0m15.537s
sys	0m3.796s
-rw------- 1 rusty rusty 626693096 2011-04-27 21:28 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK

real	1m17.021s
user	0m0.272s
sys	0m0.540s
-rw------- 1 rusty rusty 458800 2011-04-27 21:29 torture.tdb
Adding 2000000 records:  894 ns (110556088 bytes)
Finding 2000000 records:  569 ns (110556088 bytes)
Missing 2000000 records:  390 ns (110556088 bytes)
Traversing 2000000 records:  403 ns (110556088 bytes)
Deleting 2000000 records:  710 ns (244003768 bytes)
Re-adding 2000000 records:  825 ns (244003768 bytes)
Appending 2000000 records:  1262 ns (268404160 bytes)
Churning 2000000 records:  2311 ns (268404160 bytes)


After:
$ time ./growtdb-bench 250000 10 > /dev/null && ls -l /tmp/growtdb.tdb && time ./tdbtorture -s 0 && ls -l torture.tdb && ./speed --transaction 2000000
real	0m50.366s
user	0m17.109s
sys	0m2.468s
-rw------- 1 rusty rusty 564215952 2011-04-27 21:31 /tmp/growtdb.tdb
testing with 3 processes, 5000 loops, seed=0
OK

real	1m23.818s
user	0m0.304s
sys	0m0.508s
-rw------- 1 rusty rusty 669856 2011-04-27 21:32 torture.tdb
Adding 2000000 records:  887 ns (110556088 bytes)
Finding 2000000 records:  556 ns (110556088 bytes)
Missing 2000000 records:  385 ns (110556088 bytes)
Traversing 2000000 records:  401 ns (110556088 bytes)
Deleting 2000000 records:  710 ns (244003768 bytes)
Re-adding 2000000 records:  825 ns (244003768 bytes)
Appending 2000000 records:  1255 ns (268404160 bytes)
Churning 2000000 records:  2299 ns (268404160 bytes)
parent ba7740e6
......@@ -130,6 +130,7 @@ static enum TDB_ERROR tdb_oob(struct tdb_context *tdb, tdb_off_t len,
/* Endian conversion: we only ever deal with 8 byte quantities */
void *tdb_convert(const struct tdb_context *tdb, void *buf, tdb_len_t size)
{
assert(size % 8 == 0);
if (unlikely((tdb->flags & TDB_CONVERT)) && buf) {
uint64_t i, *p = (uint64_t *)buf;
for (i = 0; i < size / 8; i++)
......
......@@ -757,6 +757,44 @@ static void set_recovery_header(struct tdb_recovery_record *rec,
rec->eof = oldsize;
}
/* Return how many leading bytes of @new and @old are identical,
 * up to a maximum of @length.  Used to skip unchanged data at the
 * head of a page before it is copied into the recovery area. */
static unsigned int same(const unsigned char *new,
			 const unsigned char *old,
			 unsigned int length)
{
	unsigned int n = 0;

	while (n < length && new[n] == old[n])
		n++;
	return n;
}
/* Return the length of the differing region at the start of @new vs @old.
 *
 * The region ends only when a run of at least @min_same identical bytes
 * is found (shorter matching runs are absorbed, since recording a tiny
 * unchanged stretch would cost more than rewriting it).  On return,
 * @samelen holds the length of the terminating identical run, or 0 if
 * the difference extends to @length.
 */
static unsigned int different(const unsigned char *new,
			      const unsigned char *old,
			      unsigned int length,
			      unsigned int min_same,
			      unsigned int *samelen)
{
	unsigned int pos, run = 0;

	for (pos = 0; pos < length; pos++) {
		if (new[pos] != old[pos]) {
			/* A matching run long enough to be worth keeping
			 * terminates the differing region. */
			if (run >= min_same) {
				*samelen = run;
				return pos - run;
			}
			/* Too short to matter: fold it into the diff. */
			run = 0;
		} else {
			run++;
		}
	}

	/* Trailing run only counts if it meets the threshold. */
	*samelen = (run >= min_same) ? run : 0;
	return length - *samelen;
}
/*
setup the recovery data that will be used on a crash during commit
*/
......@@ -791,9 +829,6 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
}
rec = (struct tdb_recovery_record *)data;
set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
recovery_size, recovery_max_size, old_map_size);
tdb_convert(tdb, rec, sizeof(*rec));
/* build the recovery data into a single blob to allow us to do a single
large write, which should be more efficient */
......@@ -801,6 +836,8 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
for (i=0;i<tdb->transaction->num_blocks;i++) {
tdb_off_t offset;
tdb_len_t length;
unsigned int off;
unsigned char buffer[PAGESIZE];
if (tdb->transaction->blocks[i] == NULL) {
continue;
......@@ -823,50 +860,60 @@ static enum TDB_ERROR transaction_setup_recovery(struct tdb_context *tdb,
" transaction data over new region"
" boundary");
}
memcpy(p, &offset, sizeof(offset));
memcpy(p + sizeof(offset), &length, sizeof(length));
tdb_convert(tdb, p, sizeof(offset) + sizeof(length));
/* the recovery area contains the old data, not the
new data, so we have to call the original tdb_read
method to get it */
if (offset + length > old_map_size) {
/* Short read at EOF, and zero fill. */
unsigned int len = old_map_size - offset;
ecode = methods->tread(tdb, offset,
p + sizeof(offset) + sizeof(length),
len);
memset(p + sizeof(offset) + sizeof(length) + len, 0,
length - len);
} else {
ecode = methods->tread(tdb, offset,
p + sizeof(offset) + sizeof(length),
length);
/* Short read at EOF. */
length = old_map_size - offset;
}
ecode = methods->tread(tdb, offset, buffer, length);
if (ecode != TDB_SUCCESS) {
free(data);
return ecode;
}
p += sizeof(offset) + sizeof(length) + length;
/* Skip over anything the same at the start. */
off = same(tdb->transaction->blocks[i], buffer, length);
offset += off;
while (off < length) {
tdb_len_t len;
unsigned int samelen;
len = different(tdb->transaction->blocks[i] + off,
buffer + off, length - off,
sizeof(offset) + sizeof(len) + 1,
&samelen);
memcpy(p, &offset, sizeof(offset));
memcpy(p + sizeof(offset), &len, sizeof(len));
tdb_convert(tdb, p, sizeof(offset) + sizeof(len));
p += sizeof(offset) + sizeof(len);
memcpy(p, buffer + off, len);
p += len;
off += len + samelen;
offset += len + samelen;
}
}
/* Now we know size, set up rec header. */
set_recovery_header(rec, TDB_RECOVERY_INVALID_MAGIC,
p - data - sizeof(*rec),
recovery_max_size, old_map_size);
tdb_convert(tdb, rec, sizeof(*rec));
/* write the recovery data to the recovery area */
ecode = methods->twrite(tdb, recovery_offset, data,
sizeof(*rec) + recovery_size);
ecode = methods->twrite(tdb, recovery_offset, data, p - data);
if (ecode != TDB_SUCCESS) {
free(data);
return tdb_logerr(tdb, ecode, TDB_LOG_ERROR,
"tdb_transaction_setup_recovery:"
" failed to write recovery data");
}
transaction_write_existing(tdb, recovery_offset, data,
sizeof(*rec) + recovery_size);
transaction_write_existing(tdb, recovery_offset, data, p - data);
/* as we don't have ordered writes, we have to sync the recovery
data before we update the magic to indicate that the recovery
data is present */
ecode = transaction_sync(tdb, recovery_offset,
sizeof(*rec) + recovery_size);
ecode = transaction_sync(tdb, recovery_offset, p - data);
if (ecode != TDB_SUCCESS) {
free(data);
return ecode;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment