Commit 1164029f authored by Yoni Fogel

fixes #6086 Merge 6086 to main. We now read in basement nodes if a full...

fixes #6086 Merge 6086 to main.  We now read in a basement node if a full keyrange resides in it, so that we can answer more accurately.


git-svn-id: file:///svn/toku/tokudb@54342 c7de825b-a66e-492c-adef-691d508d4ae1
parent 76fe5767
...@@ -420,7 +420,9 @@ static void print_db_struct (void) { ...@@ -420,7 +420,9 @@ static void print_db_struct (void) {
STRUCT_SETUP(DB, set_pagesize, "int (*%s) (DB *, uint32_t)"); STRUCT_SETUP(DB, set_pagesize, "int (*%s) (DB *, uint32_t)");
STRUCT_SETUP(DB, stat, "int (*%s) (DB *, void *, uint32_t)"); STRUCT_SETUP(DB, stat, "int (*%s) (DB *, void *, uint32_t)");
STRUCT_SETUP(DB, verify, "int (*%s) (DB *, const char *, const char *, FILE *, uint32_t)"); STRUCT_SETUP(DB, verify, "int (*%s) (DB *, const char *, const char *, FILE *, uint32_t)");
const char *extra[]={"int (*key_range64)(DB*, DB_TXN *, DBT *, uint64_t *less, uint64_t *equal, uint64_t *greater, int *is_exact)", const char *extra[]={
"int (*key_range64)(DB*, DB_TXN *, DBT *, uint64_t *less, uint64_t *equal, uint64_t *greater, int *is_exact)",
"int (*keys_range64)(DB*, DB_TXN *, DBT *keyleft, DBT *keyright, uint64_t *less, uint64_t *left, uint64_t *between, uint64_t *right, uint64_t *greater, bool *middle_3_exact)",
"int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *)", "int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *)",
"int (*pre_acquire_table_lock)(DB*, DB_TXN*)", "int (*pre_acquire_table_lock)(DB*, DB_TXN*)",
"int (*pre_acquire_fileops_lock)(DB*, DB_TXN*)", "int (*pre_acquire_fileops_lock)(DB*, DB_TXN*)",
......
...@@ -51,9 +51,24 @@ enum ftnode_fetch_type { ...@@ -51,9 +51,24 @@ enum ftnode_fetch_type {
ftnode_fetch_none=1, // no partitions needed. ftnode_fetch_none=1, // no partitions needed.
ftnode_fetch_subset, // some subset of partitions needed ftnode_fetch_subset, // some subset of partitions needed
ftnode_fetch_prefetch, // this is part of a prefetch call ftnode_fetch_prefetch, // this is part of a prefetch call
ftnode_fetch_all // every partition is needed ftnode_fetch_all, // every partition is needed
ftnode_fetch_keymatch, // one child is needed if it holds both keys
}; };
// Prototype carries the UU() marker (presumably the project's "unused"
// attribute macro -- confirm) so that including this header without calling
// the helper does not trigger an unused-function warning.
static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) UU();

// Returns true iff `type` is one of the named ftnode_fetch_type enumerators.
// Any other value (e.g. an uninitialized or corrupted field) yields false.
static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) {
    bool recognized = false;
    switch (type) {
    case ftnode_fetch_none:
    case ftnode_fetch_subset:
    case ftnode_fetch_prefetch:
    case ftnode_fetch_all:
    case ftnode_fetch_keymatch:
        recognized = true;
        break;
    default:
        break;
    }
    return recognized;
}
// //
// An extra parameter passed to cachetable functions // An extra parameter passed to cachetable functions
// That is used in all types of fetch callbacks. // That is used in all types of fetch callbacks.
...@@ -730,6 +745,46 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h) ...@@ -730,6 +745,46 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h)
bfe->decompress_time = 0; bfe->decompress_time = 0;
} }
//
// Helper function to fill a ftnode_fetch_extra for a "keymatch" fetch:
// the fetch callback is told the [left, right] key bounds so that it can
// decide whether a single child holds both keys (e.g. for keysrange when
// the left and right key are in the same basement node).
//
// Parameters:
//   bfe   - the fetch-extra to initialize; every field set below is overwritten.
//   h     - the FT the fetch is for; must be the FT_CURRENT header.
//   left  - left bound, or nullptr meaning negative infinity.
//   right - right bound, or nullptr meaning positive infinity.
//   disable_prefetching, read_all_partitions - copied verbatim into bfe.
//
// The bounds are stored with toku_copyref_dbt, so presumably the caller's key
// buffers must stay valid for as long as bfe is in use -- confirm.
//
static inline void fill_bfe_for_keymatch(
    struct ftnode_fetch_extra *bfe,
    FT h,
    DBT *left,
    DBT *right,
    bool disable_prefetching,
    bool read_all_partitions
    )
{
    paranoid_invariant(h->h->type == FT_CURRENT);

    const bool have_left = (left != nullptr);
    const bool have_right = (right != nullptr);

    // What kind of fetch this is, and for which tree.
    bfe->type = ftnode_fetch_keymatch;
    bfe->h = h;
    bfe->search = nullptr;

    // Left bound: start from an empty DBT, then reference the caller's key if given.
    toku_init_dbt(&bfe->range_lock_left_key);
    if (have_left) {
        toku_copyref_dbt(&bfe->range_lock_left_key, *left);
    }
    bfe->left_is_neg_infty = !have_left;

    // Right bound: same treatment.
    toku_init_dbt(&bfe->range_lock_right_key);
    if (have_right) {
        toku_copyref_dbt(&bfe->range_lock_right_key, *right);
    }
    bfe->right_is_pos_infty = !have_right;

    // No child has been selected yet; -1 is the "none" marker here.
    bfe->child_to_read = -1;
    bfe->disable_prefetching = disable_prefetching;
    bfe->read_all_partitions = read_all_partitions;

    // Reset the per-fetch statistics.
    bfe->bytes_read = 0;
    bfe->io_time = 0;
    bfe->deserialize_time = 0;
    bfe->decompress_time = 0;
}
// //
// Helper function to fill a ftnode_fetch_extra with data // Helper function to fill a ftnode_fetch_extra with data
// that will tell the fetch callback that some subset of the node // that will tell the fetch callback that some subset of the node
......
This diff is collapsed.
...@@ -205,6 +205,7 @@ enum ft_flags { ...@@ -205,6 +205,7 @@ enum ft_flags {
}; };
void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater); void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater);
void toku_ft_keysrange(FT_HANDLE brt, DBT* key_left, DBT* key_right, uint64_t *less_p, uint64_t* equal_left_p, uint64_t* middle_p, uint64_t* equal_right_p, uint64_t* greater_p, bool* middle_3_exact_p);
struct ftstat64_s { struct ftstat64_s {
uint64_t nkeys; /* estimate how many unique keys (even when flattened this may be an estimate) */ uint64_t nkeys; /* estimate how many unique keys (even when flattened this may be an estimate) */
......
...@@ -1372,6 +1372,20 @@ update_bfe_using_ftnode(FTNODE node, struct ftnode_fetch_extra *bfe) ...@@ -1372,6 +1372,20 @@ update_bfe_using_ftnode(FTNODE node, struct ftnode_fetch_extra *bfe)
node, node,
bfe->search bfe->search
); );
} else if (bfe->type == ftnode_fetch_keymatch) {
// we do not take into account prefetching yet
// as of now, if we need a subset, the only thing
// we can possibly require is a single basement node
// we find out what basement node the query cares about
// and check if it is available
paranoid_invariant(bfe->h->compare_fun);
if (node->height == 0) {
int left_child = toku_bfe_leftmost_child_wanted(bfe, node);
int right_child = toku_bfe_rightmost_child_wanted(bfe, node);
if (left_child == right_child) {
bfe->child_to_read = left_child;
}
}
} }
} }
...@@ -1688,7 +1702,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode, ...@@ -1688,7 +1702,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
// rbuf, so we might be able to store the compressed data for some // rbuf, so we might be able to store the compressed data for some
// objects. // objects.
// We can proceed to deserialize the individual subblocks. // We can proceed to deserialize the individual subblocks.
paranoid_invariant(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch); paranoid_invariant(is_valid_ftnode_fetch_type(bfe->type));
// setup the memory of the partitions // setup the memory of the partitions
// for partitions being decompressed, create either FIFO or basement node // for partitions being decompressed, create either FIFO or basement node
...@@ -2323,7 +2337,7 @@ deserialize_ftnode_from_rbuf( ...@@ -2323,7 +2337,7 @@ deserialize_ftnode_from_rbuf(
// now that the node info has been deserialized, we can proceed to deserialize // now that the node info has been deserialized, we can proceed to deserialize
// the individual sub blocks // the individual sub blocks
paranoid_invariant(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch); paranoid_invariant(is_valid_ftnode_fetch_type(bfe->type));
// setup the memory of the partitions // setup the memory of the partitions
// for partitions being decompressed, create either FIFO or basement node // for partitions being decompressed, create either FIFO or basement node
......
...@@ -66,6 +66,49 @@ static void maybe_reopen (enum memory_state ms, uint64_t limit) { ...@@ -66,6 +66,49 @@ static void maybe_reopen (enum memory_state ms, uint64_t limit) {
assert(0); assert(0);
} }
// Checks one toku_ft_keysrange() answer (less / equal1 / middle / equal2 /
// greater, plus the middle3exact flag) for the query [intkey1, intkey2]
// against the ideal answer.
//
// The "ideal" figures below are computed as if the tree held exactly `limit`
// keys, namely the odd integers 1, 3, ..., 2*limit-1 (that is what the
// arithmetic encodes; the caller is responsible for having built such a tree).
//
// Estimated counts only have to be within limit/10 of the ideal value; the
// equality counts must either be ideal or be 0 with middle3exact false; and
// when middle3exact is set, `middle` must be exactly right.
// `ms` is accepted for symmetry with the other helpers but is not used.
static void verify_keysrange(enum memory_state UU(ms), uint64_t limit,
        uint64_t intkey1,
        uint64_t intkey2,
        uint64_t less,
        uint64_t equal1,
        uint64_t middle,
        uint64_t equal2,
        uint64_t greater,
        bool middle3exact) {
    const uint64_t largest_key = 2 * limit - 1;
    const uint64_t ideal_total = limit;

    // Odd keys strictly below intkey1.
    const uint64_t ideal_less = intkey1 / 2;
    // A bound matches a stored key only when it is odd (and, for the right
    // bound, not past the largest key).
    const uint64_t ideal_equal1 = (intkey1 % 2 == 1) ? 1 : 0;
    const uint64_t ideal_equal2 = (intkey2 % 2 == 1 && intkey2 <= largest_key) ? 1 : 0;
    // Odd keys strictly above intkey2.
    uint64_t ideal_greater = 0;
    if (intkey2 < largest_key) {
        ideal_greater = (largest_key + 1 - intkey2) / 2;
    }
    // Whatever is left lies strictly between the two bounds.
    const uint64_t ideal_middle = ideal_total - ideal_less - ideal_equal1 - ideal_equal2 - ideal_greater;

    // The grand total must be in the right ballpark: (ideal/2, 2*ideal).
    const uint64_t reported_total = less + equal1 + middle + equal2 + greater;
    assert(reported_total > 0);
    assert(reported_total < 2 * ideal_total);
    assert(reported_total > ideal_total / 2);

    // An inexact answer is allowed to miss an equal key, never to invent one.
    assert(equal1 == ideal_equal1 || (equal1 == 0 && !middle3exact));
    assert(equal2 == ideal_equal2 || (equal2 == 0 && !middle3exact));

    // As of 2013-02-25 this is accurate with tolerance ~= total/50.
    // Set to 1/10th to prevent flakiness.
    const uint64_t tolerance = ideal_total / 10;

    assert(less + tolerance > ideal_less);
    assert(less < ideal_less + tolerance);

    assert(middle + tolerance > ideal_middle);
    assert(middle < ideal_middle + tolerance);

    assert(greater + tolerance > ideal_greater);
    assert(greater < ideal_greater + tolerance);

    if (middle3exact) {
        assert(middle == ideal_middle);
    }
}
static void test_keyrange (enum memory_state ms, uint64_t limit) { static void test_keyrange (enum memory_state ms, uint64_t limit) {
open_ft_and_ct(true); open_ft_and_ct(true);
...@@ -123,7 +166,9 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) { ...@@ -123,7 +166,9 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) {
#endif #endif
} else { } else {
// after reopen, none of the basements are in memory // after reopen, none of the basements are in memory
assert(equal == 0); // However, "both" keys can be in the same basement (specifically the last basement node in the tree)
// Without trying to figure out how many are in the last basement node, we expect at least the first half not to be in the last basement node.
assert(i > limit / 2 || equal == 0);
#if 0 #if 0
if (i<10) { if (i<10) {
assert(less==0); assert(less==0);
...@@ -189,6 +234,80 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) { ...@@ -189,6 +234,80 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) {
#endif #endif
} }
maybe_reopen(ms, limit);
{
uint64_t totalqueries = 0;
uint64_t num_middle3_exact = 0;
for (uint64_t i=0; i < 2*limit; i++) {
char key[100];
char keyplus4[100];
char keyplus5[100];
uint64_t intkey = i;
snprintf(key, 100, "%08" PRIu64 "", intkey);
snprintf(keyplus4, 100, "%08" PRIu64 "", intkey+4);
snprintf(keyplus5, 100, "%08" PRIu64 "", intkey+5);
DBT k;
DBT k2;
DBT k3;
toku_fill_dbt(&k, key, 1+strlen(key));
toku_fill_dbt(&k2, keyplus4, 1+strlen(keyplus4));
toku_fill_dbt(&k3, keyplus5, 1+strlen(keyplus5));
uint64_t less,equal1,middle,equal2,greater;
bool middle3exact;
toku_ft_keysrange(t, &k, &k2, &less, &equal1, &middle, &equal2, &greater, &middle3exact);
if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
//TODO(yoni): when reading basement nodes is implemented, get rid of this hack
middle3exact = false;
}
totalqueries++;
num_middle3_exact += middle3exact;
if (verbose > 1) {
printf("Rkey2 %" PRIu64 "/%" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %s\n",
intkey, 2*limit, less, equal1, middle, equal2, greater, middle3exact ? "true" : "false");
}
verify_keysrange(ms, limit, intkey, intkey+4,
less, equal1, middle, equal2, greater, middle3exact);
toku_ft_keysrange(t, &k, &k3, &less, &equal1, &middle, &equal2, &greater, &middle3exact);
if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
//TODO(yoni): when reading basement nodes is implemented, get rid of this hack
middle3exact = false;
}
totalqueries++;
num_middle3_exact += middle3exact;
if (verbose > 1) {
printf("Rkey3 %" PRIu64 "/%" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %" PRIu64
" %s\n",
intkey, 2*limit, less, equal1, middle, equal2, greater, middle3exact ? "true" : "false");
}
verify_keysrange(ms, limit, intkey, intkey+5,
less, equal1, middle, equal2, greater, middle3exact);
}
assert(num_middle3_exact <= totalqueries);
if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
//TODO(yoni): when reading basement nodes is implemented, get rid of this hack
assert(num_middle3_exact == 0);
} else {
// About 85% of the time, the key for an int (and +4 or +5) is in the
// same basement node. Check >= 70% so this isn't very flaky.
assert(num_middle3_exact > totalqueries * 7 / 10);
}
}
close_ft_and_ct(); close_ft_and_ct();
} }
......
...@@ -695,12 +695,8 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS) ...@@ -695,12 +695,8 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS)
declare_custom_tests(keyrange.tdb) declare_custom_tests(keyrange.tdb)
add_ydb_test_aux(keyrange-get0.tdb keyrange.tdb --get 0) add_ydb_test_aux(keyrange-get0.tdb keyrange.tdb --get 0)
add_ydb_test_aux(keyrange-get1.tdb keyrange.tdb --get 1) add_ydb_test_aux(keyrange-get1.tdb keyrange.tdb --get 1)
if (0)
add_ydb_test_aux(keyrange-random-get0.tdb keyrange.tdb --get 0 --random_keys 1) add_ydb_test_aux(keyrange-random-get0.tdb keyrange.tdb --get 0 --random_keys 1)
add_ydb_test_aux(keyrange-random-get1.tdb keyrange.tdb --get 1 --random_keys 1) add_ydb_test_aux(keyrange-random-get1.tdb keyrange.tdb --get 1 --random_keys 1)
else ()
message(WARNING "TODO(leif): re-enable keyrange tests, see #5666")
endif ()
add_ydb_test_aux(keyrange-loader-get0.tdb keyrange.tdb --get 0 --loader 1) add_ydb_test_aux(keyrange-loader-get0.tdb keyrange.tdb --get 0 --loader 1)
add_ydb_test_aux(keyrange-loader-get1.tdb keyrange.tdb --get 1 --loader 1) add_ydb_test_aux(keyrange-loader-get1.tdb keyrange.tdb --get 1 --loader 1)
......
...@@ -60,7 +60,7 @@ run_test(void) { ...@@ -60,7 +60,7 @@ run_test(void) {
size_t key_size = 9; size_t key_size = 9;
size_t val_size = 9; size_t val_size = 9;
size_t est_row_size_with_overhead = 8 + key_size + 4 + val_size + 4; // xid + key + key_len + val + val)len size_t est_row_size_with_overhead = 8 + key_size + 4 + val_size + 4 + 5; // xid + key + key_len + val + val_len + mvcc overhead
size_t rows_per_basement = db_basement_size / est_row_size_with_overhead; size_t rows_per_basement = db_basement_size / est_row_size_with_overhead;
int r; int r;
...@@ -72,7 +72,8 @@ run_test(void) { ...@@ -72,7 +72,8 @@ run_test(void) {
r = env->open(env, envdir, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = env->open(env, envdir, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
r = db_create(&db, env, 0); CKERR(r); r = db_create(&db, env, 0); CKERR(r);
r = db->set_pagesize(db, db_page_size); r = db->set_pagesize(db, db_page_size); CKERR(r);
r = db->set_readpagesize(db, db_basement_size); CKERR(r);
r = env->txn_begin(env, 0, &txn, 0); CKERR(r); r = env->txn_begin(env, 0, &txn, 0); CKERR(r);
r = db->open(db, txn, "foo.db", 0, DB_BTREE, DB_CREATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = db->open(db, txn, "foo.db", 0, DB_BTREE, DB_CREATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
r = txn->commit(txn, 0); CKERR(r); r = txn->commit(txn, 0); CKERR(r);
...@@ -145,7 +146,11 @@ run_test(void) { ...@@ -145,7 +146,11 @@ run_test(void) {
if (0) goto skipit; // debug: just write the tree if (0) goto skipit; // debug: just write the tree
bool last_basement;
last_basement = false;
// verify key_range for keys that exist in the tree // verify key_range for keys that exist in the tree
uint64_t random_fudge;
random_fudge = random_keys ? rows_per_basement + nrows / 10 : 0;
for (uint64_t i=0; i<nrows; i++) { for (uint64_t i=0; i<nrows; i++) {
char key[100]; char key[100];
snprintf(key, 100, "%08llu", (unsigned long long)2*i+1); snprintf(key, 100, "%08llu", (unsigned long long)2*i+1);
...@@ -160,15 +165,31 @@ run_test(void) { ...@@ -160,15 +165,31 @@ run_test(void) {
assert(0 < less + equal + greater); assert(0 < less + equal + greater);
if (use_loader) { if (use_loader) {
assert(less + equal + greater <= nrows); assert(less + equal + greater <= nrows);
assert(get_all ? equal == 1 : equal == 0); if (get_all || last_basement) {
assert(equal == 1);
} else if (i < nrows - rows_per_basement * 2) {
assert(equal == 0);
} else if (i == nrows - 1) {
assert(equal == 1);
} else if (equal == 1) {
last_basement = true;
}
assert(less <= max64(i, i + rows_per_basement/2)); assert(less <= max64(i, i + rows_per_basement/2));
assert(greater <= nrows - less); assert(greater <= nrows - less);
} else { } else {
assert(less + equal + greater <= nrows + nrows / 8); assert(less + equal + greater <= nrows + nrows / 8);
assert(get_all ? equal == 1 : equal == 0); if (get_all || last_basement) {
uint64_t est_i = max64(i, i + rows_per_basement/2); assert(equal == 1);
assert(less <= est_i + est_i / 1); } else if (i < nrows - rows_per_basement * 2) {
assert(greater <= nrows - i + rows_per_basement/2); assert(equal == 0);
} else if (i == nrows - 1) {
assert(equal == 1);
} else if (equal == 1) {
last_basement = true;
}
uint64_t est_i = i * 2 + rows_per_basement;
assert(less <= est_i + random_fudge);
assert(greater <= nrows - i + rows_per_basement + random_fudge);
} }
} }
...@@ -193,9 +214,9 @@ run_test(void) { ...@@ -193,9 +214,9 @@ run_test(void) {
} else { } else {
assert(less + equal + greater <= nrows + nrows / 8); assert(less + equal + greater <= nrows + nrows / 8);
assert(equal == 0); assert(equal == 0);
uint64_t est_i = max64(i, i + rows_per_basement/2); uint64_t est_i = i * 2 + rows_per_basement;
assert(less <= est_i + est_i / 1); assert(less <= est_i + random_fudge);
assert(greater <= nrows - i + rows_per_basement/2); assert(greater <= nrows - i + rows_per_basement + random_fudge);
} }
} }
......
...@@ -642,17 +642,30 @@ toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) { ...@@ -642,17 +642,30 @@ toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
} }
static int static int
toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, uint64_t* less, uint64_t* equal, uint64_t* greater, int* is_exact) { toku_db_keys_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* keyleft, DBT* keyright, uint64_t* less, uint64_t* left, uint64_t* between, uint64_t *right, uint64_t *greater, bool* middle_3_exact) {
HANDLE_PANICKED_DB(db); HANDLE_PANICKED_DB(db);
HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
// note that toku_ft_keyrange does not have a txn param // note that we ignore the txn param. It would be more complicated to support it.
// this will be fixed later // TODO(yoni): Maybe add support for txns later? How would we do this? ydb lock comment about db_keyrange64 is obsolete.
// temporarily, because the caller, locked_db_keyrange, toku_ft_keysrange(db->i->ft_handle, keyleft, keyright, less, left, between, right, greater, middle_3_exact);
// has the ydb lock, we are ok return 0;
toku_ft_keyrange(db->i->ft_handle, key, less, equal, greater); }
// temporarily set is_exact to 0 because ft_keyrange does not have this parameter
*is_exact = 0; static int
toku_db_key_range64(DB* db, DB_TXN* txn, DBT* key, uint64_t* less_p, uint64_t* equal_p, uint64_t* greater_p, int* is_exact) {
    // Single-key range estimate, implemented as the two-key query with the
    // right key omitted (NULL), i.e. the range from `key` to positive infinity.
    //
    // On success returns 0 and fills *less_p, *equal_p, *greater_p and
    // *is_exact.  On failure returns the error from toku_db_keys_range64 and
    // leaves every out-parameter untouched.
    uint64_t less, equal_left, middle, equal_right, greater;
    bool ignore;
    int r = toku_db_keys_range64(db, txn, key, NULL, &less, &equal_left, &middle, &equal_right, &greater, &ignore);
    if (r != 0) {
        // Previously this path fell through to "return 0", reporting success
        // to the caller even though none of the outputs had been written.
        return r;
    }
    *less_p = less;
    *equal_p = equal_left;
    // With the right bound at positive infinity, everything to the right of
    // `key` is counted as "middle", so that is the caller's "greater".
    *greater_p = middle;
    paranoid_invariant_zero(greater); // no keys are greater than positive infinity
    paranoid_invariant_zero(equal_right); // no keys are equal to positive infinity
    // toku_ft_keysrange does not know when all 3 are exact, so set is_exact to false
    *is_exact = 0;
    return 0;
}
...@@ -928,6 +941,7 @@ toku_db_create(DB ** db, DB_ENV * env, uint32_t flags) { ...@@ -928,6 +941,7 @@ toku_db_create(DB ** db, DB_ENV * env, uint32_t flags) {
USDB(pre_acquire_table_lock); USDB(pre_acquire_table_lock);
USDB(pre_acquire_fileops_lock); USDB(pre_acquire_fileops_lock);
USDB(key_range64); USDB(key_range64);
USDB(keys_range64);
USDB(hot_optimize); USDB(hot_optimize);
USDB(stat64); USDB(stat64);
USDB(get_fractal_tree_info64); USDB(get_fractal_tree_info64);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment