Commit 3b853c9a authored by Zardosht Kasheff, committed by Yoni Fogel

closes #5804, merge 51139, work to read in entire internal nodes into memory,...

closes #5804, merge 51139, work to read entire internal nodes into memory, compressed, as opposed to using two I/Os, to main.

git-svn-id: file:///svn/toku/tokudb@51185 c7de825b-a66e-492c-adef-691d508d4ae1
parent 856ab364
......@@ -87,10 +87,13 @@ struct ftnode_fetch_extra {
// and the user is doing a dictionary wide scan, then
// even though a query may only want one basement node,
// we fetch all basement nodes in a leaf node.
bool disable_prefetching;
bool disable_prefetching;
// this value will be set during the fetch_callback call by toku_ftnode_fetch_callback or toku_ftnode_pf_req_callback
// these callbacks need to evaluate this anyway, so we cache it here so the search code does not reevaluate it
int child_to_read;
// when we read internal nodes, we want to read all the data off disk in one I/O
// then we'll treat it as normal and only decompress the needed partitions etc.
bool read_all_partitions;
// Accounting: How many bytes were fetched, and how much time did it take?
tokutime_t bytes_read;
uint64_t read_time;
......@@ -724,6 +727,7 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h)
bfe->right_is_pos_infty = false;
bfe->child_to_read = -1;
bfe->disable_prefetching = false;
bfe->read_all_partitions = false;
bfe->bytes_read = 0;
bfe->read_time = 0;
}
......@@ -742,7 +746,8 @@ static inline void fill_bfe_for_subset_read(
DBT *right,
bool left_is_neg_infty,
bool right_is_pos_infty,
bool disable_prefetching
bool disable_prefetching,
bool read_all_partitions
)
{
paranoid_invariant(h->h->type == FT_CURRENT);
......@@ -755,6 +760,7 @@ static inline void fill_bfe_for_subset_read(
bfe->right_is_pos_infty = right_is_pos_infty;
bfe->child_to_read = -1;
bfe->disable_prefetching = disable_prefetching;
bfe->read_all_partitions = read_all_partitions;
bfe->bytes_read = 0;
bfe->read_time = 0;
}
......@@ -776,6 +782,7 @@ static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) {
bfe->right_is_pos_infty = false;
bfe->child_to_read = -1;
bfe->disable_prefetching = false;
bfe->read_all_partitions = false;
bfe->bytes_read = 0;
bfe->read_time = 0;
}
......@@ -824,6 +831,7 @@ static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe,
bfe->right_is_pos_infty = c->right_is_pos_infty;
bfe->child_to_read = -1;
bfe->disable_prefetching = c->disable_prefetching;
bfe->read_all_partitions = false;
bfe->bytes_read = 0;
bfe->read_time = 0;
}
......
......@@ -4860,6 +4860,9 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, childnum);
FTNODE childnode;
// If the current node's height is greater than 1, then its child is an internal node.
// Therefore, to warm the cache better (#5798), we want to read all the partitions off disk in one shot.
bool read_all_partitions = node->height > 1;
struct ftnode_fetch_extra bfe;
fill_bfe_for_subset_read(
&bfe,
......@@ -4869,7 +4872,8 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
&ftcursor->range_lock_right_key,
ftcursor->left_is_neg_infty,
ftcursor->right_is_pos_infty,
ftcursor->disable_prefetching
ftcursor->disable_prefetching,
read_all_partitions
);
bool msgs_applied = false;
{
......@@ -5195,7 +5199,8 @@ try_again:
&ftcursor->range_lock_right_key,
ftcursor->left_is_neg_infty,
ftcursor->right_is_pos_infty,
ftcursor->disable_prefetching
ftcursor->disable_prefetching,
true // We may as well always read the whole root into memory, if it's a leaf node it's a tiny tree anyway.
);
FTNODE node = NULL;
{
......
......@@ -2445,9 +2445,15 @@ toku_deserialize_ftnode_from (int fd,
toku_trace("deserial start");
int r = 0;
struct rbuf rb = RBUF_INITIALIZER;
read_ftnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->h, &rb, bfe);
r = deserialize_ftnode_header_from_rbuf_if_small_enough(ftnode, ndd, blocknum, fullhash, bfe, &rb, fd);
if (!bfe->read_all_partitions) {
read_ftnode_header_from_fd_into_rbuf_if_small_enough(fd, blocknum, bfe->h, &rb, bfe);
r = deserialize_ftnode_header_from_rbuf_if_small_enough(ftnode, ndd, blocknum, fullhash, bfe, &rb, fd);
} else {
// force us to do it the old way
r = -1;
}
if (r != 0) {
// Something went wrong, go back to doing it the old way.
r = deserialize_ftnode_from_fd(fd, blocknum, fullhash, ftnode, ndd, bfe, NULL);
......
......@@ -192,6 +192,7 @@ test_subset_read(int fd, FT_HANDLE UU(brt), FT brt_h) {
&right,
false,
false,
false,
false
);
......
......@@ -164,6 +164,7 @@ test2(int fd, FT brt_h, FTNODE *dn) {
&right,
true,
true,
false,
false
);
FTNODE_DISK_DATA ndd = NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment