refs #5770 Only check one basement node on pin, remove the assumption that adjacent available nodes are query-able.

refs #5770 Only check one basement node on pin, remove the assumption that adjacent
available nodes are query-able.
9858bf38 · John Esmet · 08adc6e1 · 9858bf38 · 9858bf38 · 9858bf38
Commit 9858bf38 authored Jul 15, 2013 by John Esmet
5 changed files
--- a/ft/ft-cachetable-wrappers.cc
+++ b/ft/ft-cachetable-wrappers.cc
@@ -193,6 +193,11 @@ toku_create_new_ftnode (
        NULL);
 }

+//
+// This function assumes that the caller is trying to pin the node with a
+// PL_READ lock. If message application is needed, the read lock is released
+// and, on success, a PL_WRITE_CHEAP lock is held on the node instead.
+//
 int
 toku_pin_ftnode_batched(
    FT_HANDLE brt,
@@ -202,15 +207,22 @@ toku_pin_ftnode_batched(
    ANCESTORS ancestors,
    const PIVOT_BOUNDS bounds,
    FTNODE_FETCH_EXTRA bfe,
-    pair_lock_type lock_type,
    bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    FTNODE *node_p,
    bool* msgs_applied)
 {
    void *node_v;
    *msgs_applied = false;
-    pair_lock_type needed_lock_type = lock_type;
-try_again_for_write_lock:
+    FTNODE node = nullptr;
+    MSN max_msn_in_path = ZERO_MSN;
+    bool needs_ancestors_messages = false;
+    // this function assumes that if you want ancestor messages applied,
+    // you are doing a read for a query. This is so we can make some optimizations
+    // below.
+    if (apply_ancestor_messages) {
+        paranoid_invariant(bfe->type == ftnode_fetch_subset);
+    }
+    
    int r = toku_cachetable_get_and_pin_nonblocking_batched(
            brt->ft->cf,
            blocknum,
@@ -221,63 +233,82 @@ try_again_for_write_lock:
            toku_ftnode_fetch_callback,
            toku_ftnode_pf_req_callback,
            toku_ftnode_pf_callback,
-            needed_lock_type,
+            PL_READ,
            bfe, //read_extraargs
            unlockers);
-    if (r==0) {
-        FTNODE node = static_cast<FTNODE>(node_v);
-        MSN max_msn_in_path;
-        bool needs_ancestors_messages = false;
-        if (apply_ancestor_messages && node->height == 0) {
-            needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(brt->ft, node, ancestors, bounds, &max_msn_in_path);
-            if (needs_ancestors_messages && needed_lock_type == PL_READ) {
-                toku_unpin_ftnode_read_only(brt->ft, node);
-                needed_lock_type = PL_WRITE_CHEAP;
-                goto try_again_for_write_lock;
+    if (r != 0) {
+        assert(r == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
+        goto exit;
+    }
+    node = static_cast<FTNODE>(node_v);
+    if (apply_ancestor_messages && node->height == 0) {
+        needs_ancestors_messages = toku_ft_leaf_needs_ancestors_messages(
+            brt->ft, 
+            node, 
+            ancestors, 
+            bounds, 
+            &max_msn_in_path, 
+            bfe->child_to_read
+            );
+        if (needs_ancestors_messages) {
+            toku_unpin_ftnode_read_only(brt->ft, node);
+            int rr = toku_cachetable_get_and_pin_nonblocking_batched(
+                    brt->ft->cf,
+                    blocknum,
+                    fullhash,
+                    &node_v,
+                    NULL,
+                    get_write_callbacks_for_node(brt->ft),
+                    toku_ftnode_fetch_callback,
+                    toku_ftnode_pf_req_callback,
+                    toku_ftnode_pf_callback,
+                    PL_WRITE_CHEAP,
+                    bfe, //read_extraargs
+                    unlockers);
+            if (rr != 0) {
+                assert(rr == TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
+                r = TOKUDB_TRY_AGAIN;
+                goto exit;
            }
-        }
-        if (apply_ancestor_messages && node->height == 0) {
-            if (needs_ancestors_messages) {
-                invariant(needed_lock_type != PL_READ);
-                toku_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, msgs_applied);
-            } else {
-                // At this point, we aren't going to run
-                // toku_apply_ancestors_messages_to_node but that doesn't
-                // mean max_msn_applied shouldn't be updated if possible
-                // (this saves the CPU work involved in
-                // toku_ft_leaf_needs_ancestors_messages).
-                //
-                // We still have a read lock, so we have not resolved
-                // checkpointing.  If the node is pending and dirty, we
-                // can't modify anything, including max_msn, until we
-                // resolve checkpointing.  If we do, the node might get
-                // written out that way as part of a checkpoint with a
-                // root that was already written out with a smaller
-                // max_msn.  During recovery, we would then inject a
-                // message based on the root's max_msn, and that message
-                // would get filtered by the leaf because it had too high
-                // a max_msn value. (see #5407)
-                //
-                // So for simplicity we only update the max_msn if the
-                // node is clean.  That way, in order for the node to get
-                // written out, it would have to be dirtied.  That
-                // requires a write lock, and a write lock requires you to
-                // resolve checkpointing.
-                if (!node->dirty) {
-                    toku_ft_bn_update_max_msn(node, max_msn_in_path);
-                }
+            node = static_cast<FTNODE>(node_v);
+            toku_apply_ancestors_messages_to_node(
+                brt, 
+                node, 
+                ancestors, 
+                bounds, 
+                msgs_applied,
+                bfe->child_to_read
+                );
+        } else {
+            // At this point, we aren't going to run
+            // toku_apply_ancestors_messages_to_node but that doesn't
+            // mean max_msn_applied shouldn't be updated if possible
+            // (this saves the CPU work involved in
+            // toku_ft_leaf_needs_ancestors_messages).
+            //
+            // We still have a read lock, so we have not resolved
+            // checkpointing.  If the node is pending and dirty, we
+            // can't modify anything, including max_msn, until we
+            // resolve checkpointing.  If we do, the node might get
+            // written out that way as part of a checkpoint with a
+            // root that was already written out with a smaller
+            // max_msn.  During recovery, we would then inject a
+            // message based on the root's max_msn, and that message
+            // would get filtered by the leaf because it had too high
+            // a max_msn value. (see #5407)
+            //
+            // So for simplicity we only update the max_msn if the
+            // node is clean.  That way, in order for the node to get
+            // written out, it would have to be dirtied.  That
+            // requires a write lock, and a write lock requires you to
+            // resolve checkpointing.
+            if (!node->dirty) {
+                toku_ft_bn_update_max_msn(node, max_msn_in_path, bfe->child_to_read);
            }
-            invariant(needed_lock_type != PL_READ || !*msgs_applied);
-        }
-        if ((lock_type != PL_READ) && node->height > 0) {
-            toku_move_ftnode_messages_to_stale(brt->ft, node);
        }
-        *node_p = node;
-        // printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
-    } else {
-        assert(r==TOKUDB_TRY_AGAIN); // Any other error and we should bomb out ASAP.
-        // printf("%*sPin %ld try again\n", 8, "", blocknum.b);
    }
+    *node_p = node;
+exit:
    return r;
 }


--- a/ft/ft-cachetable-wrappers.h
+++ b/ft/ft-cachetable-wrappers.h
@@ -150,7 +150,6 @@ toku_pin_ftnode_batched(
    ANCESTORS ancestors,
    const PIVOT_BOUNDS pbounds,
    FTNODE_FETCH_EXTRA bfe,
-    pair_lock_type lock_type,
    bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    FTNODE *node_p,
    bool* msgs_applied

--- a/ft/ft-internal.h
+++ b/ft/ft-internal.h
@@ -727,13 +727,6 @@ STAT64INFO_S toku_get_and_clear_basement_stats(FTNODE leafnode);
 #define VERIFY_NODE(t,n) ((void)0)
 #endif

-//#define FT_TRACE
-#ifdef FT_TRACE
-#define WHEN_FTTRACE(x) x
-#else
-#define WHEN_FTTRACE(x) ((void)0)
-#endif
-
 void toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe);
 void toku_ft_status_update_flush_reason(FTNODE node, uint64_t uncompressed_bytes_flushed, uint64_t bytes_written, tokutime_t write_time, bool for_checkpoint);
 void toku_ft_status_update_serialize_times(FTNODE node, tokutime_t serialize_time, tokutime_t compress_time);
@@ -982,11 +975,11 @@ struct pivot_bounds {

 __attribute__((nonnull))
 void toku_move_ftnode_messages_to_stale(FT ft, FTNODE node);
-void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied);
+void toku_apply_ancestors_messages_to_node (FT_HANDLE t, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool* msgs_applied, int child_to_read);
 __attribute__((nonnull))
-bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path);
+bool toku_ft_leaf_needs_ancestors_messages(FT ft, FTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, MSN *const max_msn_in_path, int child_to_read);
 __attribute__((nonnull))
-void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied);
+void toku_ft_bn_update_max_msn(FTNODE node, MSN max_msn_applied, int child_to_read);

 __attribute__((const,nonnull))
 size_t toku_ft_msg_memsize_in_fifo(FT_MSG cmd);

--- a/ft/ft-ops.cc
+++ b/ft/ft-ops.cc
--- a/ft/tests/orthopush-flush.cc
+++ b/ft/tests/orthopush-flush.cc
@@ -696,7 +696,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
        struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
        const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
        bool msgs_applied;
-        toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied);
+        toku_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied, -1);

        FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                     {
@@ -921,7 +921,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
        .upper_bound_inclusive = toku_clone_dbt(&ubi, childkeys[7])
    };
    bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied, -1);

    FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                 {
@@ -1104,7 +1104,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
    struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL };
    const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL };
    bool msgs_applied;
-    toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied);
+    toku_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied, -1);

    FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh,
                 {