Commit e3a289e8 authored by Yoni Fogel's avatar Yoni Fogel

[t:4844] closes #4844 Fix deadlock issue with row locks, add minor optimizations, and cleanup

Deleted duplicate function, added some optimizations to make 'no lock conflict' faster
since it's the common case.

Cleaned up the way lock tree row lock conflicts are tested (only for the new test and one existing one)


git-svn-id: file:///svn/toku/tokudb@43303 c7de825b-a66e-492c-adef-691d508d4ae1
parent cadf4f16
...@@ -572,6 +572,13 @@ lt_rt_dominates(toku_lock_tree* tree, toku_interval* query, toku_range_tree* rt, ...@@ -572,6 +572,13 @@ lt_rt_dominates(toku_lock_tree* tree, toku_interval* query, toku_range_tree* rt,
#if TOKU_LT_USE_BORDERWRITE #if TOKU_LT_USE_BORDERWRITE
static inline bool
interval_strictly_internal(toku_interval* query, toku_interval* to) {
assert(query && to);
return (bool)(toku_lt_point_cmp(query->left, to->left) > 0 &&
toku_lt_point_cmp(query->right, to->right) < 0);
}
typedef enum {TOKU_NO_CONFLICT, TOKU_MAYBE_CONFLICT, TOKU_YES_CONFLICT} toku_conflict; typedef enum {TOKU_NO_CONFLICT, TOKU_MAYBE_CONFLICT, TOKU_YES_CONFLICT} toku_conflict;
/* /*
...@@ -580,9 +587,12 @@ typedef enum {TOKU_NO_CONFLICT, TOKU_MAYBE_CONFLICT, TOKU_YES_CONFLICT} toku_con ...@@ -580,9 +587,12 @@ typedef enum {TOKU_NO_CONFLICT, TOKU_MAYBE_CONFLICT, TOKU_YES_CONFLICT} toku_con
If >= 2 ranges overlap the query then, by definition of borderwrite, If >= 2 ranges overlap the query then, by definition of borderwrite,
at least one overlapping regions must not be 'self'. Design document at least one overlapping regions must not be 'self'. Design document
explains why this MUST cause a conflict. explains why this MUST cause a conflict.
If exactly one range overlaps and its data == self, there is no conflict. If exactly one border_range overlaps and its data == self, there is no conflict.
If exactly one range overlaps and its data != self, there might be a If exactly one border_range overlaps and its data != self:
conflict. We need to check the 'peer'write table to verify. - If the query range overlaps one of the endpoints of border_range,
there must be a conflict
- Otherwise (query range is strictly internal to border_range),
we need to check the 'peer'write table to determine if there is a conflict or not.
*/ */
static inline int static inline int
lt_borderwrite_conflict(toku_lock_tree* tree, TXNID self, lt_borderwrite_conflict(toku_lock_tree* tree, TXNID self,
...@@ -602,13 +612,28 @@ lt_borderwrite_conflict(toku_lock_tree* tree, TXNID self, ...@@ -602,13 +612,28 @@ lt_borderwrite_conflict(toku_lock_tree* tree, TXNID self,
if (r != 0) if (r != 0)
return r; return r;
assert(numfound <= query_size); assert(numfound <= query_size);
if (numfound == 2) if (numfound == 0)
*conflict = TOKU_YES_CONFLICT;
else if (numfound == 0 || !lt_txn_cmp(buf[0].data, self))
*conflict = TOKU_NO_CONFLICT; *conflict = TOKU_NO_CONFLICT;
else if (numfound == 1) {
toku_interval* border_range = &buf[0].ends;
TXNID border_txn = buf[0].data;
if (!lt_txn_cmp(border_txn, self))
*conflict = TOKU_NO_CONFLICT;
else if (interval_strictly_internal(query, border_range)) {
// Only the end-points of border_range are known to be locked.
// We need to look at the self_write tree to determine
// if there is a conflict or not.
*conflict = TOKU_MAYBE_CONFLICT;
*peer = border_txn;
}
else
*conflict = TOKU_YES_CONFLICT;
}
else { else {
*conflict = TOKU_MAYBE_CONFLICT; // query overlaps >= 2 border ranges and therefore overlaps end points
*peer = buf[0].data; // of >= 2 border_ranges with different transactions (at least one must
// conflict).
*conflict = TOKU_YES_CONFLICT;
} }
return 0; return 0;
} }
...@@ -646,35 +671,6 @@ lt_meets(toku_lock_tree* tree, toku_interval* query, toku_range_tree* rt, bool* ...@@ -646,35 +671,6 @@ lt_meets(toku_lock_tree* tree, toku_interval* query, toku_range_tree* rt, bool*
return 0; return 0;
} }
/*
Determines whether 'query' meets 'rt' at txn2 not equal to txn.
This function supports all range trees, but queries must either be a single point,
or the range tree is homogenous.
Uses the standard definition of 'query' meets 'tree' at 'data' from the
design document.
*/
static inline int
lt_meets_peer(toku_lock_tree* tree, toku_interval* query,
toku_range_tree* rt, bool is_homogenous,
TXNID self, bool* met) {
assert(tree && query && rt && met);
assert(query->left == query->right || is_homogenous);
const uint32_t query_size = is_homogenous ? 1 : 2;
toku_range buffer[2];
uint32_t buflen = query_size;
toku_range* buf = &buffer[0];
uint32_t numfound;
int r;
r = toku_rt_find(rt, query, query_size, &buf, &buflen, &numfound);
if (r != 0)
return r;
assert(numfound <= query_size);
*met = (bool) (numfound == 2 || (numfound == 1 && lt_txn_cmp(buf[0].data, self)));
return 0;
}
/* Checks for if a write range conflicts with reads. /* Checks for if a write range conflicts with reads.
Supports ranges. */ Supports ranges. */
static inline int static inline int
...@@ -686,7 +682,7 @@ lt_write_range_conflicts_reads(toku_lock_tree* tree, TXNID txn, toku_interval* q ...@@ -686,7 +682,7 @@ lt_write_range_conflicts_reads(toku_lock_tree* tree, TXNID txn, toku_interval* q
while ((forest = toku_rth_next(tree->rth)) != NULL) { while ((forest = toku_rth_next(tree->rth)) != NULL) {
if (forest->self_read != NULL && lt_txn_cmp(forest->hash_key, txn)) { if (forest->self_read != NULL && lt_txn_cmp(forest->hash_key, txn)) {
r = lt_meets_peer(tree, query, forest->self_read, TRUE, txn, &met); r = lt_meets(tree, query, forest->self_read, &met);
if (r != 0) if (r != 0)
goto cleanup; goto cleanup;
if (met) { if (met) {
...@@ -710,7 +706,7 @@ lt_write_range_conflicts_writes(toku_lock_tree* tree, TXNID txn, toku_interval* ...@@ -710,7 +706,7 @@ lt_write_range_conflicts_writes(toku_lock_tree* tree, TXNID txn, toku_interval*
while ((forest = toku_rth_next(tree->rth)) != NULL) { while ((forest = toku_rth_next(tree->rth)) != NULL) {
if (forest->self_write != NULL && lt_txn_cmp(forest->hash_key, txn)) { if (forest->self_write != NULL && lt_txn_cmp(forest->hash_key, txn)) {
r = lt_meets_peer(tree, query, forest->self_write, TRUE, txn, &met); r = lt_meets(tree, query, forest->self_write, &met);
if (r != 0) if (r != 0)
goto cleanup; goto cleanup;
if (met) { if (met) {
...@@ -752,10 +748,10 @@ lt_check_borderwrite_conflict(toku_lock_tree* tree, TXNID txn, toku_interval* qu ...@@ -752,10 +748,10 @@ lt_check_borderwrite_conflict(toku_lock_tree* tree, TXNID txn, toku_interval* qu
return r; return r;
conflict = met ? TOKU_YES_CONFLICT : TOKU_NO_CONFLICT; conflict = met ? TOKU_YES_CONFLICT : TOKU_NO_CONFLICT;
} }
if (conflict == TOKU_YES_CONFLICT) if (conflict == TOKU_NO_CONFLICT)
return DB_LOCK_NOTGRANTED; return 0;
assert(conflict == TOKU_NO_CONFLICT); assert(conflict == TOKU_YES_CONFLICT);
return 0; return DB_LOCK_NOTGRANTED;
#else #else
int r = lt_write_range_conflicts_writes(tree, txn, query); int r = lt_write_range_conflicts_writes(tree, txn, query);
return r; return r;
...@@ -2551,7 +2547,8 @@ find_read_conflicts(toku_lock_tree *tree, toku_interval *query, TXNID id, txnid_ ...@@ -2551,7 +2547,8 @@ find_read_conflicts(toku_lock_tree *tree, toku_interval *query, TXNID id, txnid_
while ((forest = toku_rth_next(tree->rth)) != NULL) { while ((forest = toku_rth_next(tree->rth)) != NULL) {
if (forest->self_read != NULL && lt_txn_cmp(forest->hash_key, id)) { if (forest->self_read != NULL && lt_txn_cmp(forest->hash_key, id)) {
numfound = 0; numfound = 0;
int r = toku_rt_find(forest->self_read, query, 0, range_ptr, n_expected_ranges_ptr, &numfound); // All ranges in a self_read tree have the same txn
int r = toku_rt_find(forest->self_read, query, 1, range_ptr, n_expected_ranges_ptr, &numfound);
if (r == 0) if (r == 0)
add_conflicts(conflicts, *range_ptr, numfound, id); add_conflicts(conflicts, *range_ptr, numfound, id);
} }
...@@ -2585,11 +2582,28 @@ toku_lt_get_lock_request_conflicts(toku_lock_tree *tree, toku_lock_request *lock ...@@ -2585,11 +2582,28 @@ toku_lt_get_lock_request_conflicts(toku_lock_tree *tree, toku_lock_request *lock
uint32_t numfound = 0; uint32_t numfound = 0;
r = toku_rt_find(tree->borderwrite, &query, 0, &ranges, &n_expected_ranges, &numfound); r = toku_rt_find(tree->borderwrite, &query, 0, &ranges, &n_expected_ranges, &numfound);
if (r == 0) { if (r == 0) {
for (uint32_t i = 0; i < numfound; i++) bool false_positive = false;
if (ranges[i].data != lock_request->txnid) if (numfound == 1 && interval_strictly_internal(&query, &ranges[0].ends)) {
txnid_set_add(conflicts, ranges[i].data); toku_range_tree* peer_selfwrite = toku_lt_ifexist_selfwrite(tree, ranges[0].data);
if (!peer_selfwrite) {
r = lt_panic(tree, TOKU_LT_INCONSISTENT);
goto cleanup;
}
bool met;
r = lt_meets(tree, &query, peer_selfwrite, &met);
if (r != 0)
goto cleanup;
false_positive = !met;
}
if (!false_positive) {
for (uint32_t i = 0; i < numfound; i++)
if (ranges[i].data != lock_request->txnid)
txnid_set_add(conflicts, ranges[i].data);
}
} }
cleanup:
if (ranges) if (ranges)
toku_free(ranges); toku_free(ranges);
......
...@@ -102,3 +102,79 @@ static inline void init_point(toku_point* point, toku_lock_tree* tree) { ...@@ -102,3 +102,79 @@ static inline void init_point(toku_point* point, toku_lock_tree* tree) {
point->lt = tree; point->lt = tree;
} }
// Declare and initialize a point read-lock request named TXN_r_KEY for
// transaction txn_TXN on key_KEY (left == right == &key_KEY).
// Deliberately NOT wrapped in do { } while (0): the macro must declare a
// toku_lock_request object that stays in scope for later use.
// NOTE: the trailing semicolon after the init call was dropped so that both
// macros are consistent and call sites ending in ';' do not expand to ';;'.
#define READ_REQUEST(TXN, KEY) \
    toku_lock_request TXN ## _r_ ## KEY; \
    toku_lock_request_init(&TXN ## _r_ ## KEY, txn_ ## TXN, &key_ ## KEY, &key_ ## KEY, LOCK_REQUEST_READ)

// Same as READ_REQUEST, but declares a point write-lock request named TXN_w_KEY.
#define WRITE_REQUEST(TXN, KEY) \
    toku_lock_request TXN ## _w_ ## KEY; \
    toku_lock_request_init(&TXN ## _w_ ## KEY, txn_ ## TXN, &key_ ## KEY, &key_ ## KEY, LOCK_REQUEST_WRITE)
// Assert that the txnids in 'txns' appear in strictly increasing order
// (no duplicates allowed).
static inline void
verify_txnid_set_sorted(txnid_set *txns) {
    size_t count = txnid_set_size(txns);
    if (count < 2)
        return;
    TXNID prev = txnid_set_get(txns, 0);
    for (size_t idx = 1; idx < count; idx++) {
        TXNID cur = txnid_set_get(txns, idx);
        assert(prev < cur);
        prev = cur;
    }
}
// Check that 'request' completed successfully (COMPLETE state, zero result),
// confirm the lock tree reports no remaining conflicts for it, then destroy
// the request.
static inline void
verify_and_clean_finished_request(toku_lock_tree *lt, toku_lock_request *request) {
    assert(request->state == LOCK_REQUEST_COMPLETE);
    assert(request->complete_r == 0);

    txnid_set conflict_set;
    txnid_set_init(&conflict_set);
    int rc = toku_lt_get_lock_request_conflicts(lt, request, &conflict_set);
    assert(rc == 0);
    // A granted request must have an empty conflict set.
    assert(txnid_set_size(&conflict_set) == 0);
    txnid_set_destroy(&conflict_set);

    toku_lock_request_destroy(request);
}
// Start 'request' (non-blocking) and require that it is granted immediately
// with no conflicts; the request is destroyed before returning.
static inline void
do_request_and_succeed(toku_lock_tree *lt, toku_lock_request *request) {
    int rc = toku_lock_request_start(request, lt, false);
    CKERR(rc);
    verify_and_clean_finished_request(lt, request);
}
// Assert that 'request' is still pending and that the lock tree reports
// exactly the expected conflicting transactions, in sorted order.
// conflicting_txns must hold num_conflicts txnids sorted ascending.
static inline void
request_still_blocked(
    toku_lock_tree *lt,
    toku_lock_request *request,
    size_t num_conflicts,
    TXNID conflicting_txns[num_conflicts]) {
    assert(request->state == LOCK_REQUEST_PENDING);

    txnid_set conflict_set;
    txnid_set_init(&conflict_set);
    int rc = toku_lt_get_lock_request_conflicts(lt, request, &conflict_set);
    CKERR(rc);
    assert(txnid_set_size(&conflict_set) == num_conflicts);
    verify_txnid_set_sorted(&conflict_set);
    // The reported set must match the expected txnids element for element.
    for (size_t idx = 0; idx < num_conflicts; idx++)
        assert(txnid_set_get(&conflict_set, idx) == conflicting_txns[idx]);
    txnid_set_destroy(&conflict_set);
}
// Start 'request' (non-blocking), require that it is refused with
// DB_LOCK_NOTGRANTED, and verify that exactly the expected conflicting
// transactions are reported.
// num_conflicts is size_t (was int) for consistency with
// request_still_blocked() and to avoid an implicit signed->unsigned
// conversion when it is forwarded as that function's VLA bound.
static inline void
do_request_that_blocks(
    toku_lock_tree *lt,
    toku_lock_request *request,
    size_t num_conflicts,
    TXNID conflicting_txns[num_conflicts]) {
    int r;
    r = toku_lock_request_start(request, lt, false);
    CKERR2(r, DB_LOCK_NOTGRANTED);
    request_still_blocked(lt, request, num_conflicts, conflicting_txns);
}
// See #4844
//
// T(A) gets R(1)
// T(B) gets W(3)
// T(B) gets W(7)
// T(C) gets R(5)
// T(A) tries W(5) blocked
// T(A) gets conflicts { C }
// T(B) tries W(1) blocked
// T(B) gets conflicts { A }
// T(C) releases locks
// T(A) gets W(5)
// T(A) releases locks
// T(B) gets W(1)
#include "test.h"
int main(int argc, const char *argv[]) {
int r;
uint32_t max_locks = 4;
uint64_t max_lock_memory = 4096;
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
verbose++;
continue;
}
if (strcmp(argv[i], "-q") == 0 || strcmp(argv[i], "--quiet") == 0) {
if (verbose > 0) verbose--;
continue;
}
if (strcmp(argv[i], "--max_locks") == 0 && i+1 < argc) {
max_locks = atoi(argv[++i]);
continue;
}
if (strcmp(argv[i], "--max_lock_memory") == 0 && i+1 < argc) {
max_lock_memory = atoi(argv[++i]);
continue;
}
assert(0);
}
// setup
toku_ltm *ltm = NULL;
r = toku_ltm_create(&ltm, max_locks, max_lock_memory, dbpanic);
assert(r == 0 && ltm);
toku_lock_tree *lt = NULL;
r = toku_ltm_get_lt(ltm, &lt, (DICTIONARY_ID){1}, NULL, dbcmp);
assert(r == 0 && lt);
const TXNID txn_a = 1;
const TXNID txn_b = 2;
const TXNID txn_c = 3;
DBT key_1; dbt_init(&key_1, "1", 1);
DBT key_3; dbt_init(&key_3, "3", 1);
DBT key_5; dbt_init(&key_5, "5", 1);
DBT key_7; dbt_init(&key_7, "7", 1);
READ_REQUEST(a, 1);
WRITE_REQUEST(b, 3);
WRITE_REQUEST(b, 7);
READ_REQUEST(c, 5);
WRITE_REQUEST(a, 5);
WRITE_REQUEST(b, 1);
do_request_and_succeed(lt, &a_r_1);
do_request_and_succeed(lt, &b_w_3);
do_request_and_succeed(lt, &b_w_7);
do_request_and_succeed(lt, &c_r_5);
do_request_that_blocks(lt, &a_w_5, 1, (TXNID[]){ txn_c });
do_request_that_blocks(lt, &b_w_1, 1, (TXNID[]){ txn_a });
r = toku_lt_unlock_txn(lt, txn_c);
CKERR(r);
verify_and_clean_finished_request(lt, &a_w_5);
r = toku_lt_unlock_txn(lt, txn_a);
CKERR(r);
verify_and_clean_finished_request(lt, &b_w_1);
r = toku_lt_unlock_txn(lt, txn_b);
CKERR(r);
// shutdown
toku_lt_remove_db_ref(lt);
r = toku_ltm_close(ltm); assert(r == 0);
return 0;
}
// T(A) gets R(TABLE) // T(A) gets R(TABLE)
// T(B) gets R(TABLE) // T(B) gets R(L)
// T(C) trys W(L) blocked // T(C) trys W(L) blocked
// T(C) gets conflicts { A, B } // T(C) gets conflicts { A, B }
// T(A) releases locks // T(A) releases locks
...@@ -57,52 +57,24 @@ int main(int argc, const char *argv[]) { ...@@ -57,52 +57,24 @@ int main(int argc, const char *argv[]) {
const TXNID txn_a = 1; const TXNID txn_a = 1;
toku_lock_request a_r_t; toku_lock_request_init(&a_r_t, txn_a, toku_lt_neg_infinity, toku_lt_infinity, LOCK_REQUEST_READ); toku_lock_request a_r_t; toku_lock_request_init(&a_r_t, txn_a, toku_lt_neg_infinity, toku_lt_infinity, LOCK_REQUEST_READ);
r = toku_lock_request_start(&a_r_t, lt, false); assert(r == 0);
assert(a_r_t.state == LOCK_REQUEST_COMPLETE && a_r_t.complete_r == 0); do_request_and_succeed(lt, &a_r_t);
txnid_set_init(&conflicts);
r = toku_lt_get_lock_request_conflicts(lt, &a_r_t, &conflicts);
assert(r == 0);
assert(txnid_set_size(&conflicts) == 0);
txnid_set_destroy(&conflicts);
toku_lock_request_destroy(&a_r_t);
const TXNID txn_b = 2; const TXNID txn_b = 2;
toku_lock_request b_r_l; toku_lock_request_init(&b_r_l, txn_b, &key_l, &key_l, LOCK_REQUEST_READ); READ_REQUEST(b, l);
r = toku_lock_request_start(&b_r_l, lt, false); assert(r == 0); do_request_and_succeed(lt, &b_r_l);
assert(b_r_l.state == LOCK_REQUEST_COMPLETE && b_r_l.complete_r == 0);
txnid_set_init(&conflicts);
r = toku_lt_get_lock_request_conflicts(lt, &b_r_l, &conflicts);
assert(r == 0);
assert(txnid_set_size(&conflicts) == 0);
txnid_set_destroy(&conflicts);
toku_lock_request_destroy(&b_r_l);
const TXNID txn_c = 3; const TXNID txn_c = 3;
toku_lock_request c_w_l; toku_lock_request_init(&c_w_l, txn_c, &key_l, &key_l, LOCK_REQUEST_WRITE); WRITE_REQUEST(c, l);
r = toku_lock_request_start(&c_w_l, lt, false); assert(r != 0); do_request_that_blocks(lt, &c_w_l, 2, (TXNID[]){ txn_a, txn_b });
assert(c_w_l.state == LOCK_REQUEST_PENDING);
txnid_set_init(&conflicts);
r = toku_lt_get_lock_request_conflicts(lt, &c_w_l, &conflicts);
assert(r == 0);
assert(txnid_set_size(&conflicts) == 2);
sortit(&conflicts);
assert(txnid_set_get(&conflicts, 0) == txn_a);
assert(txnid_set_get(&conflicts, 1) == txn_b);
txnid_set_destroy(&conflicts);
r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0);
assert(c_w_l.state == LOCK_REQUEST_PENDING); request_still_blocked(lt, &c_w_l, 1, (TXNID[]){ txn_b });
txnid_set_init(&conflicts);
r = toku_lt_get_lock_request_conflicts(lt, &c_w_l, &conflicts);
assert(r == 0);
assert(txnid_set_size(&conflicts) == 1);
assert(txnid_set_get(&conflicts, 0) == txn_b);
txnid_set_destroy(&conflicts);
r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0);
assert(c_w_l.state == LOCK_REQUEST_COMPLETE && c_w_l.complete_r == 0);
toku_lock_request_destroy(&c_w_l); verify_and_clean_finished_request(lt, &c_w_l);
r = toku_lt_unlock_txn(lt, txn_c); assert(r == 0); r = toku_lt_unlock_txn(lt, txn_c); assert(r == 0);
// shutdown // shutdown
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment