[t:5097], add fix and test for table corruption issue

git-svn-id: file:///svn/toku/tokudb@44705 c7de825b-a66e-492c-adef-691d508d4ae1

[t:5097], add fix and test for table corruption issue
git-svn-id: file:///svn/toku/tokudb@44705 c7de825b-a66e-492c-adef-691d508d4ae1
25102f30 · Zardosht Kasheff · Yoni Fogel · 65558fb2 · 25102f30 · 25102f30
Commit 25102f30 authored Jun 20, 2012 by Zardosht Kasheff Committed by Yoni Fogel Apr 17, 2013
Show whitespace changes
Inline Side-by-side

Showing with 179 additions and 21 deletions

ft/cachetable.c ft/cachetable.c +43 -21

ft/tests/cachetable-5097.c ft/tests/cachetable-5097.c +136 -0

No files found.
--- a/ft/cachetable.c
+++ b/ft/cachetable.c
@@ -2669,6 +2669,11 @@ static void assert_cachefile_is_flushed_and_removed (CACHETABLE ct, CACHEFILE cf
 // trying to access the cachefile while this function is executing.
 // This implies no client thread will be trying to lock any nodes
 // belonging to the cachefile.
+//
+// This function also assumes that the cachefile is not in the process
+// of being used by a checkpoint. If a checkpoint is currently happening,
+// it does NOT include this cachefile.
+//
 static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) {
    unsigned nfound = 0;
    //
@@ -2695,8 +2700,9 @@ static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) {

    // find all of the pairs owned by a cachefile and redirect their completion
    // to a completion queue.  If an unlocked PAIR is dirty, flush and remove 
-    // the PAIR. Locked PAIRs are on either a reader/writer thread (or maybe a
-    // checkpoint thread?) and therefore will be placed on the completion queue.
+    // the PAIR. Locked PAIRs are on either a reader/writer thread 
+    // and therefore will be placed on the completion queue.
+    //
    // The assumptions above lead to this reasoning. All pairs belonging to
    // this cachefile are either:
    //  - unlocked
@@ -2709,11 +2715,6 @@ static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) {
    //    when the function started).
    //  - Once the writer thread is done with a PAIR, remove it
    //
-    // A question from Zardosht: Is it possible for a checkpoint thread
-    // to be running, and also trying to get access to a PAIR while 
-    // the PAIR is on the writer thread? Will this cause problems?
-    // This question is encapsulated in #3941
-    //
    unsigned i;

    unsigned num_pairs = 0;
@@ -2774,16 +2775,6 @@ static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) {

    // wait for all of the pairs in the work queue to complete
    //
-    // An important assumption here is that none of the PAIRs that we
-    // pop off  the work queue need to be written out to disk. So, it is
-    // safe to simply call cachetable_maybe_remove_and_free_pair on 
-    // the PAIRs we find. The reason we can make this assumption
-    // is based on the assumption that upon entry of this function,
-    // all PAIRs belonging to this cachefile are either idle,
-    // being processed by a writer thread, or being processed by a kibbutz. 
-    // At this point in the code, kibbutz work is finished and we 
-    // assume the client will not add any more kibbutz work for this cachefile.
-    // 
    // If it were possible
    // for some thread to change the state of the node before passing
    // it off here, and a write to disk were necessary, then the code
@@ -2795,13 +2786,44 @@ static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) {
        //This workqueue's mutex is NOT the cachetable lock.
        //You must not be holding the cachetable lock during the dequeue.
        int r = workqueue_deq(&cq, &wi, 1); assert(r == 0);
+        //Some other thread owned the lock, but transferred ownership to the thread executing this function
        cachetable_lock(ct);
        PAIR p = workitem_arg(wi);
        p->cq = 0;
-        //Some other thread owned the lock, but transferred ownership to the thread executing this function
+        // check some assertions.
+        // A checkpoint should not be running on this cachefile, so
+        // the checkpoint_pending bit must be FALSE and
+        // no other thread should be accessing this PAIR
+        assert(!p->checkpoint_pending);
+        // we are only thread using the PAIR
+        assert(nb_mutex_users(&p->value_nb_mutex) == 1);
+        assert(nb_mutex_users(&p->disk_nb_mutex) == 0);
+        assert(!p->cloned_value_data);
+
+        // first we remove the PAIR from the cachetable's linked lists
+        // and hashtable, so we guarantee that no other thread can access
+        // this PAIR if we release the cachetable lock        
+        cachetable_remove_pair(ct, p);
+        // 
+        // #5097 found a bug where another thread had a dirty PAIR pinned
+        // and was trying to run partial eviction. So, when the ownership
+        // of the lock is transferred here, the PAIR may still be dirty.
+        // If so, we need to write it to disk.
+        //
+        if (p->dirty) {
+            PAIR_ATTR attr;
+            cachetable_only_write_locked_data(
+                ct,
+                p,
+                FALSE, // not for a checkpoint, as we assert above
+                &attr,
+                FALSE // not a clone
+                );
+        }
+        // now that we are assured that the PAIR has been written to disk,
+        // we free the PAIR
        nb_mutex_unlock(&p->value_nb_mutex);  //Release the lock, no one has a pin, per our assumptions above.        
-        BOOL destroyed;
-        cachetable_maybe_remove_and_free_pair(ct, p, &destroyed);
+        cachetable_free_pair(ct, p);
    }
    workqueue_destroy(&cq);
    if (cf) {

--- a/ft/tests/cachetable-5097.c
+++ b/ft/tests/cachetable-5097.c
+/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id: cachetable-simple-verify.c 43762 2012-05-22 16:17:53Z yfogel $"
+#ident "Copyright (c) 2007-2011 Tokutek Inc.  All rights reserved."
+#include "includes.h"
+#include "test.h"
+
+CACHEFILE f1;
+CACHEFILE f2;
+
+BOOL check_flush;
+BOOL dirty_flush_called;
+BOOL check_pe_callback;
+BOOL pe_callback_called;
+
+static int 
+pe_callback (
+    void *ftnode_pv __attribute__((__unused__)), 
+    PAIR_ATTR bytes_to_free __attribute__((__unused__)), 
+    PAIR_ATTR* bytes_freed, 
+    void* extraargs __attribute__((__unused__))
+    ) 
+{
+    *bytes_freed = make_pair_attr(1);
+    if (check_pe_callback) {
+        pe_callback_called = TRUE;
+    }
+    usleep(4*1024*1024);
+    return 0;
+}
+
+static void
+flush (CACHEFILE f __attribute__((__unused__)),
+       int UU(fd),
+       CACHEKEY k  __attribute__((__unused__)),
+       void *v     __attribute__((__unused__)),
+       void **dd     __attribute__((__unused__)),
+       void *e     __attribute__((__unused__)),
+       PAIR_ATTR s      __attribute__((__unused__)),
+       PAIR_ATTR* new_size      __attribute__((__unused__)),
+       BOOL w      __attribute__((__unused__)),
+       BOOL keep   __attribute__((__unused__)),
+       BOOL c      __attribute__((__unused__)),
+       BOOL UU(is_clone)
+       ) {
+    if (check_flush && w) {
+        dirty_flush_called = TRUE;
+    }
+}
+
+static void *f2_pin(void *arg) {
+    int r;    
+    void* v1;
+    long s1;
+    CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
+    //
+    // these booleans for pe_callback just ensure that the
+    // test is working as we expect it to. We expect the get_and_pin to 
+    // cause a partial eviction of f1's PAIR, reducing its size from 8 to 1
+    // and we expect that to be enough so that the unpin does not invoke a partial eviction
+    // This is just to ensure that the bug is being exercised
+    //
+    check_pe_callback = TRUE;
+    r = toku_cachetable_get_and_pin(f2, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL);
+    assert(r == 0);
+    assert(pe_callback_called);
+    pe_callback_called = FALSE;
+    r = toku_cachetable_unpin(f2, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
+    check_pe_callback = FALSE;
+    assert(!pe_callback_called);
+    assert(r == 0);
+    
+    return arg;
+}
+
+static void
+cachetable_test (void) {
+    const int test_limit = 12;
+    int r;
+    check_flush = FALSE;
+    dirty_flush_called = FALSE;
+    
+    CACHETABLE ct;
+    r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0);
+    char fname1[] = __SRCFILE__ "test1.dat";
+    unlink(fname1);
+    char fname2[] = __SRCFILE__ "test2.dat";
+    unlink(fname2);
+
+    r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); 
+    assert(r == 0);
+    r = toku_cachetable_openf(&f2, ct, fname2, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); 
+    assert(r == 0);
+
+    void* v1;
+    long s1;
+    CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
+    wc.pe_callback = pe_callback;
+    wc.flush_callback = flush;
+    // pin and unpin a node 20 times, just to get clock count up
+    for (int i = 0; i < 20; i++) {
+        r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, TRUE, NULL);
+        assert(r == 0);
+        r = toku_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8));
+        assert(r == 0);
+    }
+
+    // at this point, we have a dirty PAIR in the cachetable associated with cachefile f1
+    // launch a thread that will put another PAIR in the cachetable, and get partial eviction started
+    toku_pthread_t tid;
+    r = toku_pthread_create(&tid, NULL, f2_pin, NULL); 
+    assert_zero(r);
+
+    usleep(2*1024*1024);
+    check_flush = TRUE;
+    r = toku_cachefile_close(&f1, 0, FALSE, ZERO_LSN); 
+    assert(r == 0);
+    assert(dirty_flush_called);
+    check_flush = FALSE;
+
+    void *ret;
+    r = toku_pthread_join(tid, &ret); 
+    assert_zero(r);
+
+
+    toku_cachetable_verify(ct);
+    r = toku_cachefile_close(&f2, 0, FALSE, ZERO_LSN); assert(r == 0);
+    r = toku_cachetable_close(&ct); lazy_assert_zero(r);
+}
+
+int
+test_main(int argc, const char *argv[]) {
+  default_parse_args(argc, argv);
+  cachetable_test();
+  return 0;
+}