[t:2494], merge read committed to main

git-svn-id: file:///svn/toku/tokudb@19073 c7de825b-a66e-492c-adef-691d508d4ae1

[t:2494], merge read committed to main
git-svn-id: file:///svn/toku/tokudb@19073 c7de825b-a66e-492c-adef-691d508d4ae1
a87006b1 · Zardosht Kasheff · Yoni Fogel · 95035e9d · a87006b1 · a87006b1
Commit a87006b1 authored Apr 16, 2013 by Zardosht Kasheff Committed by Yoni Fogel Apr 16, 2013
21 changed files
--- a/buildheader/db.h_4_4
+++ b/buildheader/db.h_4_4
@@ -189,6 +189,7 @@ typedef enum {
 #define DB_TXN_NOWAIT 8192
 #define DB_TXN_SYNC 16384
 #define DB_READ_UNCOMMITTED 67108864
+#define DB_READ_COMMITTED 33554432
 #define DB_INHERIT_ISOLATION 1
 #endif
 /* TOKUDB specific error codes */

--- a/buildheader/db.h_4_5
+++ b/buildheader/db.h_4_5
@@ -189,6 +189,7 @@ typedef enum {
 #define DB_TXN_NOWAIT 16384
 #define DB_TXN_SYNC 32768
 #define DB_READ_UNCOMMITTED 134217728
+#define DB_READ_COMMITTED 67108864
 #define DB_INHERIT_ISOLATION 1
 #endif
 /* TOKUDB specific error codes */

--- a/buildheader/db.h_4_6
+++ b/buildheader/db.h_4_6
@@ -191,6 +191,7 @@ typedef enum {
 #define DB_TXN_NOWAIT 1024
 #define DB_TXN_SYNC 16384
 #define DB_READ_UNCOMMITTED 134217728
+#define DB_READ_COMMITTED 67108864
 #define DB_INHERIT_ISOLATION 1
 #endif
 /* TOKUDB specific error codes */

--- a/buildheader/make_db_h.c
+++ b/buildheader/make_db_h.c
@@ -177,6 +177,9 @@ static void print_defines (void) {
        dodefine_track(txn_flags, DB_TXN_SYNC);
 #ifdef DB_READ_UNCOMMITTED
        dodefine_track(txn_flags, DB_READ_UNCOMMITTED);
+#endif
+#ifdef DB_READ_COMMITTED
+        dodefine_track(txn_flags, DB_READ_COMMITTED);
 #endif
        dodefine_from_track(txn_flags, DB_INHERIT_ISOLATION);
    }

--- a/buildheader/tdb.h
+++ b/buildheader/tdb.h
@@ -191,6 +191,7 @@ typedef enum {
 #define DB_TXN_NOWAIT 1024
 #define DB_TXN_SYNC 16384
 #define DB_READ_UNCOMMITTED 134217728
+#define DB_READ_COMMITTED 67108864
 #define DB_INHERIT_ISOLATION 1
 #endif
 /* TOKUDB specific error codes */

--- a/include/db.h
+++ b/include/db.h
@@ -191,6 +191,7 @@ typedef enum {
 #define DB_TXN_NOWAIT 1024
 #define DB_TXN_SYNC 16384
 #define DB_READ_UNCOMMITTED 134217728
+#define DB_READ_COMMITTED 67108864
 #define DB_INHERIT_ISOLATION 1
 #endif
 /* TOKUDB specific error codes */

--- a/newbrt/brt-internal.h
+++ b/newbrt/brt-internal.h
@@ -289,6 +289,9 @@ struct brt_cursor {
    OMTCURSOR omtcursor;
    u_int64_t  root_put_counter; // what was the count on the BRT when we validated the cursor?
    TXNID      oldest_living_xid;// what was the oldest live txnid when we created the cursor?
+    TOKULOGGER logger; // to give access to list of live transactions, needed for read_committed queries
+    TXNID ancestor_id; // txnid of ancestor, needed for read_committed queries
+    BOOL is_read_committed; // true if query is read_committed, false otherwise
    struct brt_cursor_leaf_info  leaf_info;
 };


--- a/newbrt/brt.c
+++ b/newbrt/brt.c
@@ -4034,15 +4034,61 @@ brt_cursor_cleanup_dbts(BRT_CURSOR c) {
    }
 }

+// Pick the (key, val) pair that a read through `cursor` should see in `le`.
+// A cursor that is not read_committed always gets the latest (innermost) pair.
+// A read_committed cursor also gets the latest pair, except when the outermost
+// uncommitted xid of `le` is a live transaction other than cursor->ancestor_id;
+// it then gets the outermost (committed) pair.
+// The oldest-living bound is cursor->oldest_living_xid, exactly as in
+// is_le_val_empty(), so both helpers classify a leafentry from the same data.
+static inline void brt_cursor_extract_key_and_val(
+    LEAFENTRY le, BRT_CURSOR cursor,
+    u_int32_t* keylen, bytevec* key, u_int32_t* vallen, bytevec* val)
+{
+    BOOL use_latest = !cursor->is_read_committed;
+    if (!use_latest) {
+        TXNID le_anc_id = le_outermost_uncommitted_xid(le);
+        use_latest =
+            le_anc_id < cursor->oldest_living_xid || // below the oldest living xid recorded at cursor creation
+            le_anc_id == 0 || // le is a committed value with no provisional data
+            le_anc_id == cursor->ancestor_id || // quick check to avoid more expensive is_txnid_live check
+            !is_txnid_live(cursor->logger, le_anc_id);
+    }
+    if (use_latest) {
+        *key = le_latest_key_and_len(le, keylen);
+        *val = le_latest_val_and_len(le, vallen);
+    }
+    else {
+        *key = le_outermost_key_and_len(le, keylen);
+        *val = le_outermost_val_and_len(le, vallen);
+    }
+}
+
+
 static inline void load_dbts_from_omt(BRT_CURSOR c, DBT *key, DBT *val) {
    OMTVALUE le = 0;
    int r = toku_omt_cursor_current(c->omtcursor, &le);
    assert(r==0);
+    u_int32_t keylen;
+    bytevec   key_vec    = NULL;
+    u_int32_t vallen;
+    bytevec   val_vec    = NULL;
+    
+    brt_cursor_extract_key_and_val(
+        le,
+        c,
+        &keylen,
+        &key_vec,
+        &vallen,
+        &val_vec
+        );
    if (key) {
-        key->data = le_latest_key_and_len(le, &key->size);
+        key->data = (void *)key_vec;
+        key->size = keylen;
    }
    if (val) {
-        val->data = le_latest_val_and_len(le, &val->size);
+        val->data = (void *)val_vec;
+        val->size = vallen;
    }
 }

@@ -4083,8 +4129,13 @@ brt_cursor_invalidate(BRT_CURSOR brtcursor) {
    }
 }

-int toku_brt_cursor (BRT brt, BRT_CURSOR *cursorptr, TOKULOGGER logger) {
+int toku_brt_cursor (BRT brt, BRT_CURSOR *cursorptr, TOKULOGGER logger, TXNID txnid, BOOL is_read_committed) {
    BRT_CURSOR cursor = toku_malloc(sizeof *cursor);
+    // if this cursor is to do read_committed fetches, then the txn objects must be valid.
+    if (is_read_committed) {
+        assert(logger != NULL);
+        assert(txnid != TXNID_NONE);
+    }
    if (cursor == 0)
        return ENOMEM;
    memset(cursor, 0, sizeof(*cursor));
@@ -4092,6 +4143,9 @@ int toku_brt_cursor (BRT brt, BRT_CURSOR *cursorptr, TOKULOGGER logger) {
    cursor->current_in_omt = FALSE;
    cursor->prefetching = FALSE;
    cursor->oldest_living_xid = toku_logger_get_oldest_living_xid(logger);
+    cursor->logger = logger;
+    cursor->ancestor_id = txnid;
+    cursor->is_read_committed = is_read_committed;
    toku_list_push(&brt->cursors, &cursor->cursors_link);
    int r = toku_omt_cursor_create(&cursor->omtcursor);
    assert(r==0);
@@ -4199,6 +4253,37 @@ brt_cursor_update(BRT_CURSOR brtcursor) {
    toku_omt_cursor_set_index(omtcursor, brtcursor->leaf_info.to_be.index);
 }

+// Decide whether the value a query on brtcursor would read from le is absent.
+//  - Cursor not read_committed: the innermost value decides (le_is_provdel).
+//  - Cursor read_committed: the innermost value still decides unless the
+//    outermost uncommitted xid of le is a live transaction other than
+//    brtcursor->ancestor_id; then the outermost (committed) value decides.
+// Returns nonzero when the chosen value is a delete.
+static inline int
+is_le_val_empty(LEAFENTRY le, BRT_CURSOR brtcursor) {
+    // Without read_committed there is nothing to choose between.
+    if (!brtcursor->is_read_committed) {
+        return le_is_provdel(le);
+    }
+    TXNID outer_xid = le_outermost_uncommitted_xid(le);
+    // Cheap tests first; is_txnid_live runs only when all of them fail.
+    if (outer_xid < brtcursor->oldest_living_xid) { // below the oldest living xid captured at cursor creation
+        return le_is_provdel(le);
+    }
+    if (outer_xid == 0) { // le is a committed value with no provisional data
+        return le_is_provdel(le);
+    }
+    if (outer_xid == brtcursor->ancestor_id) { // matches the cursor's own ancestor txnid
+        return le_is_provdel(le);
+    }
+    if (!is_txnid_live(brtcursor->logger, outer_xid)) {
+        return le_is_provdel(le);
+    }
+    // outer_xid belongs to an active transaction: inspecting the committed
+    // value requires unpacking le.
+    return le_outermost_is_del(le);
+}
+
 // This is a bottom layer of the search functions.
 static int
 brt_search_leaf_node(BRTNODE node, brt_search_t *search, BRT_GET_STRADDLE_CALLBACK_FUNCTION getf, void *getf_v, enum reactivity *re, BOOL *doprefetch, BRT_CURSOR brtcursor)
@@ -4224,7 +4309,7 @@ brt_search_leaf_node(BRTNODE node, brt_search_t *search, BRT_GET_STRADDLE_CALLBA
    if (r!=0) return r;

    LEAFENTRY le = datav;
-    if (le_is_provdel(le)) {
+    if (is_le_val_empty(le,brtcursor)) {
        // Provisionally deleted stuff is gone.
        // So we need to scan in the direction to see if we can find something
        while (1) {
@@ -4249,7 +4334,7 @@ brt_search_leaf_node(BRTNODE node, brt_search_t *search, BRT_GET_STRADDLE_CALLBA
            r = toku_omt_fetch(node->u.l.buffer, idx, &datav, NULL);
            assert(r==0); // we just validated the index
            le = datav;
-            if (!le_is_provdel(le)) goto got_a_good_value;
+            if (!is_le_val_empty(le,brtcursor)) goto got_a_good_value;
        }
    }
 got_a_good_value:
@@ -4258,9 +4343,18 @@ got_a_good_value:
    maybe_do_implicit_promotion_on_query(brtcursor, le);
    {
        u_int32_t keylen;
-        bytevec   key    = le_latest_key_and_len(le, &keylen);
+        bytevec   key    = NULL;
        u_int32_t vallen;
-        bytevec   val    = le_latest_val_and_len(le, &vallen);
+        bytevec   val    = NULL;
+
+        brt_cursor_extract_key_and_val(
+            le,
+            brtcursor,
+            &keylen,
+            &key,
+            &vallen,
+            &val
+            );

        assert(brtcursor->current_in_omt == FALSE);
        r = getf(keylen, key,
@@ -4636,7 +4730,7 @@ int
 toku_brt_flatten(BRT brt, TOKULOGGER logger)
 {
    BRT_CURSOR tmp_cursor;
-    int r = toku_brt_cursor(brt, &tmp_cursor, logger);
+    int r = toku_brt_cursor(brt, &tmp_cursor, logger, TXNID_NONE, FALSE);
    if (r!=0) return r;
    brt_search_t search; brt_search_init(&search, brt_cursor_compare_one, BRT_SEARCH_LEFT, 0, 0, tmp_cursor->brt);
    r = brt_cursor_search(tmp_cursor, &search, brt_flatten_getf, NULL);
@@ -4693,12 +4787,21 @@ brt_cursor_shortcut (BRT_CURSOR cursor, int direction, u_int32_t limit, BRT_GET_
            r = toku_omt_fetch(omt, index, &le, NULL);
            assert(r==0);

-            if (!le_is_provdel(le)) {
+            if (!is_le_val_empty(le,cursor)) {
                maybe_do_implicit_promotion_on_query(cursor, le);
                u_int32_t keylen;
-                bytevec   key    = le_latest_key_and_len(le, &keylen);
+                bytevec   key    = NULL;
                u_int32_t vallen;
-                bytevec   val    = le_latest_val_and_len(le, &vallen);
+                bytevec   val    = NULL;
+
+                brt_cursor_extract_key_and_val(
+                    le,
+                    cursor,
+                    &keylen,
+                    &key,
+                    &vallen,
+                    &val
+                    );

                r = getf(keylen, key, vallen, val, getf_v);
                if (r==0) {
@@ -5190,7 +5293,7 @@ toku_brt_lookup (BRT brt, DBT *k, DBT *v, BRT_GET_CALLBACK_FUNCTION getf, void *
    int r, rr;
    BRT_CURSOR cursor;

-    rr = toku_brt_cursor(brt, &cursor, NULL);
+    rr = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    if (rr != 0) return rr;

    int op = brt->flags & TOKU_DB_DUPSORT ? DB_GET_BOTH : DB_SET;
@@ -5573,7 +5676,7 @@ brt_is_empty (BRT brt) {
    BRT_CURSOR cursor;
    int r, r2;
    BOOL is_empty;
-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    if (r == 0) {
        r = toku_brt_cursor_first(cursor, getf_nothing, NULL);
        r2 = toku_brt_cursor_close(cursor);

--- a/newbrt/brt.h
+++ b/newbrt/brt.h
@@ -125,7 +125,7 @@ int toku_verify_brt (BRT brt);
 //int show_brt_blocknumbers(BRT);

 typedef struct brt_cursor *BRT_CURSOR;
-int toku_brt_cursor (BRT, BRT_CURSOR*, TOKULOGGER);
+int toku_brt_cursor (BRT, BRT_CURSOR*, TOKULOGGER, TXNID, BOOL); // TXNID is stored as the cursor's ancestor_id; BOOL selects read_committed reads (which require a non-NULL logger and a TXNID other than TXNID_NONE)

 // get is deprecated in favor of the individual functions below
 int toku_brt_cursor_get (BRT_CURSOR cursor, DBT *key, DBT *val, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v, int get_flags);

--- a/newbrt/leafentry.h
+++ b/newbrt/leafentry.h
@@ -91,13 +91,16 @@ void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le);
 void wbuf_nocrc_LEAFENTRY(struct wbuf *w, LEAFENTRY le);
 int print_leafentry (FILE *outf, LEAFENTRY v); // Print a leafentry out in human-readable form.

+int le_outermost_is_del(LEAFENTRY le); // Return true if the outermost transaction record is a delete.
 int le_is_provdel(LEAFENTRY le); // Return true if it is a provisional delete.
 int le_has_xids(LEAFENTRY le, XIDS xids); // Return true transaction represented by xids is still provisional in this leafentry (le's xid stack is a superset or equal to xids)
 void*     le_latest_key (LEAFENTRY le); // Return the latest key (return NULL for provisional deletes)
 u_int32_t le_latest_keylen (LEAFENTRY le); // Return the latest keylen.
+void* le_outermost_key_and_len (LEAFENTRY le, u_int32_t *len); // Return the key and set *len if the outermost transaction record is an insert (otherwise return NULL and set *len to 0)
 void* le_latest_key_and_len (LEAFENTRY le, u_int32_t *len);
 void*     le_latest_val (LEAFENTRY le); // Return the latest val (return NULL for provisional deletes)
 u_int32_t le_latest_vallen (LEAFENTRY le); // Return the latest vallen.  Returns 0 for provisional deletes.
+void* le_outermost_val_and_len (LEAFENTRY le, u_int32_t *len); // Return the val and set *len if the outermost transaction record is an insert (otherwise return NULL and set *len to 0)
 void* le_latest_val_and_len (LEAFENTRY le, u_int32_t *len);

 // Return any key or value (even if it's only provisional).

--- a/newbrt/logger.c
+++ b/newbrt/logger.c
@@ -1005,6 +1005,14 @@ find_by_xid (OMTVALUE v, void *txnidv) {
    return 0;
 }

+// True iff toku_txnid2txn finds a transaction for txnid; asserts that logger is non-NULL and that the lookup returns 0.
+BOOL is_txnid_live(TOKULOGGER logger, TXNID txnid) {
+    assert(logger);
+    TOKUTXN found = NULL;
+    int r = toku_txnid2txn(logger, txnid, &found); assert(r == 0);
+    return found != NULL;
+}
+
 int toku_txnid2txn (TOKULOGGER logger, TXNID txnid, TOKUTXN *result) {
    if (logger==NULL) return -1;


--- a/newbrt/logger.h
+++ b/newbrt/logger.h
@@ -71,6 +71,7 @@ LSN toku_txn_get_last_lsn (TOKUTXN txn);
 LSN toku_logger_last_lsn(TOKULOGGER logger);
 TOKULOGGER toku_txn_logger (TOKUTXN txn);

+BOOL is_txnid_live(TOKULOGGER logger, TXNID txnid); // Return true if toku_txnid2txn finds a transaction for txnid. logger must not be NULL.
 int toku_txnid2txn (TOKULOGGER logger, TXNID txnid, TOKUTXN *result);
 //int toku_logger_log_checkpoint (TOKULOGGER);
 //int toku_set_func_fsync (int (*fsync_function)(int));

--- a/newbrt/tests/brt-serialize-sub-block-test.c
+++ b/newbrt/tests/brt-serialize-sub-block-test.c
@@ -50,7 +50,7 @@ static void test_sub_block(int n) {
    assert(error == 0);

    BRT_CURSOR cursor;
-    error = toku_brt_cursor(brt, &cursor, NULL);
+    error = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(error == 0);

    for (i=0; ; i++) {

--- a/newbrt/tests/brt-test-cursor-2.c
+++ b/newbrt/tests/brt-test-cursor-2.c
@@ -51,7 +51,7 @@ static void test_multiple_brt_cursor_dbts(int n, DB *db) {
    }

    for (i=0; i<n; i++) {
-        r = toku_brt_cursor(brt, &cursors[i], NULL);
+        r = toku_brt_cursor(brt, &cursors[i], NULL, TXNID_NONE, FALSE);
        assert(r == 0);
    }


--- a/newbrt/tests/brt-test-cursor.c
+++ b/newbrt/tests/brt-test-cursor.c
@@ -19,7 +19,7 @@ static void assert_cursor_notfound(BRT brt, int position) {
    BRT_CURSOR cursor=0;
    int r;

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    struct check_pair pair = {0,0,0,0,0};
@@ -35,7 +35,7 @@ static void assert_cursor_value(BRT brt, int position, long long value) {
    BRT_CURSOR cursor=0;
    int r;

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    if (test_cursor_debug && verbose) printf("key: ");
@@ -52,7 +52,7 @@ static void assert_cursor_first_last(BRT brt, long long firstv, long long lastv)
    BRT_CURSOR cursor=0;
    int r;

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    if (test_cursor_debug && verbose) printf("first key: ");
@@ -250,7 +250,7 @@ static void assert_cursor_walk(BRT brt, int n) {
    int i;
    int r;

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    if (test_cursor_debug && verbose) printf("key: ");
@@ -316,7 +316,7 @@ static void assert_cursor_rwalk(BRT brt, int n) {
    int i;
    int r;

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    if (test_cursor_debug && verbose) printf("key: ");
@@ -402,7 +402,7 @@ static void assert_cursor_walk_inorder(BRT brt, int n) {
    int r;
    char *prevkey = 0;

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    if (test_cursor_debug && verbose) printf("key: ");
@@ -504,7 +504,7 @@ static void test_brt_cursor_split(int n, DB *db) {
        assert(r==0);
    }

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    if (test_cursor_debug && verbose) printf("key: ");
@@ -569,7 +569,7 @@ static void test_multiple_brt_cursors(int n, DB *db) {

    int i;
    for (i=0; i<n; i++) {
-        r = toku_brt_cursor(brt, &cursors[i], NULL);
+        r = toku_brt_cursor(brt, &cursors[i], NULL, TXNID_NONE, FALSE);
        assert(r == 0);
    }

@@ -619,7 +619,7 @@ static void test_multiple_brt_cursor_walk(int n, DB *db) {
    int c;
    /* create the cursors */
    for (c=0; c<ncursors; c++) {
-        r = toku_brt_cursor(brt, &cursors[c], NULL);
+        r = toku_brt_cursor(brt, &cursors[c], NULL, TXNID_NONE, FALSE);
        assert(r == 0);
    }

@@ -706,7 +706,7 @@ static void test_brt_cursor_set(int n, int cursor_op, DB *db) {
        assert(r == 0);
    }

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    /* set cursor to random keys in set { 0, 10, 20, .. 10*(n-1) } */
@@ -779,7 +779,7 @@ static void test_brt_cursor_set_range(int n, DB *db) {
        assert(r == 0);
    }

-    r = toku_brt_cursor(brt, &cursor, NULL);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r==0);

    /* pick random keys v in 0 <= v < 10*n, the cursor should point
@@ -829,7 +829,7 @@ static void test_brt_cursor_delete(int n, DB *db) {
    error = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db);
    assert(error == 0);

-    error = toku_brt_cursor(brt, &cursor, NULL);
+    error = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(error == 0);

    DBT key, val;
@@ -890,7 +890,7 @@ static void test_brt_cursor_get_both(int n, DB *db) {
    error = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db);
    assert(error == 0);

-    error = toku_brt_cursor(brt, &cursor, NULL);
+    error = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);
    assert(error == 0);

    {

--- a/newbrt/tests/brt-test.c
+++ b/newbrt/tests/brt-test.c
@@ -255,7 +255,7 @@ static void test_cursor_last_empty(void) {
    //printf("%s:%d %d alloced\n", __FILE__, __LINE__, toku_get_n_items_malloced()); toku_print_malloced_items();
    r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, toku_builtin_compare_fun, null_db);  assert(r==0);
    //printf("%s:%d %d alloced\n", __FILE__, __LINE__, toku_get_n_items_malloced()); toku_print_malloced_items();
-    r = toku_brt_cursor(brt, &cursor, NULL);            assert(r==0);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);            assert(r==0);
    {
 	struct check_pair pair = {0,0,0,0,0};
 	r = toku_brt_cursor_get(cursor, NULL, NULL, lookup_checkf, &pair, DB_LAST);
@@ -291,7 +291,7 @@ static void test_cursor_next (void) {
    r = toku_brt_insert(brt, toku_fill_dbt(&kbt, "hello", 6), toku_fill_dbt(&vbt, "there", 6), null_txn);
    r = toku_brt_insert(brt, toku_fill_dbt(&kbt, "byebye", 7), toku_fill_dbt(&vbt, "byenow", 7), null_txn);
    if (verbose) printf("%s:%d calling toku_brt_cursor(...)\n", __FILE__, __LINE__);
-    r = toku_brt_cursor(brt, &cursor, NULL);            assert(r==0);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);            assert(r==0);
    toku_init_dbt(&kbt);
    //printf("%s:%d %d alloced\n", __FILE__, __LINE__, toku_get_n_items_malloced()); toku_print_malloced_items();
    toku_init_dbt(&vbt);
@@ -383,7 +383,7 @@ static void test_wrongendian_compare (int wrong_p, unsigned int N) {
    }
    {
 	BRT_CURSOR cursor=0;
-	r = toku_brt_cursor(brt, &cursor, NULL);            assert(r==0);
+	r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);            assert(r==0);

 	for (i=0; i<2; i++) {
 	    unsigned char a[4],b[4];
@@ -423,7 +423,7 @@ static void test_wrongendian_compare (int wrong_p, unsigned int N) {
 	    toku_cachetable_verify(ct);
 	}
 	BRT_CURSOR cursor=0;
-	r = toku_brt_cursor(brt, &cursor, NULL);            assert(r==0);
+	r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);            assert(r==0);
 	
 	for (i=0; i<N; i++) {
 	    unsigned char a[4],b[4];
@@ -567,7 +567,7 @@ static void test_brt_delete_present(int n) {
    /* cursor should not find anything */
    BRT_CURSOR cursor=0;

-    r = toku_brt_cursor(t, &cursor, NULL);
+    r = toku_brt_cursor(t, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r == 0);

    {
@@ -698,7 +698,7 @@ static void test_brt_delete_cursor_first(int n) {
    /* cursor should find the last key: n-1 */
    BRT_CURSOR cursor=0;

-    r = toku_brt_cursor(t, &cursor, NULL);
+    r = toku_brt_cursor(t, &cursor, NULL, TXNID_NONE, FALSE);
    assert(r == 0);

    {
@@ -820,7 +820,7 @@ static void test_brt_delete_both(int n) {
    /* cursor should find only odd pairs */
    BRT_CURSOR cursor=0;

-    r = toku_brt_cursor(t, &cursor, NULL); assert(r == 0);
+    r = toku_brt_cursor(t, &cursor, NULL, TXNID_NONE, FALSE); assert(r == 0);

    for (i=1; ; i += 2) {
 	int kv = toku_htonl(0);
@@ -866,7 +866,7 @@ static void test_new_brt_cursor_create_close (void) {

    int i;
    for (i=0; i<n; i++) {
-        r = toku_brt_cursor(brt, &cursors[i], NULL); assert(r == 0);
+        r = toku_brt_cursor(brt, &cursors[i], NULL, TXNID_NONE, FALSE); assert(r == 0);
    }

    for (i=0; i<n; i++) {
@@ -901,7 +901,7 @@ static void test_new_brt_cursor_first(int n, int dup_mode) {

    BRT_CURSOR cursor=0;

-    r = toku_brt_cursor(t, &cursor, NULL); assert(r == 0);
+    r = toku_brt_cursor(t, &cursor, NULL, TXNID_NONE, FALSE); assert(r == 0);

    toku_init_dbt(&key); key.flags = DB_DBT_REALLOC;
    toku_init_dbt(&val); val.flags = DB_DBT_REALLOC;
@@ -954,7 +954,7 @@ static void test_new_brt_cursor_last(int n, int dup_mode) {

    BRT_CURSOR cursor=0;

-    r = toku_brt_cursor(t, &cursor, NULL); assert(r == 0);
+    r = toku_brt_cursor(t, &cursor, NULL, TXNID_NONE, FALSE); assert(r == 0);

    toku_init_dbt(&key); key.flags = DB_DBT_REALLOC;
    toku_init_dbt(&val); val.flags = DB_DBT_REALLOC;
@@ -1007,7 +1007,7 @@ static void test_new_brt_cursor_next(int n, int dup_mode) {

    BRT_CURSOR cursor=0;

-    r = toku_brt_cursor(t, &cursor, NULL); assert(r == 0);
+    r = toku_brt_cursor(t, &cursor, NULL, TXNID_NONE, FALSE); assert(r == 0);

    for (i=0; ; i++) {
 	int kk = toku_htonl(i);
@@ -1051,7 +1051,7 @@ static void test_new_brt_cursor_prev(int n, int dup_mode) {

    BRT_CURSOR cursor=0;

-    r = toku_brt_cursor(t, &cursor, NULL); assert(r == 0);
+    r = toku_brt_cursor(t, &cursor, NULL, TXNID_NONE, FALSE); assert(r == 0);

    for (i=n-1; ; i--) {
 	int kk = toku_htonl(i);
@@ -1095,7 +1095,7 @@ static void test_new_brt_cursor_current(int n, int dup_mode) {

    BRT_CURSOR cursor=0;

-    r = toku_brt_cursor(t, &cursor, NULL); assert(r == 0);
+    r = toku_brt_cursor(t, &cursor, NULL, TXNID_NONE, FALSE); assert(r == 0);

    for (i=0; ; i++) {
 	{
@@ -1180,7 +1180,7 @@ static void test_new_brt_cursor_set_range(int n, int dup_mode) {
        r = toku_brt_insert(brt, toku_fill_dbt(&key, &k, sizeof k), toku_fill_dbt(&val, &v, sizeof v), 0); assert(r == 0);
    }

-    r = toku_brt_cursor(brt, &cursor, NULL); assert(r==0);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE); assert(r==0);

    /* pick random keys v in 0 <= v < 10*n, the cursor should point
       to the smallest key in the tree that is >= v */
@@ -1238,7 +1238,7 @@ static void test_new_brt_cursor_set(int n, int cursor_op, DB *db) {
        r = toku_brt_insert(brt, toku_fill_dbt(&key, &k, sizeof k), toku_fill_dbt(&val, &v, sizeof v), 0); assert(r == 0);
    }

-    r = toku_brt_cursor(brt, &cursor, NULL); assert(r==0);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE); assert(r==0);

    /* set cursor to random keys in set { 0, 10, 20, .. 10*(n-1) } */
    for (i=0; i<n; i++) {

--- a/newbrt/tests/dup-delete-all.c
+++ b/newbrt/tests/dup-delete-all.c
@@ -79,7 +79,7 @@ static void test_delete_all (void) {
    // Now use a cursor to see if it is all empty
    {
 	BRT_CURSOR cursor = 0;
-	r = toku_brt_cursor(t, &cursor, 0); assert(r==0);
+	r = toku_brt_cursor(t, &cursor, 0, TXNID_NONE, FALSE); assert(r==0);
        struct check_pair pair = {len_ignore, NULL, len_ignore, NULL, 0};
        r = toku_brt_cursor_get(cursor, NULL, NULL, lookup_checkf, &pair, DB_FIRST);
 	assert(r == DB_NOTFOUND);

--- a/newbrt/tests/shortcut.c
+++ b/newbrt/tests/shortcut.c
@@ -22,7 +22,7 @@ test_main (int argc __attribute__((__unused__)), const char *argv[]  __attribute

    r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);                               assert(r==0);
    r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db);   assert(r==0);
-    r = toku_brt_cursor(brt, &cursor, NULL);                                                           assert(r==0);
+    r = toku_brt_cursor(brt, &cursor, NULL, TXNID_NONE, FALSE);                                        assert(r==0);

    int i;
    for (i=0; i<1000; i++) {

--- a/newbrt/ule.c
+++ b/newbrt/ule.c
@@ -64,6 +64,7 @@ static void ule_apply_commit(ULE ule, XIDS xids);
 static void ule_push_insert_uxr(ULE ule, TXNID xid, u_int32_t vallen, void * valp);
 static void ule_push_delete_uxr(ULE ule, TXNID xid);
 static void ule_push_placeholder_uxr(ULE ule, TXNID xid);
+static UXR ule_get_outermost_uxr(ULE ule);
 static UXR ule_get_innermost_uxr(ULE ule);
 static UXR ule_get_first_empty_uxr(ULE ule);
 static void ule_remove_innermost_uxr(ULE ule);
@@ -735,6 +736,13 @@ le_full_promotion(LEAFENTRY le,
 #endif
 }

+// Nonzero iff the outermost transaction record of le is a delete (unpacks le to find out).
+int le_outermost_is_del(LEAFENTRY le) {
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    UXR outermost = ule_get_outermost_uxr(&unpacked);
+    return uxr_is_delete(outermost);
+}

 int le_is_provdel(LEAFENTRY le) {
    int rval;
@@ -857,6 +865,25 @@ have_answer:
    return rval;
 }

+// Key belonging to the outermost transaction record of le.
+// Unpacks le, then:
+//   outermost record is an insert -> returns the unpacked keyp, *len = keylen
+//   anything else                 -> returns NULL,              *len = 0
+// *len is always written.
+void*
+le_outermost_key_and_len (LEAFENTRY le, u_int32_t *len) {
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    UXR outermost = ule_get_outermost_uxr(&unpacked);
+    if (!uxr_is_insert(outermost)) {
+        // no value in the outermost record, so no key is reported either
+        *len = 0;
+        return NULL;
+    }
+    *len = unpacked.keylen;
+    return unpacked.keyp;
+}
+
 //If le_is_provdel, return (NULL,0)
 //Else,             return (key,keylen)
 void*
@@ -943,6 +970,25 @@ le_latest_keylen (LEAFENTRY le) {
    return rval;
 }

+// Value held by the outermost transaction record of le.
+// Unpacks le, then:
+//   outermost record is an insert -> returns that record's valp, *len = vallen
+//   anything else                 -> returns NULL,               *len = 0
+// *len is always written.
+void*
+le_outermost_val_and_len (LEAFENTRY le, u_int32_t *len) {
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    UXR outermost = ule_get_outermost_uxr(&unpacked);
+    if (!uxr_is_insert(outermost)) {
+        // the outermost record is not an insert, so there is no value to hand back
+        *len = 0;
+        return NULL;
+    }
+    *len = outermost->vallen;
+    return outermost->valp;
+}
+
 void*
 le_latest_val_and_len (LEAFENTRY le, u_int32_t *len) {
    u_int8_t num_xrs = le->num_xrs;
@@ -1418,6 +1464,14 @@ ule_get_innermost_uxr(ULE ule) {
    return rval;
 }

+// Return outermost transaction record, i.e. the first element of ule->uxrs.
+// Asserts that ule holds at least one transaction record.
+static UXR 
+ule_get_outermost_uxr(ULE ule) {
+    assert(ule->num_uxrs > 0);
+    return &ule->uxrs[0];
+}
+
 // Return first empty transaction record
 static UXR 
 ule_get_first_empty_uxr(ULE ule) {

--- a/src/tests/isolation-read-committed.c
+++ b/src/tests/isolation-read-committed.c
+// Test the DB_READ_COMMITTED isolation level.
+// In particular, check what DB_READ_COMMITTED and DB_READ_UNCOMMITTED transactions see (through get and through cursors) while another transaction has uncommitted puts and deletes.
+
+#include "test.h"
+
+const int envflags = DB_INIT_MPOOL|DB_CREATE|DB_THREAD |DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_TXN|DB_PRIVATE;
+
+// Checks what readers at different isolation levels observe while another
+// transaction (txn_put) holds uncommitted changes:
+//  1. point lookups (db->get) against a provisional insert of "x" and a
+//     provisional delete of the committed key "a";
+//  2. cursor scans (c_get with DB_NEXT) against a provisional insert of "y".
+// Every key and value is a one-letter C string stored together with its
+// terminating NUL (size 2), so no DBT covers bytes outside its string literal.
+// Returns 0; any unexpected result aborts through CKERR/CKERR2/assert.
+int test_main (int argc, char * const argv[]) {
+    parse_args(argc, argv);
+    int r;
+    r = system("rm -rf " ENVDIR);                                                       CKERR(r);
+    r = toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);                                 CKERR(r);
+    DB_ENV *env;
+    r = db_env_create(&env, 0);                                                         CKERR(r);
+    env->set_errfile(env, stderr);
+    r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO);                      CKERR(r);
+    
+    DB *db;
+    {
+        // Create the dictionary and commit one row, "a" -> "a".
+        DB_TXN *txna;
+        r = env->txn_begin(env, NULL, &txna, 0);                                        CKERR(r);
+
+        r = db_create(&db, env, 0);                                                     CKERR(r);
+        r = db->open(db, txna, "foo.db", NULL, DB_BTREE, DB_CREATE, 0666);              CKERR(r);
+
+        DBT key,val;
+        r = db->put(db, txna, dbt_init(&key, "a", 2), dbt_init(&val, "a", 2), 0);       CKERR(r);
+
+        r = txna->commit(txna, 0);                                                      CKERR(r);
+    }
+    DB_TXN *txn_put, *txn_committed, *txn_uncommitted;
+    r = env->txn_begin(env, NULL, &txn_put, DB_READ_COMMITTED);                         CKERR(r);
+    r = env->txn_begin(env, NULL, &txn_committed, DB_READ_COMMITTED);                   CKERR(r);
+    r = env->txn_begin(env, NULL, &txn_uncommitted, DB_READ_UNCOMMITTED);               CKERR(r);
+
+    //
+    // test a simple get
+    //
+    {
+        DBT key,val;
+        // Provisional insert: visible to the writer and to the read-uncommitted
+        // reader, invisible to the read-committed reader.
+        r = db->put(db, txn_put, dbt_init(&key, "x", 2), dbt_init(&val, "x", 2), 0);    CKERR(r);
+        dbt_init_malloc(&val);
+        r = db->get(db, txn_put, dbt_init(&key, "x", 2), &val, 0);                      CKERR(r);
+        // Presumably dbt_init_malloc selects DB_DBT_MALLOC, so each successful
+        // get hands back a new buffer; release this one before the next get.
+        toku_free(val.data);
+        val.data = NULL;
+        r = db->get(db, txn_committed, dbt_init(&key, "x", 2), &val, 0);                CKERR2(r, DB_NOTFOUND);
+        r = db->get(db, txn_uncommitted, dbt_init(&key, "x", 2), &val, 0);              CKERR(r);
+        toku_free(val.data);
+        val.data = NULL;
+        
+        // Provisional delete of a committed row: gone for the writer and the
+        // read-uncommitted reader, still present for the read-committed reader.
+        r = db->del(db, txn_put, dbt_init(&key, "a", 2), 0);                            CKERR(r);
+        dbt_init_malloc(&val);
+        r = db->get(db, txn_put, dbt_init(&key, "a", 2), &val, 0);                      CKERR2(r, DB_NOTFOUND);
+        r = db->get(db, txn_committed, dbt_init(&key, "a", 2), &val, 0);                CKERR(r);
+        r = db->get(db, txn_uncommitted, dbt_init(&key, "a", 2), &val, 0);              CKERR2(r, DB_NOTFOUND);
+        // Free first, then clear: the reverse order would free NULL and leak.
+        toku_free(val.data);
+        val.data = NULL;
+    }
+
+    
+    r = txn_put->commit(txn_put, 0);                                                    CKERR(r);
+    r = txn_committed->commit(txn_committed, 0);                                        CKERR(r);
+    r = txn_uncommitted->commit(txn_uncommitted, 0);                                    CKERR(r);
+
+    r = env->txn_begin(env, NULL, &txn_put, DB_READ_COMMITTED);                         CKERR(r);
+    r = env->txn_begin(env, NULL, &txn_committed, DB_READ_COMMITTED);                   CKERR(r);
+    r = env->txn_begin(env, NULL, &txn_uncommitted, DB_READ_UNCOMMITTED);               CKERR(r);
+
+    //
+    // test cursor scans
+    //
+    {
+        DBT key,val;
+        DBT curr_key, curr_val;
+        DBC* cursor_committed = NULL;
+        DBC* cursor_uncommitted = NULL;
+        memset(&curr_key, 0, sizeof(curr_key));
+        memset(&curr_val, 0, sizeof(curr_val));
+
+        r = db->cursor(db, txn_committed, &cursor_committed, 0);                        CKERR(r);
+        r = db->cursor(db, txn_uncommitted, &cursor_uncommitted, 0);                    CKERR(r);
+
+        r = db->put(db, txn_put, dbt_init(&key, "y", 2), dbt_init(&val, "y", 2), 0);    CKERR(r);
+
+        // "x" was committed by the previous round, so both cursors see it first.
+        r = cursor_uncommitted->c_get(cursor_uncommitted, &curr_key, &curr_val, DB_NEXT); CKERR(r);
+        assert(((char *)(curr_key.data))[0] == 'x');
+        assert(((char *)(curr_val.data))[0] == 'x');
+
+        r = cursor_committed->c_get(cursor_committed, &curr_key, &curr_val, DB_NEXT);   CKERR(r);
+        assert(((char *)(curr_key.data))[0] == 'x');
+        assert(((char *)(curr_val.data))[0] == 'x');
+
+        // "y" is still provisional: only the read-uncommitted cursor reaches it.
+        r = cursor_committed->c_get(cursor_committed, &curr_key, &curr_val, DB_NEXT);   CKERR2(r, DB_NOTFOUND);
+        r = cursor_uncommitted->c_get(cursor_uncommitted, &curr_key, &curr_val, DB_NEXT); CKERR(r);
+        assert(((char *)(curr_key.data))[0] == 'y');
+        assert(((char *)(curr_val.data))[0] == 'y');
+
+        // Close the cursors before the transactions that own them commit.
+        r = cursor_committed->c_close(cursor_committed);                                CKERR(r);
+        r = cursor_uncommitted->c_close(cursor_uncommitted);                            CKERR(r);
+    }
+    r = txn_put->commit(txn_put, 0);                                                    CKERR(r);
+    r = txn_committed->commit(txn_committed, 0);                                        CKERR(r);
+    r = txn_uncommitted->commit(txn_uncommitted, 0);                                    CKERR(r);
+
+
+    r = db->close(db, 0);                                                               CKERR(r);
+    r = env->close(env, 0);                                                             CKERR(r);
+    
+    return 0;
+}
--- a/src/ydb.c
+++ b/src/ydb.c
@@ -1996,12 +1996,19 @@ static int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t f
    if (!(env->i->open_flags & DB_INIT_TXN))  return toku_ydb_do_error(env, EINVAL, "Environment does not have transactions enabled\n");
    u_int32_t txn_flags = 0;
    txn_flags |= DB_TXN_NOWAIT; //We do not support blocking locks.
-    uint32_t child_isolation_flags = 0; //TODO: #2126 DB_READ_COMMITTED should be added here once supported.
+    uint32_t child_isolation_flags = 0;
    uint32_t parent_isolation_flags = 0;
    int inherit = 0;
    int set_isolation = 0;
+    if ((flags & DB_READ_UNCOMMITTED) && (flags & DB_READ_COMMITTED)) {
+        return toku_ydb_do_error(
+            env, 
+            EINVAL, 
+            "Transaction cannot have both DB_READ_COMMITTED and DB_READ_UNCOMMITTED set\n"
+            );
+    }
    if (stxn) {
-        parent_isolation_flags = db_txn_struct_i(stxn)->flags & (DB_READ_UNCOMMITTED); //TODO: #2126 DB_READ_COMMITTED should be added here once supported.
+        parent_isolation_flags = db_txn_struct_i(stxn)->flags & (DB_READ_UNCOMMITTED | DB_READ_COMMITTED);
        if (internal || flags&DB_INHERIT_ISOLATION) {
            flags &= ~DB_INHERIT_ISOLATION;
            inherit = 1;
@@ -2009,12 +2016,12 @@ static int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t f
            child_isolation_flags = parent_isolation_flags;
        }
    }
-    if (flags&DB_READ_UNCOMMITTED) {
+    if (flags & (DB_READ_UNCOMMITTED|DB_READ_COMMITTED)) {
        if (set_isolation)
            return toku_ydb_do_error(env, EINVAL, "Cannot set isolation two different ways in DB_ENV->txn_begin\n");
        set_isolation = 1;
-        child_isolation_flags |=  DB_READ_UNCOMMITTED;
-        flags                 &= ~DB_READ_UNCOMMITTED;
+        child_isolation_flags |=  (flags & (DB_READ_UNCOMMITTED|DB_READ_COMMITTED));
+        flags                 &= ~(DB_READ_UNCOMMITTED | DB_READ_COMMITTED);
    }
    txn_flags |= child_isolation_flags;
    if (flags&DB_TXN_NOWAIT) {
@@ -2406,8 +2413,13 @@ static inline u_int32_t get_prelocked_flags(u_int32_t flags, DB_TXN* txn, DB* db

    // for internal (non-user) dictionary, do not set DB_PRELOCK
    if (db->i->dname) {
-	//DB_READ_UNCOMMITTED transactions 'own' all read locks for user-data dictionaries.
-	if (txn && db_txn_struct_i(txn)->flags&DB_READ_UNCOMMITTED) lock_flags |= DB_PRELOCKED;
+        //DB_READ_UNCOMMITTED and DB_READ_COMMITTED transactions 'own' all read locks for user-data dictionaries, so DB_PRELOCKED is added to the lock flags.
+        if (txn && 
+            (db_txn_struct_i(txn)->flags& (DB_READ_UNCOMMITTED | DB_READ_COMMITTED))
+           )
+        {
+            lock_flags |= DB_PRELOCKED;
+        }
    }
    return lock_flags;
 }
@@ -4007,7 +4019,21 @@ static int toku_db_cursor(DB * db, DB_TXN * txn, DBC ** c, u_int32_t flags, int
 	dbc_struct_i(result)->skey = &dbc_struct_i(result)->skey_s;
 	dbc_struct_i(result)->sval = &dbc_struct_i(result)->sval_s;
    }
-    int r = toku_brt_cursor(db->i->brt, &dbc_struct_i(result)->c, db->dbenv->i->logger);
+    DB_TXN* txn_anc = NULL;
+    TXNID txn_anc_id = TXNID_NONE;
+    BOOL is_read_committed = FALSE;
+    if (txn) {
+        txn_anc = toku_txn_ancestor(txn);
+        txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn);
+        is_read_committed = ((db_txn_struct_i(txn_anc)->flags & DB_READ_COMMITTED) != 0);
+    }
+    int r = toku_brt_cursor(
+        db->i->brt, 
+        &dbc_struct_i(result)->c, 
+        db->dbenv->i->logger, 
+        txn_anc_id, 
+        is_read_committed
+        );
    assert(r == 0);
    *c = result;
    return 0;
@@ -4228,8 +4254,9 @@ toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYP
    int is_db_excl    = flags & DB_EXCL;    unused_flags&=~DB_EXCL;
    int is_db_create  = flags & DB_CREATE;  unused_flags&=~DB_CREATE;

-    //We support READ_UNCOMMITTED whether or not the flag is provided.
+    //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided.
                                            unused_flags&=~DB_READ_UNCOMMITTED;
+                                            unused_flags&=~DB_READ_COMMITTED;
    if (unused_flags & ~DB_THREAD) return EINVAL; // unknown flags

    if (is_db_excl && !is_db_create) return EINVAL;
@@ -4329,8 +4356,9 @@ db_open_iname(DB * db, DB_TXN * txn, const char *iname_in_env, u_int32_t flags,

    int is_db_excl    = flags & DB_EXCL;    flags&=~DB_EXCL;
    int is_db_create  = flags & DB_CREATE;  flags&=~DB_CREATE;
-    //We support READ_UNCOMMITTED whether or not the flag is provided.
+    //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided.
                                            flags&=~DB_READ_UNCOMMITTED;
+                                            flags&=~DB_READ_COMMITTED;
    if (flags & ~DB_THREAD) return EINVAL; // unknown flags

    if (is_db_excl && !is_db_create) return EINVAL;
@@ -4941,8 +4969,9 @@ cleanup:
 static int toku_db_pre_acquire_read_lock(DB *db, DB_TXN *txn, const DBT *key_left, const DBT *val_left, const DBT *key_right, const DBT *val_right) {
    HANDLE_PANICKED_DB(db);
    if (!db->i->lt || !txn) return EINVAL;
-    //READ_UNCOMMITTED transactions do not need read locks.
+    //READ_UNCOMMITTED and READ_COMMITTED transactions do not need read locks.
    if (db_txn_struct_i(txn)->flags&DB_READ_UNCOMMITTED) return 0;
+    if (db_txn_struct_i(txn)->flags&DB_READ_COMMITTED) return 0;

    DB_TXN* txn_anc = toku_txn_ancestor(txn);
    int r;