ydb.c 218 KB
Newer Older
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1
/* -*- mode: C; c-basic-offset: 4 -*- */
2
#ident "Copyright (c) 2007-2009 Tokutek Inc.  All rights reserved."
Zardosht Kasheff's avatar
Zardosht Kasheff committed
3
 
4 5 6
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

const char *toku_patent_string = "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it.";
7
const char *toku_copyright_string = "Copyright (c) 2007-2009 Tokutek Inc.  All rights reserved.";
8

9
#include <toku_portability.h>
10
#include <toku_pthread.h>
11
#include <ctype.h>
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
12 13
#include <errno.h>
#include <limits.h>
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
14 15 16
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
17
#include <fcntl.h>
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
18
#include <sys/stat.h>
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
19 20
#include <sys/types.h>
#include <unistd.h>
21
#include <db.h>
22
#include "toku_assert.h"
23
#include "ydb.h"
24
#include "ydb-internal.h"
25
#include "brt-internal.h"
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
26
#include "cachetable.h"
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
27 28
#include "log.h"
#include "memory.h"
29
#include "dlmalloc.h"
30
#include "checkpoint.h"
31
#include "key.h"
32 33
#include "loader.h"
#include "ydb_load.h"
34
#include "brtloader.h"
Yoni Fogel's avatar
Yoni Fogel committed
35

36 37 38 39 40 41 42 43 44 45
// Choose the identifiers that DB_ENV_CREATE_FUN and DB_CREATE_FUN expand to.
// With TOKUTRACE defined they expand to the *_toku10 names; otherwise they expand to the
// plain db_env_create / db_create names and the two trace-file functions are defined here
// as stubs that do nothing.
// NOTE(review): presumably a tracing layer provides the real trace-file functions when
// TOKUTRACE is defined -- confirm against the build.
#ifdef TOKUTRACE
 #define DB_ENV_CREATE_FUN db_env_create_toku10
 #define DB_CREATE_FUN db_create_toku10
#else
 #define DB_ENV_CREATE_FUN db_env_create
 #define DB_CREATE_FUN db_create
 // Stub: fname is ignored; always returns 0.
 int toku_set_trace_file (char *fname __attribute__((__unused__))) { return 0; }
 // Stub: always returns 0.
 int toku_close_trace_file (void) { return 0; } 
#endif

46 47 48

// Accountability: operation counters available for debugging and for "show engine status".
// All of them have static storage duration and therefore start at zero.
static u_int64_t num_inserts;
static u_int64_t num_inserts_fail;
static u_int64_t num_deletes;
static u_int64_t num_deletes_fail;
static u_int64_t num_point_queries;
static u_int64_t num_sequential_queries;
static u_int64_t logsuppress;                // number of times logs are suppressed for empty table (2440)
static u_int64_t logsuppressfail;            // number of times unable to suppress logs for empty table (2440)
static time_t    startuptime;                // timestamp of system startup
    
58

59 60 61
// Names that validate_env() joins onto the environment home directory (env->i->dir) and
// stats to decide whether an environment already exists there.
const char * environmentdictionary = "tokudb.environment";
const char * fileopsdirectory = "tokudb.directory";

62
static int env_get_iname(DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt);
63 64


Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
65 66
/** The default maximum number of persistent locks in a lock tree.
 *  NOTE(review): identifiers beginning with a double underscore are reserved for the
 *  implementation in C; the name is not file-local, so it is left unchanged here. */
const u_int32_t __toku_env_default_max_locks = 1000;
Rich Prohaska's avatar
Rich Prohaska committed
67

68 69 70 71 72 73 74
// Clear every byte of *dbt, then flag it DB_DBT_REALLOC.
// Returns its argument so the call can be nested inside another expression.
static inline DBT*
init_dbt_realloc(DBT *dbt) {
    DBT *const target = dbt;
    memset(target, 0, sizeof(DBT));
    target->flags = DB_DBT_REALLOC;
    return target;
}

75 76 77 78 79 80
// Callback used for redirecting dictionaries: point db at a different brt.
// It is handed to toku_brt_init() by toku_ydb_init().
static void
ydb_set_brt(DB *db, BRT brt) {
    (*db).i->brt = brt;
}

Yoni Fogel's avatar
Yoni Fogel committed
81 82
int toku_ydb_init(void) {
    int r = 0;
83
    startuptime = time(NULL);
Yoni Fogel's avatar
Yoni Fogel committed
84 85
    //Lower level must be initialized first.
    if (r==0) 
86
        r = toku_brt_init(toku_ydb_lock, toku_ydb_unlock, ydb_set_brt);
Yoni Fogel's avatar
Yoni Fogel committed
87 88 89
    if (r==0) 
        r = toku_ydb_lock_init();
    return r;
90 91
}

Yoni Fogel's avatar
Yoni Fogel committed
92 93 94 95 96 97 98 99
// Tear down the ydb layer in the reverse order of toku_ydb_init():
// destroy the ydb lock first, then the brt layer.
// Returns 0 on success, or the first non-zero result; the brt layer is not destroyed
// if destroying the lock fails.
int toku_ydb_destroy(void) {
    int r = toku_ydb_lock_destroy();
    if (r != 0)
        return r;
    //Lower level must be cleaned up last.
    return toku_brt_destroy();
}

102 103 104 105 106
// A getf callback that ignores the key, value and extra pointer it is given
// and always reports success (0).
static int
ydb_getf_do_nothing(DBT const* UU(key), DBT const* UU(val), void* UU(extra)) {
    const int success = 0;
    return success;
}

Rich Prohaska's avatar
Rich Prohaska committed
107 108 109
/* the ydb reference is used to cleanup the library when there are no more references to it */
static int toku_ydb_refs = 0;

/* Take one reference on the library. */
static inline void ydb_add_ref(void) {
    toku_ydb_refs += 1;
}

114
/* Drop one reference on the library.
 * Asserts that at least one reference is outstanding.  When the count reaches zero
 * the global destructors are run. */
static inline void ydb_unref(void) {
    assert(toku_ydb_refs > 0);
    toku_ydb_refs -= 1;
    if (toku_ydb_refs == 0) {
        /* call global destructors */
        toku_malloc_cleanup();
    }
}

/* env methods */
static int toku_env_close(DB_ENV *env, u_int32_t flags);
124 125 126
static int toku_env_set_data_dir(DB_ENV * env, const char *dir);
static int toku_env_set_lg_dir(DB_ENV * env, const char *dir);
static int toku_env_set_tmp_dir(DB_ENV * env, const char *tmp_dir);
Rich Prohaska's avatar
Rich Prohaska committed
127

128 129
// Returns 1 if the environment has a cachetable (env->i->cachetable is non-zero), else 0.
// toku_env_open() uses this to reject a second open of the same environment.
static inline int env_opened(DB_ENV *env) {
    return env->i->cachetable ? 1 : 0;
}

132
// initialize the environment's list of open txn's
static void env_init_open_txn(DB_ENV *env) {
    struct toku_list *head = &env->i->open_txns;
    toku_list_init(head);
}

136 137
// add a txn to the list of open txn's
// The txn is linked through its own open_txns member.
static void env_add_open_txn(DB_ENV *env, DB_TXN *txn) {
    struct toku_list *link = (struct toku_list *) (void *) &txn->open_txns;
    toku_list_push(&env->i->open_txns, link);
}

// remove a txn from the list of open txn's
// Only the txn's own link is needed to unlink it, so env is unused.
static void env_remove_open_txn(DB_ENV *UU(env), DB_TXN *txn) {
    struct toku_list *link = (struct toku_list *) (void *) &txn->open_txns;
    toku_list_remove(link);
}

146
static int toku_txn_abort(DB_TXN * txn, TXN_PROGRESS_POLL_FUNCTION, void*);
147

148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
// Write a timestamped "space is low" warning to stderr and flush it.
// Only the first 24 characters of the ctime_r() text are printed, which drops its trailing newline.
static void
env_fs_report_in_yellow(DB_ENV *UU(env)) {
    time_t current = time(NULL);
    char stamp[26];
    const char *when = ctime_r(&current, stamp);
    fprintf(stderr, "%.24s Tokudb file system space is low\n", when);
    fflush(stderr);
}

// Write a timestamped "space is really low, access restricted" warning to stderr and flush it.
// Only the first 24 characters of the ctime_r() text are printed, which drops its trailing newline.
static void
env_fs_report_in_red(DB_ENV *UU(env)) {
    time_t current = time(NULL);
    char stamp[26];
    const char *when = ctime_r(&current, stamp);
    fprintf(stderr, "%.24s Tokudb file system space is really low and access is restricted\n", when);
    fflush(stderr);
}

// Size of the red zone for a file system of `total` size:
// env->i->redzone is a percentage, so the result is that percentage of total.
// The multiplication is done before the division.
static inline uint64_t
env_fs_redzone(DB_ENV *env, uint64_t total) {
    return (total * env->i->redzone) / 100;
}

#define ZONEREPORTLIMIT 12

// Measure the file system holding `dir` and add 1 to *in_yellow if the available space is below
// twice the red zone, and 1 to *in_red if it is below the red zone.
// Asserts that the file system sizes could be obtained.
static void
env_fs_poll_dir(DB_ENV *env, const char *dir, int *in_yellow, int *in_red) {
    uint64_t avail_size, total_size;
    int r = toku_get_filesystem_sizes(dir, &avail_size, NULL, &total_size);
    assert(r == 0);
    if (0) fprintf(stderr, "%s %"PRIu64" %"PRIu64"\n", dir, avail_size, total_size);
    uint64_t redzone = env_fs_redzone(env, total_size);
    *in_yellow += (avail_size < 2 * redzone);
    *in_red += (avail_size < redzone);
}

// Move to FS_RED at poll sequence number `now`.
// The report is issued only if more than ZONEREPORTLIMIT polls have passed since red was last
// entered, or if fewer than ZONEREPORTLIMIT polls have happened at all (system startup).
static void
env_fs_enter_red(DB_ENV *env, uint64_t now) {
    if ((now - env->i->last_seq_entered_red > ZONEREPORTLIMIT) || (now < ZONEREPORTLIMIT))
        env_fs_report_in_red(env);
    env->i->fs_state = FS_RED;
    env->i->last_seq_entered_red = now;
}

// Move to FS_YELLOW at poll sequence number `now`, with the same report throttling as for red.
static void
env_fs_enter_yellow(DB_ENV *env, uint64_t now) {
    if ((now - env->i->last_seq_entered_yellow > ZONEREPORTLIMIT) || (now < ZONEREPORTLIMIT))
        env_fs_report_in_yellow(env);
    env->i->fs_state = FS_YELLOW;
    env->i->last_seq_entered_yellow = now;
}

// Check the available space in the file systems used by tokudb and erect barriers when available space gets low.
// arg is the DB_ENV being polled.  The home directory is always measured; the data directory and
// the log directory are measured only when their paths differ from the ones already measured.
// Updates env->i->fs_seq and env->i->fs_state.  Always returns 0.
static int
env_fs_poller(void *arg) {
    if (0) printf("%s:%d %p\n", __FUNCTION__, __LINE__, arg);

    DB_ENV *env = (DB_ENV *) arg;
    const char *home_dir = env->i->dir;
    const char *data_dir = env->i->real_data_dir;
    const char *log_dir  = env->i->real_log_dir;

    int in_yellow = 0; // non-zero to issue warning to user
    int in_red = 0;    // non-zero to prevent certain operations (returning ENOSPC)

    // get the fs sizes for the home dir
    env_fs_poll_dir(env, home_dir, &in_yellow, &in_red);

    // get the fs sizes for the data dir if different than the home dir
    if (strcmp(home_dir, data_dir) != 0)
        env_fs_poll_dir(env, data_dir, &in_yellow, &in_red);

    // get the fs sizes for the log dir if different than the home dir and data dir
    if (strcmp(home_dir, log_dir) != 0 && strcmp(data_dir, log_dir) != 0)
        env_fs_poll_dir(env, log_dir, &in_yellow, &in_red);

    env->i->fs_seq++;                    // how many times through this polling loop?
    uint64_t now = env->i->fs_seq;

    // Don't issue report if we have not been out of this fs_state for a while, unless we're at system startup
    switch (env->i->fs_state) {
    case FS_RED:
        // Leaving red never issues a report and does not update last_seq_entered_yellow.
        if (!in_red)
            env->i->fs_state = in_yellow ? FS_YELLOW : FS_GREEN;
        break;
    case FS_YELLOW:
        if (in_red)
            env_fs_enter_red(env, now);
        else if (!in_yellow)
            env->i->fs_state = FS_GREEN;
        break;
    case FS_GREEN:
        if (in_red)
            env_fs_enter_red(env, now);
        else if (in_yellow)
            env_fs_enter_yellow(env, now);
        break;
    }
    return 0;
}
#undef ZONEREPORTLIMIT

// Set the file system monitoring defaults for a freshly created environment:
// the poller is not running yet, the state is green, polling happens every 5 seconds
// and the red zone is 5 percent of total space.
static void
env_fs_init(DB_ENV *env) {
    env->i->fs_poller_is_init = FALSE;
    env->i->fs_state = FS_GREEN;
    env->i->redzone = 5;       // percent of total space
    env->i->fs_poll_time = 5;  // seconds
}

// Initialize the minicron that polls file system space.
// The minicron is set up to call env_fs_poller(env) with a period of env->i->fs_poll_time.
// Asserts that the setup succeeded, so the value returned is 0.
static int
env_fs_init_minicron(DB_ENV *env) {
    int rc = toku_minicron_setup(&env->i->fs_poller, env->i->fs_poll_time, env_fs_poller, env);
    assert(rc == 0);
    env->i->fs_poller_is_init = TRUE;
    return rc;
}

// Destroy the file system space minicron.
// Does nothing if the poller was never initialized; asserts that the shutdown succeeds.
static void
env_fs_destroy(DB_ENV *env) {
    if (!env->i->fs_poller_is_init)
        return;
    int rc = toku_minicron_shutdown(&env->i->fs_poller);
    assert(rc == 0);
    env->i->fs_poller_is_init = FALSE;
}

// Check if the available file system space is less than the reserve.
// The answer comes from the state last recorded in env->i->fs_state; no file system is
// measured here.
// Returns ENOSPC if not enough space (and counts the refusal in enospc_redzone_ctr), otherwise 0.
static inline int 
env_check_avail_fs_space(DB_ENV *env) {
    if (env->i->fs_state != FS_RED)
        return 0;
    env->i->enospc_redzone_ctr++;
    return ENOSPC;
}

Rich Prohaska's avatar
Rich Prohaska committed
288 289
/* db methods */
// Returns 1 if db->i->opened is non-zero, else 0.
static inline int db_opened(DB *db) {
    return db->i->opened ? 1 : 0;
}

static int toku_db_put(DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags);
static int toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags);
295
static int toku_db_cursor(DB *db, DB_TXN * txn, DBC **c, u_int32_t flags, int is_temporary_cursor);
Rich Prohaska's avatar
Rich Prohaska committed
296 297 298

/* txn methods */

299
/* lightweight cursor methods. */
300 301 302
static int toku_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);

static int toku_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
Yoni Fogel's avatar
Yoni Fogel committed
303

304 305 306
static int toku_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_next_nodup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_next_dup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
Yoni Fogel's avatar
Yoni Fogel committed
307

308 309 310
static int toku_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_prev_nodup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_prev_dup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
Yoni Fogel's avatar
Yoni Fogel committed
311

312 313
static int toku_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
Yoni Fogel's avatar
Yoni Fogel committed
314

315 316
static int toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_set_range(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra);
317
static int toku_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra);
318 319
static int toku_c_getf_get_both(DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_get_both_range(DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra);
320
static int toku_c_getf_get_both_range_reverse(DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra);
321

322 323 324
static int toku_c_getf_heaviside(DBC *c, u_int32_t flags,
                                 YDB_HEAVISIDE_CALLBACK_FUNCTION f, void *extra_f,
                                 YDB_HEAVISIDE_FUNCTION h, void *extra_h, int direction); 
Yoni Fogel's avatar
Yoni Fogel committed
325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459
// There is a total order on all key/value pairs in the database.
// In a DB_DUPSORT db, let V_i = (Key,Value) refer to the ith element (0 based indexing).
// In a NODUP      db, let V_i = (Key)       refer to the ith element (0 based indexing).
// We define V_{-1}             = -\infty and
//           V_{|V|}            =  \infty and
//           h(-\infty,extra_h) = -1 by definition and
//           h( \infty,extra_h) =  1 by definition
// Requires: Direction != 0
// Effect: 
//    if direction >0 then find the smallest i such that h(V_i,extra_h)>=0.
//    if direction <0 then find the largest  i such that h(V_i,extra_h)<=0.
//    Let signum(r_h) = signum(h(V_i, extra_h))
//    If flags&(DB_PRELOCKED|DB_PRELOCKED_WRITE) then skip locking
//      That is, we already own the locks
//    else 
//      if direction >0 then readlock [V_{i-1}, V_i]
//      if direction <0 then readlock [V_i,     V_{i+1}]
//      That is, If we search from the right, lock the element we found, up to the
//           next element to the right.
//      If locking fails, return the locking error code
//    
//    If (0<=i<|V|) then
//      call f(V_i.Key, V_i.Value, extra_f, r_h)
//      Note: The lifetime of V_i.Key and V_i.Value is limited: they may only
//            be referenced until f returns
//      and return 0
//    else
//      return DB_NOTFOUND
// Rationale: Locking
//      If we approach from the left (direction<0) we need to prevent anyone
//      from inserting anything to our right that could change our answer,
//      so we lock the range from the element found, to the next element to the right.
//      The inverse argument applies for approaching from the right.
// Rationale: passing r_h to f
//      We want to save the performance hit of requiring f to call h again to
//      find out what h's return value was.
// Rationale: separate extra_f, extra_h parameters
//      If the same extra parameter is sent to both f and h, then you need a
//      special struct for each tuple (f_i, h_i) you use instead of a struct for each
//      f_i and each h_i.
// Requires: The signum of h is monotonically increasing.
//  Requires: f does not keep references to key, value, or the data they point to once f
//           exits
// Returns
//      0                   success
//      DB_NOTFOUND         i is not in [0,|V|)
//      DB_LOCK_NOTGRANTED  Failed to obtain a lock.
//  On nonzero return, what c points to becomes undefined, That is, c becomes uninitialized
// Performance: ... TODO
// Implementation Notes:
//      How do we get the extra locking information efficiently?
//        After finding the target, we can copy the cursor, do a DB_NEXT,
//        or do a DB_NEXT+DB_PREV (vice versa for direction<0).
//        Can we have the BRT provide two key/value pairs instead of one?
//        That is, brt_cursor_c_getf_heavi_and_next for direction >0
//        and  brt_cursor_c_getf_heavi_and_prev for direction <0
//      Current suggestion is to make a copy of the cursor, and use the
//        copy to find the next(prev) element by using DB_NEXT(DB_PREV).
//        This has the overhead of needing to make a copy of the cursor,
//        which probably has a memcpy involved.
//        The argument against returning two key/value pairs is that
//        we should not have to pay to retrieve both when we're doing something
//        simple like DB_NEXT.
//        This could be mitigated by having two BRT functions (or one with a
//        BOOL parameter) such that it only returns two values when necessary.
// Parameters
//  c           The cursor
//  flags       Additional bool parameters. The current allowed flags are
//              DB_PRELOCKED and DB_PRELOCKED_WRITE (bitwise or'd to use both)
//  h           A heaviside function that, along with direction, defines the query.
//              extra_h is passed to h
//              For additional information on heaviside functions, see omt.h
//              NOTE: In a DB_DUPSORT database, both key and value will be
//              passed to h.  In a NODUP database, only key will be passed to h.
//  f           A callback function (i.e. smart dbts) to provide the result of the
//              query.  key and value are the key/value pair found, extra_f is
//              passed to f, r_h is the return value for h for the key and value returned.
//              This is used as output. That is, we call f with the outputs of the
//              function.
//  direction   Which direction to search in on the heaviside function.  >0
//              means from the right, <0 means from the left.
//  extra_f     Any extra information required for f
//  extra_h     Any extra information required for h
//
// Example:
//  Find the smallest V_i = (key_i,val_i) such that key_i > key_x, assume
//   key.data and val.data are c strings, and print them out.
//      Create a struct to hold key_x, that is extra_h
//      Direction = 1 (We approach from the right, and want the smallest such
//          element).
//      Construct a heaviside function that returns >=0 if the
//      given key > key_x, and -1 otherwise
//          That is, call the comparison function on (key, key_x)
//      Create a struct to hold key_x, that is extra_f
//      construct f to call printf on key_x.data, key_i.data, val_i.data.
//  Find the least upper bound (greatest lower bound)
//      In this case, h can just return the comparison function's answer.
//      direction >0 means upper bound, direction <0 means lower bound.
//      (If you want upper/lower bound of the keyvalue pair, you need
//      to call the comparison function on the values if the key comparison
//      returns 0).
// Handlerton implications:
//  The handlerton needs at most one heaviside function per special query type (where a
//  special query is one that is not directly supported by the bdb api excluding
//  this function).
//  It is possible that more than one query type can use the same heaviside function
//  if the extra_h parameter can be used to change its behavior sufficiently.
//
//  That is, part of extra_h can be a boolean strictly_greater
//  You can construct a single heaviside function that converts 0 to -1
//  (strictly greater) from the comparison function, or one that just returns
//  the results of the comparison function (greater or equal).
//
// Implementation Notes:
//  The BRT search function supports the following searches:
//      SEARCH_LEFT(h(V_i))
//          Given a step function b, that goes from 0 to 1
//          find the greatest i such that h_b(V_i) == 1
//          If it does not exist, return not found
//      SEARCH_RIGHT(h(V_i))
//          Given a step function b, that goes from 1 to 0
//          find the smallest i such that h_b(V_i) == 1
//          If it does not exist, return not found
//  We can implement c_getf_heavi using these BRT search functions.
//  A query of direction<0:
//      Create wrapper function B
//          return h(V_i) <=0 ? 1 : 0;
//      SEARCH_RIGHT(B)
//  A query of direction>0:
//      Create wrapper function B
//          return h(V_i) >=0 ? 1 : 0;
//      SEARCH_LEFT(B)

// Effect: Lightweight cursor get

Rich Prohaska's avatar
Rich Prohaska committed
460 461 462 463 464
/* cursor methods */
static int toku_c_get(DBC * c, DBT * key, DBT * data, u_int32_t flag);
static int toku_c_del(DBC *c, u_int32_t flags);
static int toku_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags);
static int toku_c_close(DBC * c);
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
465

466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
// Recompute env->i->real_data_dir, freeing any previous value first.
// If a data dir is configured the result is built from the home dir and the data dir;
// otherwise it is a copy of the home dir.  Asserts that the home dir is set.
static void
env_setup_real_data_dir(DB_ENV *env) {
    toku_free(env->i->real_data_dir);
    env->i->real_data_dir = NULL;

    assert(env->i->dir);
    env->i->real_data_dir = env->i->data_dir
        ? toku_construct_full_name(2, env->i->dir, env->i->data_dir)
        : toku_strdup(env->i->dir);
}

// Recompute env->i->real_log_dir, freeing any previous value first.
// If a log dir is configured the result is built from the home dir and the log dir;
// otherwise it is a copy of the home dir.  Asserts that the home dir is set.
static void
env_setup_real_log_dir(DB_ENV *env) {
    toku_free(env->i->real_log_dir);
    env->i->real_log_dir = NULL;

    // Both cases need the home dir, so check it once up front.
    assert(env->i->dir);
    if (env->i->lg_dir) {
        env->i->real_log_dir = toku_construct_full_name(2, env->i->dir, env->i->lg_dir);
    } else {
        env->i->real_log_dir = toku_strdup(env->i->dir);
    }
}

492 493
// Run recovery for the environment and return tokudb_recover()'s result.
// Asserts that the real log dir has been set up.
// The ydb lock is released for the duration of the recovery call and re-acquired afterwards.
static int 
ydb_do_recovery (DB_ENV *env) {
    assert(env->i->real_log_dir);
    toku_ydb_unlock();
    int r = tokudb_recover(env->i->dir, env->i->real_log_dir,
                           env->i->bt_compare, env->i->dup_compare,
                           env->i->generate_row_for_put, env->i->generate_row_for_del,
                           env->i->cachetable_size);
    toku_ydb_lock();
    return r;
}

// Returns DB_RUNRECOVERY if tokudb_needs_recovery() reports that the log in the real log dir
// needs recovery, otherwise 0.  Asserts that the real log dir has been set up.
static int needs_recovery (DB_ENV *env) {
    assert(env->i->real_log_dir);
    int recovery_needed = tokudb_needs_recovery(env->i->real_log_dir, TRUE);
    if (recovery_needed)
        return DB_RUNRECOVERY;
    return 0;
}

509 510 511 512 513 514
static int toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags);
static int toku_db_set_bt_compare(DB * db, int (*bt_compare) (DB *, const DBT *, const DBT *));
static int toku_db_set_dup_compare(DB *db, int (*dup_compare)(DB *, const DBT *, const DBT *));
static int toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode);
static int toku_env_txn_checkpoint(DB_ENV * env, u_int32_t kbyte, u_int32_t min, u_int32_t flags);
static int toku_db_close(DB * db, u_int32_t flags);
515
static int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags, int internal);
516
static int toku_txn_commit(DB_TXN * txn, u_int32_t flags, TXN_PROGRESS_POLL_FUNCTION, void*);
517 518
static int db_open_iname(DB * db, DB_TXN * txn, const char *iname, u_int32_t flags, int mode);

519
static void finalize_file_removal(DICTIONARY_ID dict_id, void * extra);
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565

// Instruct db to use the default (built-in) key comparison function
// by setting the flag bits in the db and brt structs.
// Returns EINVAL (through toku_ydb_do_error) if the db is already open or a key comparison
// has already been chosen; otherwise returns the result of reading/updating the brt flags.
// key_compare_was_set is recorded only when the brt flags were updated successfully.
static int
db_use_builtin_key_cmp(DB *db) {
    HANDLE_PANICKED_DB(db);
    if (db_opened(db))
        return toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be set after DB open.\n");
    if (db->i->key_compare_was_set)
        return toku_ydb_do_error(db->dbenv, EINVAL, "Key comparison function already set.\n");

    u_int32_t brt_flags;
    int rc = toku_brt_get_flags(db->i->brt, &brt_flags);
    if (rc != 0)
        return rc;

    brt_flags |= TOKU_DB_KEYCMP_BUILTIN;
    rc = toku_brt_set_flags(db->i->brt, brt_flags);
    if (rc == 0)
        db->i->key_compare_was_set = TRUE;
    return rc;
}

// Instruct db to use the default (built-in) value comparison function
// by setting the flag bits in the db and brt structs.
// Returns EINVAL (through toku_ydb_do_error) if the db is already open or a value comparison
// has already been chosen; otherwise returns the result of reading/updating the brt flags.
// val_compare_was_set is recorded only when the brt flags were updated successfully.
static int
db_use_builtin_val_cmp(DB *db) {
    HANDLE_PANICKED_DB(db);
    if (db_opened(db))
        return toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be set after DB open.\n");
    if (db->i->val_compare_was_set)
        return toku_ydb_do_error(db->dbenv, EINVAL, "Val comparison function already set.\n");

    u_int32_t brt_flags;
    int rc = toku_brt_get_flags(db->i->brt, &brt_flags);
    if (rc != 0)
        return rc;

    brt_flags |= TOKU_DB_VALCMP_BUILTIN;
    rc = toku_brt_set_flags(db->i->brt, brt_flags);
    if (rc == 0)
        db->i->val_compare_was_set = TRUE;
    return rc;
}


566 567
// Key under which upgrade_env() looks up the stored version in the persistent environment dictionary.
static const char * curr_env_ver_key = "current_version";
// NOTE(review): presumably the key for the version the environment was first created with;
// its use is not visible in this part of the file -- confirm.
static const char * orig_env_ver_key = "original_version";
568

569 570 571 572

// requires: persistent environment dictionary is already open
// Reads the value stored under curr_env_ver_key and compares it with BRT_LAYOUT_VERSION.
// Asserts that the lookup succeeds.
// Returns 0 if the versions match, TOKUDB_DICTIONARY_TOO_NEW otherwise.
static int
upgrade_env(DB_ENV * env, DB_TXN * txn) {
    DBT key, val;

    toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
    toku_init_dbt(&val);
    int r = toku_db_get(env->i->persistent_environment, txn, &key, &val, 0);
    assert(r == 0);

    // val.data is an untyped byte buffer: copy the bytes out instead of dereferencing it as a
    // uint32_t*, so the read does not depend on the buffer's alignment or effective type.
    // NOTE(review): assumes the stored value is at least 4 bytes long (the original read made
    // the same assumption) -- confirm against the code that writes this key.
    uint32_t raw_version;
    memcpy(&raw_version, val.data, sizeof raw_version);

    uint64_t stored_env_version = toku_dtoh32(raw_version);
    if (stored_env_version != BRT_LAYOUT_VERSION)
        r = TOKUDB_DICTIONARY_TOO_NEW;
    return r;
}

587 588 589
// return 0 if log exists or ENOENT if log does not exist
// (the result of tokudb_recover_log_exists() on the real log dir is passed through unchanged)
static int
ydb_recover_log_exists(DB_ENV *env) {
    return tokudb_recover_log_exists(env->i->real_log_dir);
}


// Validate that all required files are present, no side effects.
596 597
// Return 0 if all is well, ENOENT if some files are present but at least one is missing, 
// other non-zero value if some other error occurs.
598 599 600
// Set *valid_newenv if creating a new environment (all files missing).
// (Note, if special dictionaries exist, then they were created transactionally and log should exist.)
static int 
601
validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) {
602
    int r;
603
    BOOL expect_newenv = FALSE;        // set true if we expect to create a new env
604 605
    toku_struct_stat buf;
    char* path = NULL;
606

607
    // Test for persistent environment
Yoni Fogel's avatar
Yoni Fogel committed
608
    path = toku_construct_full_name(2, env->i->dir, environmentdictionary);
609 610
    assert(path);
    r = toku_stat(path, &buf);
611
    int stat_errno = errno;
612
    toku_free(path);
613 614 615
    if (r == 0) {
	expect_newenv = FALSE;  // persistent info exists
    }
616
    else if (stat_errno == ENOENT) {
617 618
	expect_newenv = TRUE;
	r = 0;
619 620
    }
    else {
621 622
	r = toku_ydb_do_error(env, errno, "Unable to access persistent environment\n");
	assert(r);
623
    }
624

625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647
    // Test for rollback cachefile
    if (r == 0 && need_rollback_cachefile) {
	path = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME);
	assert(path);
	r = toku_stat(path, &buf);
	stat_errno = errno;
	toku_free(path);
	if (r == 0) {  
	    if (expect_newenv)  // rollback cachefile exists, but persistent env is missing
		r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n");
	}
	else if (stat_errno == ENOENT) {
	    if (!expect_newenv)  // rollback cachefile is missing but persistent env exists
		r = toku_ydb_do_error(env, ENOENT, "rollback cachefile directory is missing\n");
	    else 
		r = 0;           // both rollback cachefile and persistent env are missing
	}
	else {
	    r = toku_ydb_do_error(env, errno, "Unable to access rollback cachefile\n");
	    assert(r);
	}
    }

648 649
    // Test for fileops directory
    if (r == 0) {
Yoni Fogel's avatar
Yoni Fogel committed
650
	path = toku_construct_full_name(2, env->i->dir, fileopsdirectory);
651 652
	assert(path);
	r = toku_stat(path, &buf);
653
	stat_errno = errno;
654 655 656 657 658
	toku_free(path);
	if (r == 0) {  
	    if (expect_newenv)  // fileops directory exists, but persistent env is missing
		r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n");
	}
659
	else if (stat_errno == ENOENT) {
660 661 662 663 664 665 666 667 668 669 670 671
	    if (!expect_newenv)  // fileops directory is missing but persistent env exists
		r = toku_ydb_do_error(env, ENOENT, "Fileops directory is missing\n");
	    else 
		r = 0;           // both fileops directory and persistent env are missing
	}
	else {
	    r = toku_ydb_do_error(env, errno, "Unable to access fileops directory\n");
	    assert(r);
	}
    }

    // Test for recovery log
672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697
    if ((r == 0) && (env->i->open_flags & DB_INIT_LOG)) {
	// if using transactions, test for existence of log
	r = ydb_recover_log_exists(env);  // return 0 or ENOENT
	if (expect_newenv && (r != ENOENT))
	    r = toku_ydb_do_error(env, ENOENT, "Persistent environment information is missing (but log exists)\n");
	else if (!expect_newenv && r == ENOENT)
	    r = toku_ydb_do_error(env, ENOENT, "Recovery log is missing (persistent environment information is present)\n");
	else
	    r = 0;
    }

    if (r == 0)
	*valid_newenv = expect_newenv;
    else 
	*valid_newenv = FALSE;
    return r;
}



// Open the environment.
// If this is a new environment, then create the necessary files.
// Return 0 on success, ENOENT if any of the expected necessary files are missing.
// (The set of necessary files is defined in the function validate_env() above.)
//
// Parameters:
//   env   - environment handle; must not already be open (EINVAL otherwise).
//   home  - environment directory; NULL means ".".
//   flags - DB_* open flags.  DB_PRIVATE and DB_CREATE are both required, and
//           DB_INIT_LOG requires DB_INIT_TXN.
//   mode  - stored in env->i->open_mode and passed to db_open_iname().
//
// The caller is expected to hold the ydb lock: this function releases and
// re-acquires it around the final checkpoint.
static int 
toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
    HANDLE_PANICKED_ENV(env);
    int r;
    BOOL newenv;  // true iff creating a new environment
    u_int32_t unused_flags=flags;

    if (env_opened(env)) {
        return toku_ydb_do_error(env, EINVAL, "The environment is already open\n");
    }

    HANDLE_EXTRA_FLAGS(env, flags, 
                       DB_CREATE|DB_PRIVATE|DB_INIT_LOG|DB_INIT_TXN|DB_RECOVER|DB_INIT_MPOOL|DB_INIT_LOCK|DB_THREAD);

    // DB_CREATE means create if env does not exist, and Tokudb requires it because
    // Tokudb requires DB_PRIVATE.
    if ((flags & DB_PRIVATE) && !(flags & DB_CREATE)) {
        return toku_ydb_do_error(env, ENOENT, "DB_PRIVATE requires DB_CREATE (seems gratuitous to us, but that's BDB's behavior\n");
    }

    if (!(flags & DB_PRIVATE)) {
        return toku_ydb_do_error(env, ENOENT, "TokuDB requires DB_PRIVATE\n");
    }

    if ((flags & DB_INIT_LOG) && !(flags & DB_INIT_TXN)) 
        return toku_ydb_do_error(env, EINVAL, "TokuDB requires transactions for logging\n");

    if (!home) home = ".";

    // Verify that the home exists.
    {
        BOOL made_new_home = FALSE;
        char* new_home = NULL;
        toku_struct_stat buf;
        int stat_errno;
        size_t home_len = strlen(home);
        if (home_len > 1 && home[home_len-1] == '\\') {
            // stat a copy of home with the trailing backslash removed
            new_home = toku_malloc(home_len);
            if (new_home == NULL) {
                return toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
            }
            memcpy(new_home, home, home_len);
            new_home[home_len - 1] = 0;
            made_new_home = TRUE;
        }
        r = toku_stat(made_new_home? new_home : home, &buf);
        stat_errno = errno;   // save before toku_free() gets a chance to overwrite errno
        if (made_new_home) {
            toku_free(new_home);
        }
        if (r!=0) {
            return toku_ydb_do_error(env, stat_errno, "Error from toku_stat(\"%s\",...)\n", home);
        }
    }
    unused_flags &= ~DB_PRIVATE;

    if (env->i->dir)
        toku_free(env->i->dir);
    env->i->dir = toku_strdup(home);
    if (env->i->dir == 0) {
        return toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
    }
    // Error exit used by the gotos below: forget the home directory and return r.
    if (0) {
        died1:
        toku_free(env->i->dir);
        env->i->dir = NULL;
        return r;
    }
    env->i->open_flags = flags;
    env->i->open_mode = mode;

    env_setup_real_data_dir(env);
    env_setup_real_log_dir(env);

    BOOL need_rollback_cachefile = FALSE;
    if (flags & (DB_INIT_TXN | DB_INIT_LOG)) {
        need_rollback_cachefile = TRUE;
    }

    r = validate_env(env, &newenv, need_rollback_cachefile);  // make sure that environment is either new or complete
    if (r != 0) return r;

    unused_flags &= ~DB_INIT_TXN & ~DB_INIT_LOG;

    // do recovery only if there exists a log and recovery is requested
    // otherwise, a log is created when the logger is opened later
    if (!newenv) {
        if (flags & DB_INIT_LOG) {
            // the log does exist
            if (flags & DB_RECOVER) {
                r = ydb_do_recovery(env);
                if (r != 0) return r;
            } else {
                // the log is required to have clean shutdown if recovery is not requested
                r = needs_recovery(env);
                if (r != 0) return r;
            }
        }
    }
    
    toku_loader_cleanup_temp_files(env);

    if (flags & (DB_INIT_TXN | DB_INIT_LOG)) {
        assert(env->i->logger);
        toku_logger_write_log_files(env->i->logger, (BOOL)((flags & DB_INIT_LOG) != 0));
        r = toku_logger_open(env->i->real_log_dir, env->i->logger);
        if (r!=0) {
            toku_ydb_do_error(env, r, "Could not open logger\n");
        died2:
            toku_logger_close(&env->i->logger);
            goto died1;
        }
    } else {
        r = toku_logger_close(&env->i->logger); // if no logging system, then kill the logger
        assert(r==0);
    }

    unused_flags &= ~DB_INIT_MPOOL; // we always init an mpool.
    unused_flags &= ~DB_CREATE;     // we always do DB_CREATE
    unused_flags &= ~DB_INIT_LOCK;  // we check this later (e.g. in db->open)
    unused_flags &= ~DB_RECOVER;

// This is probably correct, but it will be pain...
//    if ((flags & DB_THREAD)==0) {
//	return toku_ydb_do_error(env, EINVAL, "TokuDB requires DB_THREAD");
//    }
    unused_flags &= ~DB_THREAD;

    if (unused_flags!=0) {
        return toku_ydb_do_error(env, EINVAL, "Extra flags not understood by tokudb: %u\n", unused_flags);
    }

    r = toku_brt_create_cachetable(&env->i->cachetable, env->i->cachetable_size, ZERO_LSN, env->i->logger);
    if (r!=0) goto died2;

    toku_cachetable_set_env_dir(env->i->cachetable, env->i->dir);

    int using_txns = env->i->open_flags & DB_INIT_TXN;
    if (env->i->logger) {
        assert (using_txns);
        toku_logger_set_cachetable(env->i->logger, env->i->cachetable);
        toku_logger_set_remove_finalize_callback(env->i->logger, finalize_file_removal, env->i->ltm);
        r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, newenv);
        assert(r==0);
    }

    // The two dictionaries below are opened inside one transaction when transactions are in use.
    DB_TXN *txn=NULL;
    if (using_txns) {
        r = toku_txn_begin(env, 0, &txn, 0, 1);
        assert(r==0);
    }

    {
        r = toku_db_create(&env->i->persistent_environment, env, 0);
        assert(r==0);
        r = db_use_builtin_key_cmp(env->i->persistent_environment);
        assert(r==0);
        r = db_use_builtin_val_cmp(env->i->persistent_environment);
        assert(r==0);
        r = db_open_iname(env->i->persistent_environment, txn, environmentdictionary, DB_CREATE, mode);
        assert(r==0);
        if (newenv) {
            // create new persistent_environment: record the layout version as both
            // the original and the current environment version
            DBT key, val;
            const uint32_t environment_version = toku_htod32(BRT_LAYOUT_VERSION);
            toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
            toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
            r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
            assert(r==0);
            toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
            toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
            r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
            assert(r==0);
        }
        else {
            r = upgrade_env(env, txn);
            assert(r==0);   // previously this result was overwritten without being examined
        }
    }
    {
        r = toku_db_create(&env->i->directory, env, 0);
        assert(r==0);
        r = db_use_builtin_key_cmp(env->i->directory);
        assert(r==0);
        r = db_use_builtin_val_cmp(env->i->directory);
        assert(r==0);
        r = db_open_iname(env->i->directory, txn, fileopsdirectory, DB_CREATE, mode);
        assert(r==0);
    }
    if (using_txns) {
        r = toku_txn_commit(txn, 0, NULL, NULL);
        assert(r==0);
    }
    // The checkpoint is taken with the ydb lock released.
    toku_ydb_unlock();
    r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL);
    assert(r==0);
    toku_ydb_lock();
    env_fs_poller(env);          // get the file system state at startup
    env_fs_init_minicron(env); 
    return 0;
}
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
892

893

Rich Prohaska's avatar
Rich Prohaska committed
894
// Close the environment and release everything it owns.
//
// Refuses to close (and returns an error) if the environment is panicked or
// still has open transactions or open DBs.  Otherwise closes the persistent
// environment and directory dictionaries, checkpoints and shuts down the
// logger, closes the cachetable, and frees env itself.
//
// flags may be 0 or DB_CLOSE_DONT_TRIM_LOG; any other value yields EINVAL
// (reported only after the environment has been torn down).
//
// On any failure control reaches panic_and_quit_early, which marks the
// environment as panicked with the error code and returns without freeing it.
//
// The caller is expected to hold the ydb lock; it is released around the
// minicron shutdown and checkpoints and is held again on every return path.
static int toku_env_close(DB_ENV * env, u_int32_t flags) {
    int r = 0;
    int ydb_lock_released = 0;   // nonzero while we have dropped the ydb lock

    // if panicked, or if any open transactions, or any open dbs, then do nothing.

    if (toku_env_is_panicked(env)) goto panic_and_quit_early;
    if (!toku_list_empty(&env->i->open_txns)) {
        r = toku_ydb_do_error(env, EINVAL, "Cannot close environment due to open transactions\n");
        goto panic_and_quit_early;
    }
    if (toku_omt_size(env->i->open_dbs) > 0) {
        r = toku_ydb_do_error(env, EINVAL, "Cannot close environment due to open DBs\n");
        goto panic_and_quit_early;
    }
    {
        if (env->i->persistent_environment) {
            r = toku_db_close(env->i->persistent_environment, 0);
            if (r) {
                toku_ydb_do_error(env, r, "Cannot close persistent environment dictionary (DB->close error)\n");
                goto panic_and_quit_early;
            }
        }
        if (env->i->directory) {
            r = toku_db_close(env->i->directory, 0);
            if (r) {
                toku_ydb_do_error(env, r, "Cannot close Directory dictionary (DB->close error)\n");
                goto panic_and_quit_early;
            }
        }
    }
    if (env->i->cachetable) {
        toku_ydb_unlock();  // ydb lock must not be held when shutting down minicron
        ydb_lock_released = 1;
        toku_cachetable_minicron_shutdown(env->i->cachetable);
        if (env->i->logger) {
            // Bitwise test: only the DB_CLOSE_DONT_TRIM_LOG bit disables trimming.
            if (flags & DB_CLOSE_DONT_TRIM_LOG) {
                toku_logger_trim_log_files(env->i->logger, FALSE);
            }
            r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL);
            if (r) {
                toku_ydb_do_error(env, r, "Cannot close environment (error during checkpoint)\n");
                goto panic_and_quit_early;
            }
            r = toku_logger_close_rollback(env->i->logger, FALSE);
            if (r) {
                toku_ydb_do_error(env, r, "Cannot close environment (error during closing rollback cachefile)\n");
                goto panic_and_quit_early;
            }
            //Do a second checkpoint now that the rollback cachefile is closed.
            r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL);
            if (r) {
                toku_ydb_do_error(env, r, "Cannot close environment (error during checkpoint)\n");
                goto panic_and_quit_early;
            }
            r = toku_logger_shutdown(env->i->logger); 
            if (r) {
                toku_ydb_do_error(env, r, "Cannot close environment (error during logger shutdown)\n");
                goto panic_and_quit_early;
            }
        }
        toku_ydb_lock();
        ydb_lock_released = 0;
        r=toku_cachetable_close(&env->i->cachetable);
        if (r) {
            toku_ydb_do_error(env, r, "Cannot close environment (cachetable close error)\n");
            goto panic_and_quit_early;
        }
    }
    if (env->i->logger) {
        r=toku_logger_close(&env->i->logger);
        if (r) {
            env->i->logger = NULL;
            toku_ydb_do_error(env, r, "Cannot close environment (logger close error)\n");
            goto panic_and_quit_early;
        }
    }
    // Even if nothing else went wrong, but we were panicked, then raise an error.
    // But if something else went wrong then raise that error (above)
    if (toku_env_is_panicked(env))
        goto panic_and_quit_early;
    else
        assert(env->i->panic_string==0);

    env_fs_destroy(env);
    if (env->i->data_dir)
        toku_free(env->i->data_dir);
    if (env->i->lg_dir)
        toku_free(env->i->lg_dir);
    if (env->i->tmp_dir)
        toku_free(env->i->tmp_dir);
    if (env->i->real_data_dir)
        toku_free(env->i->real_data_dir);
    if (env->i->real_log_dir)
        toku_free(env->i->real_log_dir);
    if (env->i->open_dbs)
        toku_omt_destroy(&env->i->open_dbs);
    if (env->i->dir)
        toku_free(env->i->dir);
    toku_ltm_close(env->i->ltm);
    toku_free(env->i);
    env->i = NULL;
    toku_free(env);
    env = NULL;
    ydb_unref();
    if ((flags!=0) && !(flags==DB_CLOSE_DONT_TRIM_LOG))
        r = EINVAL;
    return r;

panic_and_quit_early:
    // An error between toku_ydb_unlock() and toku_ydb_lock() above arrives here
    // without the lock; take it back so the caller's unlock stays balanced.
    if (ydb_lock_released) {
        toku_ydb_lock();
        ydb_lock_released = 0;
    }
    //r is the panic error
    if (toku_env_is_panicked(env)) {
        char *panic_string = env->i->panic_string;
        r = toku_ydb_do_error(env, toku_env_is_panicked(env), "Cannot close environment due to previous error: %s\n", panic_string);
    }
    else
        env->i->is_panicked = r;
    return r;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1010

Rich Prohaska's avatar
Rich Prohaska committed
1011
// Forwards the request to toku_logger_log_archive() on this environment's
// logger and returns its result unchanged.
static int toku_env_log_archive(DB_ENV * env, char **list[], u_int32_t flags) {
    int result = toku_logger_log_archive(env->i->logger, list, flags);
    return result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1014

1015
// Flush the log.  The lsn argument is ignored: the whole log is fsynced.
// Returns the result of toku_logger_fsync().
static int toku_env_log_flush(DB_ENV * env, const DB_LSN * lsn __attribute__((__unused__))) {
    HANDLE_PANICKED_ENV(env);
    // We just flush everything.  MySQL uses lsn==0 which means flush everything.
    // For anyone else using the log, it is correct to flush too much, so we are OK.
    int result = toku_logger_fsync(env->i->logger);
    return result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1020

Rich Prohaska's avatar
Rich Prohaska committed
1021
// Record the requested cachetable size (gbytes * 2^30 + bytes) in the environment.
// Returns EINVAL if ncache is not 1 or if the total does not fit in an
// unsigned long; returns 0 otherwise.
static int toku_env_set_cachesize(DB_ENV * env, u_int32_t gbytes, u_int32_t bytes, int ncache) {
    HANDLE_PANICKED_ENV(env);
    if (ncache != 1) {
        return EINVAL;
    }
    u_int64_t requested = (((u_int64_t) gbytes) << 30) + bytes;
    unsigned long narrowed = requested;
    if (narrowed < requested) {
        // the value was truncated by the conversion to unsigned long
        return EINVAL;
    }
    env->i->cachetable_size = narrowed;
    return 0;
}

1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
// Forward declaration so the wrapper below can call it.
static int toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags);

// Runs toku_env_dbremove() while holding the multi-operation client lock and
// the ydb lock (taken in that order, released in reverse) and returns its result.
static int
locked_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags) {
    int result;
    toku_multi_operation_client_lock();   //Cannot begin checkpoint
    toku_ydb_lock();
    result = toku_env_dbremove(env, txn, fname, dbname, flags);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); //Can now begin checkpoint
    return result;
}

// Forward declaration so the wrapper below can call it.
static int toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags);

// Runs toku_env_dbrename() while holding the multi-operation client lock and
// the ydb lock (taken in that order, released in reverse) and returns its result.
static int
locked_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) {
    int result;
    toku_multi_operation_client_lock();   //Cannot begin checkpoint
    toku_ydb_lock();
    result = toku_env_dbrename(env, txn, fname, dbname, newname, flags);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); //Can now begin checkpoint
    return result;
}


Rich Prohaska's avatar
Rich Prohaska committed
1058 1059
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR >= 3

Rich Prohaska's avatar
Rich Prohaska committed
1060
// Report the configured cachetable size, split into whole 2^30-byte units
// (*gbytes) and the remainder (*bytes).  *ncache is always set to 1.
// Returns 0.
static int toku_env_get_cachesize(DB_ENV * env, u_int32_t *gbytes, u_int32_t *bytes, int *ncache) {
    HANDLE_PANICKED_ENV(env);
    *gbytes = env->i->cachetable_size >> 30;               // high part
    *bytes  = env->i->cachetable_size & ((1 << 30) - 1);   // low 30 bits
    *ncache = 1;
    return 0;
}

Rich Prohaska's avatar
Rich Prohaska committed
1068
// Calls toku_env_get_cachesize() with the ydb lock held and returns its result.
static int locked_env_get_cachesize(DB_ENV *env, u_int32_t *gbytes, u_int32_t *bytes, int *ncache) {
    int result;
    toku_ydb_lock();
    result = toku_env_get_cachesize(env, gbytes, bytes, ncache);
    toku_ydb_unlock();
    return result;
}
Rich Prohaska's avatar
Rich Prohaska committed
1071 1072
#endif

Rich Prohaska's avatar
Rich Prohaska committed
1073
// Set the data directory of an environment that has not been opened yet.
// The directory may be set only once.
//
// Returns 0 on success; EINVAL if the environment is already open, if dir is
// NULL, or if a data dir was already set; ENOMEM if the copy of dir cannot be
// allocated.  Every failure is reported through toku_ydb_do_error().
static int toku_env_set_data_dir(DB_ENV * env, const char *dir) {
    HANDLE_PANICKED_ENV(env);
    int r;

    if (env_opened(env)) {
        r = toku_ydb_do_error(env, EINVAL, "You cannot set the data dir after opening the env\n");
    }
    else if (!dir) {
        // A NULL dir used to be reported with the "after opening" message above.
        r = toku_ydb_do_error(env, EINVAL, "Data dir must be non-null\n");
    }
    else if (env->i->data_dir) {
        r = toku_ydb_do_error(env, EINVAL, "You cannot set the data dir more than once.\n");
    }
    else {
        env->i->data_dir = toku_strdup(dir);
        if (env->i->data_dir==NULL) {
            assert(errno == ENOMEM);
            r = toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
        }
        else r = 0;
    }
    return r;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1092

Rich Prohaska's avatar
Rich Prohaska committed
1093
// Store the error callback in the environment (env->i->errcall).
static void toku_env_set_errcall(DB_ENV *env, toku_env_errcall_t errcall) {
    env->i->errcall = errcall;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1096

Rich Prohaska's avatar
Rich Prohaska committed
1097
// Store the error output stream in the environment (env->i->errfile).
static void toku_env_set_errfile(DB_ENV *env, FILE *errfile) {
    env->i->errfile = errfile;
}

Rich Prohaska's avatar
Rich Prohaska committed
1101
// Store the error-message prefix in the environment (env->i->errpfx).
// The pointer itself is stored; the string is not copied.
static void toku_env_set_errpfx(DB_ENV *env, const char *errpfx) {
    env->i->errpfx = errpfx;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1104

Rich Prohaska's avatar
Rich Prohaska committed
1105
// Turn environment flags on (onoff != 0) or off (onoff == 0).
// Only DB_AUTO_COMMIT is acted upon.  Asking to turn ON any other flag yields
// EINVAL; asking to turn other flags OFF is accepted and has no effect.
static int toku_env_set_flags(DB_ENV * env, u_int32_t flags, int onoff) {
    HANDLE_PANICKED_ENV(env);

    // Separate the one flag we understand from everything else.
    u_int32_t change = (flags & DB_AUTO_COMMIT) ? DB_AUTO_COMMIT : 0;
    u_int32_t remaining = flags & ~DB_AUTO_COMMIT;

    if (onoff && remaining != 0) {
        return toku_ydb_do_error(env, EINVAL, "TokuDB does not (yet) support any nonzero ENV flags other than DB_AUTO_COMMIT\n");
    }
    if (onoff) {
        env->i->open_flags |= change;
    } else {
        env->i->open_flags &= ~change;
    }
    return 0;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1120

Rich Prohaska's avatar
Rich Prohaska committed
1121
// Forwards the log buffer size to toku_logger_set_lg_bsize() on this
// environment's logger and returns its result.
static int toku_env_set_lg_bsize(DB_ENV * env, u_int32_t bsize) {
    HANDLE_PANICKED_ENV(env);
    int result = toku_logger_set_lg_bsize(env->i->logger, bsize);
    return result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1125

Rich Prohaska's avatar
Rich Prohaska committed
1126
// Set (or, with dir == NULL, clear) the log directory of an unopened environment.
// Any previously set log dir is freed first.
// Returns 0 on success, EINVAL if the environment is already open, ENOMEM if
// the copy of dir cannot be allocated.
static int toku_env_set_lg_dir(DB_ENV * env, const char *dir) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return toku_ydb_do_error(env, EINVAL, "Cannot set log dir after opening the env\n");
    }

    if (env->i->lg_dir) {
        toku_free(env->i->lg_dir);
    }
    if (dir == NULL) {
        env->i->lg_dir = NULL;
        return 0;
    }
    env->i->lg_dir = toku_strdup(dir);
    if (env->i->lg_dir == NULL) {
        return toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
    }
    return 0;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1142

Rich Prohaska's avatar
Rich Prohaska committed
1143
// Forwards lg_max to toku_logger_set_lg_max() on this environment's logger
// and returns its result.
static int toku_env_set_lg_max(DB_ENV * env, u_int32_t lg_max) {
    HANDLE_PANICKED_ENV(env);
    int result = toku_logger_set_lg_max(env->i->logger, lg_max);
    return result;
}

// Asks toku_logger_get_lg_max() to fill in *lg_maxp for this environment's
// logger and returns its result.
static int toku_env_get_lg_max(DB_ENV * env, u_int32_t *lg_maxp) {
    HANDLE_PANICKED_ENV(env);
    int result = toku_logger_get_lg_max(env->i->logger, lg_maxp);
    return result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1152

Rich Prohaska's avatar
Rich Prohaska committed
1153
// Not supported: after the panic check this always reports EINVAL through
// toku_ydb_do_error().  The detect argument is ignored.
static int toku_env_set_lk_detect(DB_ENV * env, u_int32_t detect) {
    HANDLE_PANICKED_ENV(env);
    (void) detect;   // unused
    return toku_ydb_do_error(env, EINVAL, "TokuDB does not (yet) support set_lk_detect\n");
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1158

Yoni Fogel's avatar
Yoni Fogel committed
1159
// Set the maximum number of locks per DB on the environment's lock tree manager.
// Only allowed before the environment is opened (EINVAL afterwards).
// Otherwise returns the result of toku_ltm_set_max_locks_per_db().
static int toku_env_set_lk_max_locks(DB_ENV *dbenv, u_int32_t max) {
    HANDLE_PANICKED_ENV(dbenv);
    if (env_opened(dbenv)) {
        return EINVAL;
    }
    return toku_ltm_set_max_locks_per_db(dbenv->i->ltm, max);
}

1167
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR <= 4
Rich Prohaska's avatar
Rich Prohaska committed
1168
// Same as toku_env_set_lk_max_locks(): lk_max is passed straight through.
static int toku_env_set_lk_max(DB_ENV * env, u_int32_t lk_max) {
    int result = toku_env_set_lk_max_locks(env, lk_max);
    return result;
}
Rich Prohaska's avatar
Rich Prohaska committed
1171 1172

// Calls toku_env_set_lk_max() with the ydb lock held and returns its result.
static int locked_env_set_lk_max(DB_ENV * env, u_int32_t lk_max) {
    int result;
    toku_ydb_lock();
    result = toku_env_set_lk_max(env, lk_max);
    toku_ydb_unlock();
    return result;
}
1175
#endif
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1176

1177 1178
// Asks toku_ltm_get_max_locks_per_db() to fill in *lk_maxp for the
// environment's lock tree manager and returns its result.
static int toku_env_get_lk_max_locks(DB_ENV *dbenv, u_int32_t *lk_maxp) {
    HANDLE_PANICKED_ENV(dbenv);
    int result = toku_ltm_get_max_locks_per_db(dbenv->i->ltm, lk_maxp);
    return result;
}

// Calls toku_env_set_lk_max_locks() with the ydb lock held and returns its result.
static int locked_env_set_lk_max_locks(DB_ENV *dbenv, u_int32_t max) {
    int result;
    toku_ydb_lock();
    result = toku_env_set_lk_max_locks(dbenv, max);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_get_lk_max_locks() with the ydb lock held and returns its result.
// Marked unused so that a build which never references it does not warn.
static int __attribute__((unused)) locked_env_get_lk_max_locks(DB_ENV *dbenv, u_int32_t *lk_maxp) {
    int result;
    toku_ydb_lock();
    result = toku_env_get_lk_max_locks(dbenv, lk_maxp);
    toku_ydb_unlock();
    return result;
}

Yoni Fogel's avatar
Yoni Fogel committed
1190
//void toku__env_set_noticecall (DB_ENV *env, void (*noticecall)(DB_ENV *, db_notices)) {
Bradley C. Kuszmaul's avatar
Fixup  
Bradley C. Kuszmaul committed
1191 1192
//    env->i->noticecall = noticecall;
//}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1193

Rich Prohaska's avatar
Rich Prohaska committed
1194
// Set the temporary directory of an environment that has not been opened yet.
// Any previously set tmp dir is freed and replaced by a copy of tmp_dir.
//
// Returns 0 on success; EINVAL if the environment is already open or tmp_dir
// is NULL; ENOMEM if the copy cannot be allocated.  Every failure is reported
// through toku_ydb_do_error().
static int toku_env_set_tmp_dir(DB_ENV * env, const char *tmp_dir) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return toku_ydb_do_error(env, EINVAL, "Cannot set the tmp dir after opening an env\n");
    }
    if (!tmp_dir) {
        return toku_ydb_do_error(env, EINVAL, "Tmp dir must be non-null\n");
    }
    if (env->i->tmp_dir)
        toku_free(env->i->tmp_dir);
    env->i->tmp_dir = toku_strdup(tmp_dir);
    if (!env->i->tmp_dir) {
        // report the failure the same way toku_env_set_lg_dir() does
        return toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
    }
    return 0;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1207

Rich Prohaska's avatar
Rich Prohaska committed
1208
// Stub: both arguments are ignored and, after the panic check, the function
// always returns 1.
// NOTE(review): 1 is neither 0 nor a named errno constant — confirm this is intended.
static int toku_env_set_verbose(DB_ENV * env, u_int32_t which, int onoff) {
    HANDLE_PANICKED_ENV(env);
    (void) which;   // unused
    (void) onoff;   // unused
    return 1;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1213

1214
// For test purposes only.
// These callbacks are never used in production code, only as a way to test the system
// (for example, by causing crashes at predictable times).
// Each callback has a companion "extra" pointer; toku_env_txn_checkpoint() hands
// both pairs to toku_checkpoint().
static void (*checkpoint_callback_f)(void*)  = NULL;
static void * checkpoint_callback_extra      = NULL;
static void (*checkpoint_callback2_f)(void*) = NULL;
static void * checkpoint_callback2_extra     = NULL;
1221

1222
// Take a checkpoint, passing along the test-only checkpoint callbacks.
// kbyte, min and flags are ignored.
// On failure the environment is marked panicked with the error code, the panic
// string is set to "checkpoint error", the error is reported, and the code is
// returned.  Returns 0 on success.
static int toku_env_txn_checkpoint(DB_ENV * env, u_int32_t kbyte __attribute__((__unused__)), u_int32_t min __attribute__((__unused__)), u_int32_t flags __attribute__((__unused__))) {
    int r = toku_checkpoint(env->i->cachetable, env->i->logger,
                            checkpoint_callback_f,  checkpoint_callback_extra,
                            checkpoint_callback2_f, checkpoint_callback2_extra);
    if (r == 0) {
        return 0;
    }
    // Panicking the whole environment may be overkill, but I'm not sure what else to do.
    env->i->is_panicked = r;
    env->i->panic_string = toku_strdup("checkpoint error");
    toku_ydb_do_error(env, r, "Checkpoint\n");
    return r;
}

Rich Prohaska's avatar
Rich Prohaska committed
1234
// Stub: both arguments are ignored and, after the panic check, the function
// always returns 1.
// NOTE(review): 1 is neither 0 nor a named errno constant — confirm this is intended.
static int toku_env_txn_stat(DB_ENV * env, DB_TXN_STAT ** statp, u_int32_t flags) {
    HANDLE_PANICKED_ENV(env);
    (void) statp;   // unused
    (void) flags;   // unused
    return 1;
}

1240
#if 0
1241
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR == 1
1242
static void toku_default_errcall(const char *errpfx, char *msg) {
1243 1244
    fprintf(stderr, "YDB: %s: %s", errpfx, msg);
}
1245
#else
1246
static void toku_default_errcall(const DB_ENV *env, const char *errpfx, const char *msg) {
1247
    env = env;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1248 1249
    fprintf(stderr, "YDB: %s: %s", errpfx, msg);
}
1250
#endif
1251
#endif
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1252

Rich Prohaska's avatar
Rich Prohaska committed
1253
// Calls toku_env_open() with the ydb lock held and returns its result.
static int locked_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
    int result;
    toku_ydb_lock();
    result = toku_env_open(env, home, flags, mode);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_close() with the ydb lock held and returns its result.
static int locked_env_close(DB_ENV * env, u_int32_t flags) {
    int result;
    toku_ydb_lock();
    result = toku_env_close(env, flags);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_log_archive() with the ydb lock held and returns its result.
static int locked_env_log_archive(DB_ENV * env, char **list[], u_int32_t flags) {
    int result;
    toku_ydb_lock();
    result = toku_env_log_archive(env, list, flags);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_log_flush() with the ydb lock held and returns its result.
static int locked_env_log_flush(DB_ENV * env, const DB_LSN * lsn) {
    int result;
    toku_ydb_lock();
    result = toku_env_log_flush(env, lsn);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_set_cachesize() with the ydb lock held and returns its result.
static int locked_env_set_cachesize(DB_ENV *env, u_int32_t gbytes, u_int32_t bytes, int ncache) {
    int result;
    toku_ydb_lock();
    result = toku_env_set_cachesize(env, gbytes, bytes, ncache);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_set_data_dir() with the ydb lock held and returns its result.
static int locked_env_set_data_dir(DB_ENV * env, const char *dir) {
    int result;
    toku_ydb_lock();
    result = toku_env_set_data_dir(env, dir);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_set_flags() with the ydb lock held and returns its result.
static int locked_env_set_flags(DB_ENV * env, u_int32_t flags, int onoff) {
    int result;
    toku_ydb_lock();
    result = toku_env_set_flags(env, flags, onoff);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_set_lg_bsize() with the ydb lock held and returns its result.
static int locked_env_set_lg_bsize(DB_ENV * env, u_int32_t bsize) {
    int result;
    toku_ydb_lock();
    result = toku_env_set_lg_bsize(env, bsize);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_set_lg_dir() with the ydb lock held and returns its result.
static int locked_env_set_lg_dir(DB_ENV * env, const char *dir) {
    int result;
    toku_ydb_lock();
    result = toku_env_set_lg_dir(env, dir);
    toku_ydb_unlock();
    return result;
}

// Calls toku_env_set_lg_max() while holding the ydb lock and returns its result.
static int locked_env_set_lg_max(DB_ENV * env, u_int32_t lg_max) {
    int rc;
    toku_ydb_lock();
    rc = toku_env_set_lg_max(env, lg_max);
    toku_ydb_unlock();
    return rc;
}

1293 1294 1295 1296
// Calls toku_env_get_lg_max() while holding the ydb lock and returns its result.
static int locked_env_get_lg_max(DB_ENV * env, u_int32_t *lg_maxp) {
    int rc;
    toku_ydb_lock();
    rc = toku_env_get_lg_max(env, lg_maxp);
    toku_ydb_unlock();
    return rc;
}

Rich Prohaska's avatar
Rich Prohaska committed
1297
// Calls toku_env_set_lk_detect() while holding the ydb lock and returns its result.
static int locked_env_set_lk_detect(DB_ENV * env, u_int32_t detect) {
    int rc;
    toku_ydb_lock();
    rc = toku_env_set_lk_detect(env, detect);
    toku_ydb_unlock();
    return rc;
}

// Calls toku_env_set_tmp_dir() while holding the ydb lock and returns its result.
static int locked_env_set_tmp_dir(DB_ENV * env, const char *tmp_dir) {
    int rc;
    toku_ydb_lock();
    rc = toku_env_set_tmp_dir(env, tmp_dir);
    toku_ydb_unlock();
    return rc;
}

// Calls toku_env_set_verbose() while holding the ydb lock and returns its result.
static int locked_env_set_verbose(DB_ENV * env, u_int32_t which, int onoff) {
    int rc;
    toku_ydb_lock();
    rc = toku_env_set_verbose(env, which, onoff);
    toku_ydb_unlock();
    return rc;
}

// Calls toku_env_txn_stat() while holding the ydb lock and returns its result.
static int locked_env_txn_stat(DB_ENV * env, DB_TXN_STAT ** statp, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = toku_env_txn_stat(env, statp, flags);
    toku_ydb_unlock();
    return rc;
}

1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342
// Set the checkpoint period of the environment's cachetable.
// Returns EINVAL if the environment has not been opened; otherwise returns
// whatever toku_set_checkpoint_period() returns.
// HANDLE_PANICKED_ENV (a macro defined elsewhere) runs before any other check.
static int
env_checkpointing_set_period(DB_ENV * env, u_int32_t seconds) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    return toku_set_checkpoint_period(env->i->cachetable, seconds);
}

// Calls env_checkpointing_set_period() while holding the ydb lock and returns its result.
static int
locked_env_checkpointing_set_period(DB_ENV * env, u_int32_t seconds) {
    int rc;
    toku_ydb_lock();
    rc = env_checkpointing_set_period(env, seconds);
    toku_ydb_unlock();
    return rc;
}

// Read the checkpoint period of the environment's cachetable into *seconds.
// Returns EINVAL (leaving *seconds untouched) if the environment has not been
// opened, 0 otherwise.
// HANDLE_PANICKED_ENV (a macro defined elsewhere) runs before any other check.
static int
env_checkpointing_get_period(DB_ENV * env, u_int32_t *seconds) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    *seconds = toku_get_checkpoint_period(env->i->cachetable);
    return 0;
}

// Calls env_checkpointing_get_period() while holding the ydb lock and returns its result.
static int
locked_env_checkpointing_get_period(DB_ENV * env, u_int32_t *seconds) {
    int rc;
    toku_ydb_lock();
    rc = env_checkpointing_get_period(env, seconds);
    toku_ydb_unlock();
    return rc;
}

1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416
// Take the checkpoint-safe client lock (toku_checkpoint_safe_client_lock()).
// Returns EINVAL without taking the lock if the environment has not been
// opened, 0 otherwise.  The matching release is env_checkpointing_resume().
static int
env_checkpointing_postpone(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_checkpoint_safe_client_lock();
    return 0;
}

// Release the checkpoint-safe client lock (toku_checkpoint_safe_client_unlock()).
// Returns EINVAL without touching the lock if the environment has not been
// opened, 0 otherwise.
static int
env_checkpointing_resume(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_checkpoint_safe_client_unlock();
    return 0;
}

// Take the multi-operation client lock (toku_multi_operation_client_lock()).
// Returns EINVAL without taking the lock if the environment has not been
// opened, 0 otherwise.  Paired with env_checkpointing_end_atomic_operation().
static int
env_checkpointing_begin_atomic_operation(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_multi_operation_client_lock();
    return 0;
}

// Release the multi-operation client lock (toku_multi_operation_client_unlock()).
// Returns EINVAL without touching the lock if the environment has not been
// opened, 0 otherwise.
static int
env_checkpointing_end_atomic_operation(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_multi_operation_client_unlock();
    return 0;
}

// Record dup_compare in the environment (env->i->dup_compare).
// Only allowed before the environment is opened: returns EINVAL if it is
// already open, 0 once the function pointer has been stored.
static int
env_set_default_dup_compare(DB_ENV * env, int (*dup_compare) (DB *, const DBT *, const DBT *)) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return EINVAL;
    }
    env->i->dup_compare = dup_compare;
    return 0;
}

// Calls env_set_default_dup_compare() while holding the ydb lock and returns its result.
static int
locked_env_set_default_dup_compare(DB_ENV * env, int (*dup_compare) (DB *, const DBT *, const DBT *)) {
    int rc;
    toku_ydb_lock();
    rc = env_set_default_dup_compare(env, dup_compare);
    toku_ydb_unlock();
    return rc;
}

// Record bt_compare in the environment (env->i->bt_compare).
// Only allowed before the environment is opened: returns EINVAL if it is
// already open, 0 once the function pointer has been stored.
static int
env_set_default_bt_compare(DB_ENV * env, int (*bt_compare) (DB *, const DBT *, const DBT *)) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return EINVAL;
    }
    env->i->bt_compare = bt_compare;
    return 0;
}

// Calls env_set_default_bt_compare() while holding the ydb lock and returns its result.
static int
locked_env_set_default_bt_compare(DB_ENV * env, int (*bt_compare) (DB *, const DBT *, const DBT *)) {
    int rc;
    toku_ydb_lock();
    rc = env_set_default_bt_compare(env, bt_compare);
    toku_ydb_unlock();
    return rc;
}

1417
static int
1418
env_set_generate_row_callback_for_put(DB_ENV *env, generate_row_for_put_func generate_row_for_put) {
1419 1420 1421 1422
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (env_opened(env)) r = EINVAL;
    else {
1423
        env->i->generate_row_for_put = generate_row_for_put;
1424 1425 1426 1427 1428
    }
    return r;
}

static int
1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440
env_set_generate_row_callback_for_del(DB_ENV *env, generate_row_for_del_func generate_row_for_del) {
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (env_opened(env)) r = EINVAL;
    else {
        env->i->generate_row_for_del = generate_row_for_del;
    }
    return r;
}

// Calls env_set_generate_row_callback_for_put() while holding the ydb lock and returns its result.
static int
locked_env_set_generate_row_callback_for_put(DB_ENV *env, generate_row_for_put_func generate_row_for_put) {
    int rc;
    toku_ydb_lock();
    rc = env_set_generate_row_callback_for_put(env, generate_row_for_put);
    toku_ydb_unlock();
    return rc;
}

1447 1448 1449 1450 1451 1452 1453 1454 1455 1456
// Calls env_set_generate_row_callback_for_del() while holding the ydb lock and returns its result.
static int
locked_env_set_generate_row_callback_for_del(DB_ENV *env, generate_row_for_del_func generate_row_for_del) {
    int rc;
    toku_ydb_lock();
    rc = env_set_generate_row_callback_for_del(env, generate_row_for_del);
    toku_ydb_unlock();
    return rc;
}

static int env_put_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *key, const DBT *val, uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array, void *extra);
static int env_del_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *key, const DBT *val, uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array, void *extra);
1457 1458

static int
1459
locked_env_put_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *key, const DBT *val, uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array, void *extra) {
1460
    toku_ydb_lock();
1461
    int r = env_put_multiple(env, src_db, txn, key, val, num_dbs, db_array, keys, vals, flags_array, extra);
1462 1463 1464 1465 1466
    toku_ydb_unlock();
    return r;
}

static int
1467
locked_env_del_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *key, const DBT *val, uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array, void *extra) {
1468
    toku_ydb_lock();
1469
    int r = env_del_multiple(env, src_db, txn, key, val, num_dbs, db_array, keys, flags_array, extra);
1470 1471 1472 1473 1474
    toku_ydb_unlock();
    return r;
}


1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495
// Store the redzone setting in the environment (env->i->redzone).
// Only allowed before the environment is opened: returns EINVAL if it is
// already open, 0 once the value has been stored.
static int
env_set_redzone(DB_ENV *env, int redzone) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return EINVAL;
    }
    env->i->redzone = redzone;
    return 0;
}

// Calls env_set_redzone() while holding the ydb lock and returns its result.
static int
locked_env_set_redzone(DB_ENV *env, int redzone) {
    int rc;
    toku_ydb_lock();
    rc = env_set_redzone(env, redzone);
    toku_ydb_unlock();
    return rc;
}

1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511
// Format *timer into buf using ctime_r() and strip any trailing '\n' / '\r'.
//   timer: time to format.
//   buf:   caller-supplied buffer; ctime_r() requires room for at least 26 bytes.
// If ctime_r() fails (returns NULL) the buffer contents are not defined by
// ctime_r, so buf is set to the empty string instead of being scanned.
static void
format_time(const time_t *timer, char *buf) {
    if (ctime_r(timer, buf) == NULL) {
        buf[0] = '\0';
        return;
    }
    size_t len = strlen(buf);
    assert(len < 26);
    // Stop at len == 0 so a string made up only of line terminators ends up
    // empty rather than indexing buf[-1].
    while (len > 0 && (buf[len-1] == '\n' || buf[len-1] == '\r')) {
        len--;
        buf[len] = '\0';
    }
}
1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523

// Collect a snapshot of engine-wide statistics into *engstat.
//
// The ydb lock must NOT be taken around or inside this function: when the
// engine is stuck because some thread holds the ydb lock, this function is one
// of the tools for diagnosing that.  It only gathers information, so values
// garbled by a race are acceptable.
//
// Returns EINVAL if the environment has not been opened (engstat is then left
// untouched), 0 otherwise.
static int
env_get_engine_status(DB_ENV * env, ENGINE_STATUS * engstat) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }

    // Timestamps: current time and engine startup time.
    time_t now = time(NULL);
    format_time(&now, engstat->now);
    format_time(&startuptime, engstat->startuptime);

    // ydb lock scheduling statistics.
    {
        SCHEDULE_STATUS_S sched;
        toku_ydb_lock_get_status(&sched);
        engstat->ydb_lock_ctr                   = sched.ydb_lock_ctr;                   // how many times the ydb lock has been taken/released
        engstat->max_possible_sleep             = sched.max_possible_sleep;             // max possible sleep time for ydb lock scheduling (constant)
        engstat->processor_freq_mhz             = sched.processor_freq_mhz;             // clock frequency in MHz
        engstat->max_requested_sleep            = sched.max_requested_sleep;            // max sleep time requested, can be larger than max possible
        engstat->times_max_sleep_used           = sched.times_max_sleep_used;           // number of times max_possible_sleep was used to sleep
        engstat->total_sleepers                 = sched.total_sleepers;                 // total number of times a client slept for ydb lock scheduling
        engstat->total_sleep_time               = sched.total_sleep_time;               // total time spent sleeping for ydb lock scheduling
        engstat->max_waiters                    = sched.max_waiters;                    // max number of simultaneous client threads kept waiting for the ydb lock
        engstat->total_waiters                  = sched.total_waiters;                  // total number of times a client thread waited for the ydb lock
        engstat->total_clients                  = sched.total_clients;                  // total number of separate client threads that use the ydb lock
        engstat->time_ydb_lock_held_unavailable = sched.time_ydb_lock_held_unavailable; // number of times a thread migrated and the held time is unavailable
        engstat->total_time_ydb_lock_held       = sched.total_time_ydb_lock_held;       // total time client threads held the ydb lock
        engstat->max_time_ydb_lock_held         = sched.max_time_ydb_lock_held;         // max time client threads held the ydb lock
    }

    // Checkpoint period.  Uses the unlocked variant on purpose: no ydb lock
    // (it takes the minicron lock, but that is a very ephemeral low-level lock).
    env_checkpointing_get_period(env, &(engstat->checkpoint_period));

    // Checkpoint statistics.
    {
        CHECKPOINT_STATUS_S ckpt;
        toku_checkpoint_get_status(&ckpt);
        engstat->checkpoint_footprint = ckpt.footprint;
        format_time(&ckpt.time_last_checkpoint_begin_complete, engstat->checkpoint_time_begin_complete);
        format_time(&ckpt.time_last_checkpoint_begin,          engstat->checkpoint_time_begin);
        format_time(&ckpt.time_last_checkpoint_end,            engstat->checkpoint_time_end);
        engstat->checkpoint_last_lsn   = ckpt.last_lsn;
        engstat->checkpoint_count      = ckpt.checkpoint_count;
        engstat->checkpoint_count_fail = ckpt.checkpoint_count_fail;
    }

    // Transaction statistics; the logger-derived values stay 0 when the
    // environment has no logger.
    {
        TXN_STATUS_S txns;
        toku_txn_get_status(&txns);
        engstat->txn_begin  = txns.begin;
        engstat->txn_commit = txns.commit;
        engstat->txn_abort  = txns.abort;
        engstat->txn_close  = txns.close;

        uint64_t oldest_live = 0;
        uint64_t lsn_next    = 0;
        TOKULOGGER txn_logger = env->i->logger;
        if (txn_logger) {
            oldest_live = toku_logger_get_oldest_living_xid(txn_logger);
            lsn_next    = (toku_logger_get_next_lsn(txn_logger)).lsn;
        }
        engstat->txn_oldest_live = oldest_live;
        engstat->next_lsn        = lsn_next;
    }

    // Cachetable statistics.
    {
        CACHETABLE_STATUS_S ct;
        toku_cachetable_get_status(env->i->cachetable, &ct);
        engstat->cachetable_lock_taken      = ct.lock_taken;
        engstat->cachetable_lock_released   = ct.lock_released;
        engstat->cachetable_hit             = ct.hit;
        engstat->cachetable_miss            = ct.miss;
        engstat->cachetable_misstime        = ct.misstime;
        engstat->cachetable_waittime        = ct.waittime;
        engstat->cachetable_wait_reading    = ct.wait_reading;
        engstat->cachetable_wait_writing    = ct.wait_writing;
        engstat->cachetable_wait_checkpoint = ct.wait_checkpoint;
        engstat->puts                       = ct.puts;
        engstat->prefetches                 = ct.prefetches;
        engstat->maybe_get_and_pins         = ct.maybe_get_and_pins;
        engstat->maybe_get_and_pin_hits     = ct.maybe_get_and_pin_hits;
        engstat->cachetable_size_current    = ct.size_current;
        engstat->cachetable_size_limit      = ct.size_limit;
        engstat->cachetable_size_writing    = ct.size_writing;
        engstat->get_and_pin_footprint      = ct.get_and_pin_footprint;
        engstat->local_checkpoint           = ct.local_checkpoint;
        engstat->local_checkpoint_files     = ct.local_checkpoint_files;
        engstat->local_checkpoint_during_checkpoint = ct.local_checkpoint_during_checkpoint;
    }

    // Lock tree manager (range lock) statistics.
    {
        LTM_STATUS_S ltm_status;
        uint32_t locks_max, locks_curr, locks_max_per_db;
        toku_ltm_get_status(env->i->ltm, &locks_max, &locks_curr, &locks_max_per_db, &ltm_status);
        engstat->range_locks_max                 = locks_max;
        engstat->range_locks_max_per_index       = locks_max_per_db;
        engstat->range_locks_curr                = locks_curr;
        engstat->range_lock_escalation_successes = ltm_status.lock_escalation_successes;
        engstat->range_lock_escalation_failures  = ltm_status.lock_escalation_failures;
        engstat->range_read_locks                = ltm_status.read_lock;
        engstat->range_read_locks_fail           = ltm_status.read_lock_fail;
        engstat->range_out_of_read_locks         = ltm_status.out_of_read_locks;
        engstat->range_write_locks               = ltm_status.write_lock;
        engstat->range_write_locks_fail          = ltm_status.write_lock_fail;
        engstat->range_out_of_write_locks        = ltm_status.out_of_write_locks;
    }

    // File-level operation counters.
    engstat->inserts            = num_inserts;
    engstat->inserts_fail       = num_inserts_fail;
    engstat->deletes            = num_deletes;
    engstat->deletes_fail       = num_deletes_fail;
    engstat->point_queries      = num_point_queries;
    engstat->sequential_queries = num_sequential_queries;

    // fsync statistics.
    {
        u_int64_t n_fsyncs, t_fsyncs;
        toku_get_fsync_times(&n_fsyncs, &t_fsyncs);
        engstat->fsync_count = n_fsyncs;
        engstat->fsync_time  = t_fsyncs;
    }

    // Logger lock/swap counters.
    {
        LOGGER_STATUS_S logstat;
        toku_logger_get_status(env->i->logger, &logstat);
        engstat->logger_ilock_ctr = logstat.ilock_ctr;
        engstat->logger_olock_ctr = logstat.olock_ctr;
        engstat->logger_swap_ctr  = logstat.swap_ctr;
    }

    // Out-of-space (ENOSPC) statistics from the file layer.
    {
        time_t    last_enospc;
        u_int64_t blocked_threads, enospc_total;
        toku_fs_get_write_info(&last_enospc, &blocked_threads, &enospc_total);
        format_time(&last_enospc, engstat->enospc_most_recent);
        engstat->enospc_threads_blocked = blocked_threads;
        engstat->enospc_ctr             = enospc_total;
    }

    // Red zone state kept in the environment itself.
    engstat->enospc_redzone_ctr = env->i->enospc_redzone_ctr;   // number of operations rejected by enospc prevention (red zone)
    engstat->enospc_state       = env->i->fs_state;

    // Loader statistics and log-suppression counters.
    {
        LOADER_STATUS_S ldr;
        toku_loader_get_status(&ldr);
        engstat->loader_create      = ldr.create;
        engstat->loader_create_fail = ldr.create_fail;
        engstat->loader_put         = ldr.put;
        engstat->loader_close       = ldr.close;
        engstat->loader_close_fail  = ldr.close_fail;
        engstat->loader_abort       = ldr.abort;
        engstat->loader_current     = ldr.current;
        engstat->loader_max         = ldr.max;

        engstat->logsuppress     = logsuppress;
        engstat->logsuppressfail = logsuppressfail;
    }

    return 0;
}

1670 1671 1672 1673 1674 1675 1676 1677
// Fill buff with a text description of the engine status, using at most
// bufsiz bytes (including the terminating NUL).
// Intended for use by test programs that do not have the handlerton available.
//
//   env:    environment to report on.
//   buff:   destination buffer.
//   bufsiz: size of buff in bytes.
//
// Returns whatever env_get_engine_status() returns.  The text is produced even
// when that call fails; in that case the status snapshot is all zeros / empty
// strings rather than uninitialized memory.
// If the report does not fit, the tail of the buffer is overwritten with
// "BUFFER TOO SMALL\n" (when the buffer is large enough to hold that message).
//
// NOTE(review): cachetable_wait_checkpoint is collected by
// env_get_engine_status() but is not part of this report.
static int
env_get_engine_status_text(DB_ENV * env, char * buff, int bufsiz) {
    ENGINE_STATUS engstat;
    // env_get_engine_status() fills nothing in on failure, so start from a
    // zeroed snapshot to keep the %s conversions below well defined.
    memset(&engstat, 0, sizeof engstat);
    int r = env_get_engine_status(env, &engstat);
    int n = 0;  // number of characters the full report needs so far (may exceed bufsiz)

// Append one formatted line.  Once the buffer is full, snprintf() is called
// with a NULL/0 destination so that it only counts: n keeps growing, but
// nothing is ever written past buff[bufsiz-1] and the size argument can never
// go negative.
#define STATUS_TEXT(...) do {                                                        \
        int room_  = (buff != NULL && n < bufsiz) ? bufsiz - n : 0;                  \
        int wrote_ = snprintf(room_ > 0 ? buff + n : NULL, (size_t) room_, __VA_ARGS__); \
        if (wrote_ > 0) n += wrote_;                                                 \
    } while (0)

    STATUS_TEXT("startuptime                      %s \n", engstat.startuptime);
    STATUS_TEXT("now                              %s \n", engstat.now);
    STATUS_TEXT("ydb_lock_ctr                     %"PRIu64"\n", engstat.ydb_lock_ctr);
    STATUS_TEXT("max_possible_sleep               %"PRIu64"\n", engstat.max_possible_sleep);
    STATUS_TEXT("processor_freq_mhz               %"PRIu64"\n", engstat.processor_freq_mhz);
    STATUS_TEXT("max_requested_sleep              %"PRIu64"\n", engstat.max_requested_sleep);
    STATUS_TEXT("times_max_sleep_used             %"PRIu64"\n", engstat.times_max_sleep_used);
    STATUS_TEXT("total_sleepers                   %"PRIu64"\n", engstat.total_sleepers);
    STATUS_TEXT("total_sleep_time                 %"PRIu64"\n", engstat.total_sleep_time);
    STATUS_TEXT("max_waiters                      %"PRIu64"\n", engstat.max_waiters);
    STATUS_TEXT("total_waiters                    %"PRIu64"\n", engstat.total_waiters);
    STATUS_TEXT("total_clients                    %"PRIu64"\n", engstat.total_clients);
    STATUS_TEXT("time_ydb_lock_held_unavailable   %"PRIu64"\n", engstat.time_ydb_lock_held_unavailable);
    STATUS_TEXT("max_time_ydb_lock_held           %"PRIu64"\n", engstat.max_time_ydb_lock_held);
    STATUS_TEXT("total_time_ydb_lock_held         %"PRIu64"\n", engstat.total_time_ydb_lock_held);
    // checkpoint_period is filled through a u_int32_t*, so print it as unsigned.
    STATUS_TEXT("checkpoint_period                %"PRIu32" \n", engstat.checkpoint_period);
    STATUS_TEXT("checkpoint_footprint             %d \n", engstat.checkpoint_footprint);
    STATUS_TEXT("checkpoint_time_begin            %s \n", engstat.checkpoint_time_begin);
    STATUS_TEXT("checkpoint_time_begin_complete   %s \n", engstat.checkpoint_time_begin_complete);
    STATUS_TEXT("checkpoint_time_end              %s \n", engstat.checkpoint_time_end);
    STATUS_TEXT("checkpoint_last_lsn              %"PRIu64"\n", engstat.checkpoint_last_lsn);
    STATUS_TEXT("checkpoint_count                 %"PRIu32"\n", engstat.checkpoint_count);
    STATUS_TEXT("checkpoint_count_fail            %"PRIu32"\n", engstat.checkpoint_count_fail);
    STATUS_TEXT("txn_begin                        %"PRIu64"\n", engstat.txn_begin);
    STATUS_TEXT("txn_commit                       %"PRIu64"\n", engstat.txn_commit);
    STATUS_TEXT("txn_abort                        %"PRIu64"\n", engstat.txn_abort);
    STATUS_TEXT("txn_close                        %"PRIu64"\n", engstat.txn_close);
    STATUS_TEXT("txn_oldest_live                  %"PRIu64"\n", engstat.txn_oldest_live);
    STATUS_TEXT("next_lsn                         %"PRIu64"\n", engstat.next_lsn);
    STATUS_TEXT("cachetable_lock_taken            %"PRIu64"\n", engstat.cachetable_lock_taken);
    STATUS_TEXT("cachetable_lock_released         %"PRIu64"\n", engstat.cachetable_lock_released);
    STATUS_TEXT("cachetable_hit                   %"PRIu64"\n", engstat.cachetable_hit);
    STATUS_TEXT("cachetable_miss                  %"PRIu64"\n", engstat.cachetable_miss);
    STATUS_TEXT("cachetable_misstime              %"PRIu64"\n", engstat.cachetable_misstime);
    STATUS_TEXT("cachetable_waittime              %"PRIu64"\n", engstat.cachetable_waittime);
    STATUS_TEXT("cachetable_wait_reading          %"PRIu64"\n", engstat.cachetable_wait_reading);
    STATUS_TEXT("cachetable_wait_writing          %"PRIu64"\n", engstat.cachetable_wait_writing);
    STATUS_TEXT("puts                             %"PRIu64"\n", engstat.puts);
    STATUS_TEXT("prefetches                       %"PRIu64"\n", engstat.prefetches);
    STATUS_TEXT("maybe_get_and_pins               %"PRIu64"\n", engstat.maybe_get_and_pins);
    STATUS_TEXT("maybe_get_and_pin_hits           %"PRIu64"\n", engstat.maybe_get_and_pin_hits);
    STATUS_TEXT("cachetable_size_current          %"PRId64"\n", engstat.cachetable_size_current);
    STATUS_TEXT("cachetable_size_limit            %"PRId64"\n", engstat.cachetable_size_limit);
    STATUS_TEXT("cachetable_size_writing          %"PRId64"\n", engstat.cachetable_size_writing);
    STATUS_TEXT("get_and_pin_footprint            %"PRId64"\n", engstat.get_and_pin_footprint);
    STATUS_TEXT("local_checkpoint                 %"PRId64"\n", engstat.local_checkpoint);
    STATUS_TEXT("local_checkpoint_files           %"PRId64"\n", engstat.local_checkpoint_files);
    STATUS_TEXT("local_checkpoint_during_checkpoint  %"PRId64"\n", engstat.local_checkpoint_during_checkpoint);
    STATUS_TEXT("range_locks_max                  %"PRIu32"\n", engstat.range_locks_max);
    STATUS_TEXT("range_locks_max_per_index        %"PRIu32"\n", engstat.range_locks_max_per_index);
    STATUS_TEXT("range_locks_curr                 %"PRIu32"\n", engstat.range_locks_curr);
    STATUS_TEXT("range_locks_escalation_successes %"PRIu32"\n", engstat.range_lock_escalation_successes);
    STATUS_TEXT("range_locks_escalation_failures  %"PRIu32"\n", engstat.range_lock_escalation_failures);
    STATUS_TEXT("range_read_locks                 %"PRIu64"\n", engstat.range_read_locks);
    STATUS_TEXT("range_read_locks_fail            %"PRIu64"\n", engstat.range_read_locks_fail);
    STATUS_TEXT("range_out_of_read_locks          %"PRIu64"\n", engstat.range_out_of_read_locks);
    STATUS_TEXT("range_write_locks                %"PRIu64"\n", engstat.range_write_locks);
    STATUS_TEXT("range_write_locks_fail           %"PRIu64"\n", engstat.range_write_locks_fail);
    STATUS_TEXT("range_out_of_write_locks         %"PRIu64"\n", engstat.range_out_of_write_locks);
    STATUS_TEXT("inserts                          %"PRIu64"\n", engstat.inserts);
    STATUS_TEXT("inserts_fail                     %"PRIu64"\n", engstat.inserts_fail);
    STATUS_TEXT("deletes                          %"PRIu64"\n", engstat.deletes);
    STATUS_TEXT("deletes_fail                     %"PRIu64"\n", engstat.deletes_fail);
    STATUS_TEXT("point_queries                    %"PRIu64"\n", engstat.point_queries);
    STATUS_TEXT("sequential_queries               %"PRIu64"\n", engstat.sequential_queries);
    STATUS_TEXT("fsync_count                      %"PRIu64"\n", engstat.fsync_count);
    STATUS_TEXT("fsync_time                       %"PRIu64"\n", engstat.fsync_time);
    STATUS_TEXT("logger ilock count               %"PRIu64"\n", engstat.logger_ilock_ctr);
    STATUS_TEXT("logger olock count               %"PRIu64"\n", engstat.logger_olock_ctr);
    STATUS_TEXT("logger swap count                %"PRIu64"\n", engstat.logger_swap_ctr);
    STATUS_TEXT("enospc_most_recent               %s \n", engstat.enospc_most_recent);
    STATUS_TEXT("enospc threads blocked           %"PRIu64"\n", engstat.enospc_threads_blocked);
    STATUS_TEXT("enospc count                     %"PRIu64"\n", engstat.enospc_ctr);
    STATUS_TEXT("enospc redzone ctr               %"PRIu64"\n", engstat.enospc_redzone_ctr);
    STATUS_TEXT("enospc state                     %"PRIu64"\n", engstat.enospc_state);
    STATUS_TEXT("loader_create                    %"PRIu64"\n", engstat.loader_create);
    STATUS_TEXT("loader_createf_fail              %"PRIu64"\n", engstat.loader_create_fail);
    STATUS_TEXT("loader_put                       %"PRIu64"\n", engstat.loader_put);
    STATUS_TEXT("loader_close                     %"PRIu64"\n", engstat.loader_close);
    STATUS_TEXT("loader_close_fail                %"PRIu64"\n", engstat.loader_close_fail);
    STATUS_TEXT("loader_abort                     %"PRIu64"\n", engstat.loader_abort);
    STATUS_TEXT("loader_current                   %"PRIu32"\n", engstat.loader_current);
    STATUS_TEXT("loader_max                       %"PRIu32"\n", engstat.loader_max);
    STATUS_TEXT("logsuppress                      %"PRIu64"\n", engstat.logsuppress);
    STATUS_TEXT("logsuppressfail                  %"PRIu64"\n", engstat.logsuppressfail);

#undef STATUS_TEXT

    // snprintf() counts exclude the terminating NUL, so the report was
    // truncated as soon as n reaches bufsiz (not only when it exceeds it).
    if (n >= bufsiz) {
        const char * errmsg = "BUFFER TOO SMALL\n";
        int len = strlen(errmsg) + 1;
        // Only stamp the message when the computed offset stays inside buff.
        if (buff != NULL && bufsiz > len) {
            (void) snprintf(buff + (bufsiz - 1) - len, len, "%s", errmsg);
        }
    }

    return r;
}

Rich Prohaska's avatar
Rich Prohaska committed
1772 1773
static int locked_txn_begin(DB_ENV * env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags);

Yoni Fogel's avatar
Yoni Fogel committed
1774 1775 1776 1777 1778 1779
static int toku_db_lt_panic(DB* db, int r);

static toku_dbt_cmp toku_db_get_compare_fun(DB* db);

static toku_dbt_cmp toku_db_get_dup_compare(DB* db);

Rich Prohaska's avatar
Rich Prohaska committed
1780
static int toku_env_create(DB_ENV ** envp, u_int32_t flags) {
Yoni Fogel's avatar
Yoni Fogel committed
1781 1782 1783 1784 1785 1786
    int r = ENOSYS;
    DB_ENV* result = NULL;

    if (flags!=0)    { r = EINVAL; goto cleanup; }
    MALLOC(result);
    if (result == 0) { r = ENOMEM; goto cleanup; }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1787
    memset(result, 0, sizeof *result);
1788
    result->err = (void (*)(const DB_ENV * env, int error, const char *fmt, ...)) toku_locked_env_err;
1789 1790 1791 1792 1793
#define SENV(name) result->name = locked_env_ ## name
    SENV(dbremove);
    SENV(dbrename);
    SENV(set_default_bt_compare);
    SENV(set_default_dup_compare);
1794 1795
    SENV(set_generate_row_callback_for_put);
    SENV(set_generate_row_callback_for_del);
1796 1797
    SENV(put_multiple);
    SENV(del_multiple);
1798 1799
    SENV(checkpointing_set_period);
    SENV(checkpointing_get_period);
1800 1801 1802 1803
    result->checkpointing_postpone = env_checkpointing_postpone;
    result->checkpointing_resume = env_checkpointing_resume;
    result->checkpointing_begin_atomic_operation = env_checkpointing_begin_atomic_operation;
    result->checkpointing_end_atomic_operation = env_checkpointing_end_atomic_operation;
1804
    result->get_engine_status = env_get_engine_status;
1805
    result->get_engine_status_text = env_get_engine_status_text;
1806 1807 1808
    result->get_iname = env_get_iname;
    SENV(open);
    SENV(close);
1809
    result->txn_checkpoint = toku_env_txn_checkpoint;
1810
    SENV(log_flush);
Rich Prohaska's avatar
Rich Prohaska committed
1811 1812 1813
    result->set_errcall = toku_env_set_errcall;
    result->set_errfile = toku_env_set_errfile;
    result->set_errpfx = toku_env_set_errpfx;
1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825
    //SENV(set_noticecall);
    SENV(set_flags);
    SENV(set_data_dir);
    SENV(set_tmp_dir);
    SENV(set_verbose);
    SENV(set_lg_bsize);
    SENV(set_lg_dir);
    SENV(set_lg_max);
    SENV(get_lg_max);
    SENV(set_lk_max_locks);
    SENV(get_lk_max_locks);
    SENV(set_cachesize);
Rich Prohaska's avatar
Rich Prohaska committed
1826
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR >= 3
1827
    SENV(get_cachesize);
Rich Prohaska's avatar
Rich Prohaska committed
1828
#endif
1829
    SENV(set_lk_detect);
1830
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR <= 4
1831
    SENV(set_lk_max);
1832
#endif
1833 1834
    SENV(log_archive);
    SENV(txn_stat);
Rich Prohaska's avatar
Rich Prohaska committed
1835
    result->txn_begin = locked_txn_begin;
1836
    SENV(set_redzone);
1837
#undef SENV
1838
    result->create_loader = toku_loader_create_loader;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1839

1840
    MALLOC(result->i);
Yoni Fogel's avatar
Yoni Fogel committed
1841
    if (result->i == 0) { r = ENOMEM; goto cleanup; }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1842
    memset(result->i, 0, sizeof *result->i);
1843
    env_init_open_txn(result);
1844
    env_fs_init(result);
Yoni Fogel's avatar
Yoni Fogel committed
1845 1846

    r = toku_ltm_create(&result->i->ltm, __toku_env_default_max_locks,
Yoni Fogel's avatar
Yoni Fogel committed
1847 1848 1849
                         toku_db_lt_panic, 
                         toku_db_get_compare_fun, toku_db_get_dup_compare, 
                         toku_malloc, toku_free, toku_realloc);
Yoni Fogel's avatar
Yoni Fogel committed
1850
    if (r!=0) { goto cleanup; }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1851

1852
    {
Yoni Fogel's avatar
Yoni Fogel committed
1853 1854
	r = toku_logger_create(&result->i->logger);
	if (r!=0) { goto cleanup; }
1855 1856
	assert(result->i->logger);
    }
1857 1858 1859 1860 1861
    {
        r = toku_omt_create(&result->i->open_dbs);
        if (r!=0) goto cleanup;
        assert(result->i->open_dbs);
    }
1862

1863
    ydb_add_ref();
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1864
    *envp = result;
Yoni Fogel's avatar
Yoni Fogel committed
1865 1866 1867 1868 1869 1870 1871 1872
    r = 0;
cleanup:
    if (r!=0) {
        if (result) {
            if (result->i) {
                if (result->i->ltm) {
                    toku_ltm_close(result->i->ltm);
                }
1873 1874
                if (result->i->open_dbs)
                    toku_omt_destroy(&result->i->open_dbs);
Yoni Fogel's avatar
Yoni Fogel committed
1875 1876 1877 1878 1879 1880
                toku_free(result->i);
            }
            toku_free(result);
        }
    }
    return r;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1881 1882
}

1883
// Public entry point for creating an environment handle.
// Runs toku_env_create() with the ydb lock held and returns its result.
int DB_ENV_CREATE_FUN (DB_ENV ** envp, u_int32_t flags) {
    int r;
    toku_ydb_lock();
    r = toku_env_create(envp, flags);
    toku_ydb_unlock();
    return r;
}

Yoni Fogel's avatar
Yoni Fogel committed
1887
// Release every lock held on behalf of txn.
// Walks the txn's lock-tree hash (lth): each lock tree is unlocked for this
// txn's TXNID and, if that succeeds, the reference on the tree is dropped.
// The hash is then closed and the txn's lth pointer is cleared.
// Returns 0 if there was no lth or everything succeeded; otherwise the first
// nonzero error seen (later trees are still processed after an error).
static int toku_txn_release_locks(DB_TXN* txn) {
    assert(txn);
    toku_lth* lth = db_txn_struct_i(txn)->lth;
    if (!lth) { return 0; }

    int first_error = 0;
    toku_lth_start_scan(lth);
    for (toku_lock_tree* lt = toku_lth_next(lth); lt; lt = toku_lth_next(lth)) {
        int r = toku_lt_unlock(lt, toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn));
        // Only drop the reference when the unlock itself worked.
        if (r == 0) { r = toku_lt_remove_ref(lt); }
        if (r != 0 && first_error == 0) { first_error = r; }
    }
    toku_lth_close(lth);
    db_txn_struct_i(txn)->lth = NULL;
    return first_error;
}

1913 1914
// Yield the lock so someone else can work, and then reacquire the lock.
// Useful while processing commit or rollback logs, to allow others to access the system.
1915
static void ydb_yield (voidfp f, void *fv, void *UU(v)) {
1916
    toku_ydb_unlock(); 
1917
    if (f) f(fv);
1918 1919 1920
    toku_ydb_lock();
}

1921
static int toku_txn_commit(DB_TXN * txn, u_int32_t flags,
1922
			   TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
1923
    if (!txn) return EINVAL;
1924
    HANDLE_PANICKED_ENV(txn->mgrp);
Yoni Fogel's avatar
Yoni Fogel committed
1925
    //Recursively kill off children
1926 1927
    if (db_txn_struct_i(txn)->child) {
        //commit of child sets the child pointer to NULL
1928
        int r_child = toku_txn_commit(db_txn_struct_i(txn)->child, flags, NULL, NULL);
1929 1930 1931 1932
        if (r_child !=0 && !toku_env_is_panicked(txn->mgrp)) {
            txn->mgrp->i->is_panicked = r_child;
            txn->mgrp->i->panic_string = toku_strdup("Recursive child commit failed during parent commit.\n");
        }
Yoni Fogel's avatar
Yoni Fogel committed
1933 1934
        //In a panicked env, the child may not be removed from the list.
        HANDLE_PANICKED_ENV(txn->mgrp);
Yoni Fogel's avatar
Yoni Fogel committed
1935
    }
1936
    assert(!db_txn_struct_i(txn)->child);
Yoni Fogel's avatar
Yoni Fogel committed
1937 1938
    //Remove from parent
    if (txn->parent) {
1939 1940
        assert(db_txn_struct_i(txn->parent)->child == txn);
        db_txn_struct_i(txn->parent)->child=NULL;
Yoni Fogel's avatar
Yoni Fogel committed
1941
    }
1942
    env_remove_open_txn(txn->mgrp, txn);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1943
    //toku_ydb_notef("flags=%d\n", flags);
1944 1945 1946 1947
    if (flags & DB_TXN_SYNC) {
        toku_txn_force_fsync_on_commit(db_txn_struct_i(txn)->tokutxn);
        flags &= ~DB_TXN_SYNC;
    }
1948
    int nosync = (flags & DB_TXN_NOSYNC)!=0 || (db_txn_struct_i(txn)->flags&DB_TXN_NOSYNC);
1949
    flags &= ~DB_TXN_NOSYNC;
Yoni Fogel's avatar
Yoni Fogel committed
1950 1951

    int r;
1952
    if (flags!=0)
1953 1954
	// frees the tokutxn
	// Calls ydb_yield(NULL) occasionally
1955
        //r = toku_logger_abort(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL);
1956
        r = toku_txn_abort_txn(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL, poll, poll_extra);
Yoni Fogel's avatar
Yoni Fogel committed
1957
    else
1958 1959
	// frees the tokutxn
	// Calls ydb_yield(NULL) occasionally
1960
        //r = toku_logger_commit(db_txn_struct_i(txn)->tokutxn, nosync, ydb_yield, NULL);
1961
        r = toku_txn_commit_txn(db_txn_struct_i(txn)->tokutxn, nosync, ydb_yield, NULL,
1962
				poll, poll_extra);
1963

1964 1965 1966 1967 1968 1969 1970 1971
    if (r!=0 && !toku_env_is_panicked(txn->mgrp)) {
        txn->mgrp->i->is_panicked = r;
        txn->mgrp->i->panic_string = toku_strdup("Error during commit.\n");
    }
    //If panicked, we're done.
    HANDLE_PANICKED_ENV(txn->mgrp);
    assert(r==0);

1972
    // Close the logger after releasing the locks
1973
    r = toku_txn_release_locks(txn);
1974 1975
    //toku_logger_txn_close(db_txn_struct_i(txn)->tokutxn);
    toku_txn_close_txn(db_txn_struct_i(txn)->tokutxn);
1976
    // the toxutxn is freed, and we must free the rest. */
Yoni Fogel's avatar
Yoni Fogel committed
1977

1978 1979 1980
    //Promote list to parent (dbs that must close before abort)
    if (txn->parent) {
        //Combine lists.
1981 1982 1983
        while (!toku_list_empty(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort)) {
            struct toku_list *list = toku_list_pop(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort);
            toku_list_push(&db_txn_struct_i(txn->parent)->dbs_that_must_close_before_abort, list);
1984 1985 1986 1987
        }
    }
    else {
        //Empty the list
1988 1989
        while (!toku_list_empty(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort)) {
            toku_list_pop(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort);
1990 1991 1992
        }
    }

1993
    // The txn is no good after the commit even if the commit fails, so free it up.
1994 1995 1996
#if !TOKUDB_NATIVE_H
    toku_free(db_txn_struct_i(txn));
#endif
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1997
    toku_free(txn);
Yoni Fogel's avatar
Yoni Fogel committed
1998
    if (flags!=0) return EINVAL;
1999
    return r;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2000 2001
}

Rich Prohaska's avatar
Rich Prohaska committed
2002
// DB_TXN->id is not implemented: after the panicked-environment check this
// reports via toku_ydb_barf() and aborts the process.
static u_int32_t toku_txn_id(DB_TXN * txn) {
    HANDLE_PANICKED_ENV(txn->mgrp);
    toku_ydb_barf();
    abort();
    return (u_int32_t)-1;  // not reached; keeps the compiler satisfied
}

2009 2010
// Abort (roll back) txn. Caller is expected to hold the ydb lock.
//   poll, poll_extra   progress callback and its argument, forwarded to the
//                      underlying abort.
// A live child is finished first by committing it with DB_TXN_NOSYNC (abort or
// commit are both correct for the child; commit is cheaper).
// Any failure marks the environment as panicked, in which case
// HANDLE_PANICKED_ENV returns early without freeing txn.
// On the normal path txn is freed and the result of releasing its locks is
// returned.
static int toku_txn_abort(DB_TXN * txn,
                          TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
    HANDLE_PANICKED_ENV(txn->mgrp);

    // Finish off a live child first; committing it clears our child pointer.
    if (db_txn_struct_i(txn)->child) {
        int child_result = toku_txn_commit(db_txn_struct_i(txn)->child, DB_TXN_NOSYNC, NULL, NULL);
        if (child_result != 0 && !toku_env_is_panicked(txn->mgrp)) {
            txn->mgrp->i->is_panicked = child_result;
            txn->mgrp->i->panic_string = toku_strdup("Recursive child commit failed during parent abort.\n");
        }
        // In a panicked env the child may still be attached, so stop here.
        HANDLE_PANICKED_ENV(txn->mgrp);
    }
    assert(!db_txn_struct_i(txn)->child);

    // Detach from the parent and from the env's list of open transactions.
    if (txn->parent) {
        assert(db_txn_struct_i(txn->parent)->child == txn);
        db_txn_struct_i(txn->parent)->child = NULL;
    }
    env_remove_open_txn(txn->mgrp, txn);

    // Every db that had to be closed before an abort must be closed by now.
    assert(toku_list_empty(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort));

    int r = toku_txn_abort_txn(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL, poll, poll_extra);
    if (r != 0 && !toku_env_is_panicked(txn->mgrp)) {
        txn->mgrp->i->is_panicked = r;
        txn->mgrp->i->panic_string = toku_strdup("Error during abort.\n");
    }
    HANDLE_PANICKED_ENV(txn->mgrp);
    assert(r==0);

    // Release the locks first, then close the underlying txn.
    r = toku_txn_release_locks(txn);
    toku_txn_close_txn(db_txn_struct_i(txn)->tokutxn);

#if !TOKUDB_NATIVE_H
    toku_free(db_txn_struct_i(txn));
#endif
    toku_free(txn);
    return r;
}

Rich Prohaska's avatar
Rich Prohaska committed
2053
// DB_ENV->txn_begin entry point: calls toku_txn_begin() with the ydb lock held.
// The final argument (internal) is passed as 0.
static int locked_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags) {
    int r;
    toku_ydb_lock();
    r = toku_txn_begin(env, stxn, txn, flags, 0);
    toku_ydb_unlock();
    return r;
}

// DB_TXN->id entry point: calls toku_txn_id() with the ydb lock held.
static u_int32_t locked_txn_id(DB_TXN *txn) {
    u_int32_t id;
    toku_ydb_lock();
    id = toku_txn_id(txn);
    toku_ydb_unlock();
    return id;
}

2061 2062
// Allocate a struct txn_stat into *txn_stat and fill in its
// rollback_raw_count from the underlying tokutxn.
// Returns whatever toku_logger_txn_rollback_raw_count() returns; the
// allocation is handed to the caller either way.
static int toku_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) {
    XMALLOC(*txn_stat);
    struct txn_stat *stat = *txn_stat;
    return toku_logger_txn_rollback_raw_count(db_txn_struct_i(txn)->tokutxn, &stat->rollback_raw_count);
}

// DB_TXN->txn_stat entry point: calls toku_txn_stat() with the ydb lock held.
// The result is kept in an int (the type toku_txn_stat returns and this
// function returns) so that negative error codes are never round-tripped
// through an unsigned type.
static int locked_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) {
    toku_ydb_lock();
    int r = toku_txn_stat(txn, txn_stat);
    toku_ydb_unlock();
    return r;
}

2070 2071
// DB_TXN->commit_with_progress entry point.
// Takes the multi-operation client lock (no checkpoint may run during a
// commit) and then the ydb lock around toku_txn_commit(); the locks are
// released in the reverse order.
static int locked_txn_commit_with_progress(DB_TXN *txn, u_int32_t flags,
                                           TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
    int r;
    toku_multi_operation_client_lock();   // Cannot checkpoint during a commit.
    toku_ydb_lock();
    r = toku_txn_commit(txn, flags, poll, poll_extra);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock();
    return r;
}

2078 2079
// DB_TXN->abort_with_progress entry point.
// Takes the multi-operation client lock (no checkpoint may run during an
// abort) and then the ydb lock around toku_txn_abort(); the locks are
// released in the reverse order.
static int locked_txn_abort_with_progress(DB_TXN *txn,
                                          TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
    int r;
    toku_multi_operation_client_lock();   // Cannot checkpoint during an abort.
    toku_ydb_lock();
    r = toku_txn_abort(txn, poll, poll_extra);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock();
    return r;
}

2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097
// DB_TXN->commit entry point: a commit without a progress callback.
static int locked_txn_commit(DB_TXN *txn, u_int32_t flags) {
    return locked_txn_commit_with_progress(txn, flags, NULL, NULL);
}

// DB_TXN->abort entry point: an abort without a progress callback.
static int locked_txn_abort(DB_TXN *txn) {
    return locked_txn_abort_with_progress(txn, NULL, NULL);
}

2098
// Create a new transaction in env and store it in *txn.
//   env       environment; must have logging open and DB_INIT_TXN set.
//   stxn      parent transaction, or NULL for a top-level transaction.
//   txn       out: the new handle (written only on success).
//   flags     may contain DB_READ_UNCOMMITTED or DB_READ_COMMITTED (not both),
//             DB_INHERIT_ISOLATION, DB_TXN_NOWAIT, DB_TXN_NOSYNC; any other
//             bit is rejected with EINVAL.
//   internal  nonzero forces a child to inherit the parent's isolation level,
//             as if DB_INHERIT_ISOLATION had been given.
// A child must end up with the same isolation flags as its parent unless it
// inherits them. DB_TXN_NOWAIT is always set on the new txn because blocking
// locks are not supported.
// Returns 0 on success, EINVAL for bad arguments/state, ENOMEM on allocation
// failure, or the error from creating the lock-tree hash or the underlying
// txn. On every failure path all memory allocated here is released.
static int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags, int internal) {
    HANDLE_PANICKED_ENV(env);
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, stxn); //Cannot create child while child already exists.
    if (!toku_logger_is_open(env->i->logger)) return toku_ydb_do_error(env, EINVAL, "Environment does not have logging enabled\n");
    if (!(env->i->open_flags & DB_INIT_TXN))  return toku_ydb_do_error(env, EINVAL, "Environment does not have transactions enabled\n");
    u_int32_t txn_flags = 0;
    txn_flags |= DB_TXN_NOWAIT; //We do not support blocking locks.
    uint32_t child_isolation_flags = 0;
    uint32_t parent_isolation_flags = 0;
    int inherit = 0;
    int set_isolation = 0;
    if ((flags & DB_READ_UNCOMMITTED) && (flags & DB_READ_COMMITTED)) {
        return toku_ydb_do_error(
            env, 
            EINVAL, 
            "Transaction cannot have both DB_READ_COMMITTED and DB_READ_UNCOMMITTED set\n"
            );
    }
    if (stxn) {
        parent_isolation_flags = db_txn_struct_i(stxn)->flags & (DB_READ_UNCOMMITTED | DB_READ_COMMITTED);
        if (internal || flags&DB_INHERIT_ISOLATION) {
            flags &= ~DB_INHERIT_ISOLATION;
            inherit = 1;
            set_isolation = 1;
            child_isolation_flags = parent_isolation_flags;
        }
    }
    if (flags & (DB_READ_UNCOMMITTED|DB_READ_COMMITTED)) {
        if (set_isolation)
            return toku_ydb_do_error(env, EINVAL, "Cannot set isolation two different ways in DB_ENV->txn_begin\n");
        set_isolation = 1;
        child_isolation_flags |=  (flags & (DB_READ_UNCOMMITTED|DB_READ_COMMITTED));
        flags                 &= ~(DB_READ_UNCOMMITTED | DB_READ_COMMITTED);
    }
    txn_flags |= child_isolation_flags;
    if (flags&DB_TXN_NOWAIT) {
        txn_flags |=  DB_TXN_NOWAIT;
        flags     &= ~DB_TXN_NOWAIT;
    }
    if (flags&DB_TXN_NOSYNC) {
        txn_flags |=  DB_TXN_NOSYNC;
        flags     &= ~DB_TXN_NOSYNC;
    }
    if (flags!=0) return toku_ydb_do_error(env, EINVAL, "Invalid flags passed to DB_ENV->txn_begin\n");
    //Require child to have same isolation level as parent.
    if (stxn && !inherit && parent_isolation_flags != child_isolation_flags) {
        return toku_ydb_do_error(env, EINVAL, "DB_ENV->txn_begin: Child transaction isolation level must match parent's isolation level.\n");
    }

    size_t result_size = sizeof(DB_TXN)+sizeof(struct __toku_db_txn_internal); // the internal stuff is stuck on the end.
    DB_TXN *result = toku_malloc(result_size);
    if (result == 0)
        return ENOMEM;
    memset(result, 0, result_size);
    result->mgrp = env;
#define STXN(name) result->name = locked_txn_ ## name
    STXN(abort);
    STXN(commit);
    STXN(abort_with_progress);
    STXN(commit_with_progress);
    STXN(id);
#undef STXN
    result->txn_stat = locked_txn_stat;


    result->parent = stxn;
#if !TOKUDB_NATIVE_H
    MALLOC(db_txn_struct_i(result));
    if (!db_txn_struct_i(result)) {
        toku_free(result);
        return ENOMEM;
    }
#endif
    memset(db_txn_struct_i(result), 0, sizeof *db_txn_struct_i(result));
    db_txn_struct_i(result)->flags = txn_flags;
    toku_list_init(&db_txn_struct_i(result)->dbs_that_must_close_before_abort);

    int r;
    // Only a top-level txn gets its own lock-tree hash.
    if (env->i->open_flags & DB_INIT_LOCK && !stxn) {
        r = toku_lth_create(&db_txn_struct_i(result)->lth,
                            toku_malloc, toku_free, toku_realloc);
        if (r!=0) {
#if !TOKUDB_NATIVE_H
            toku_free(db_txn_struct_i(result));
#endif
            toku_free(result);
            return r;
        }
    }
    
    r = toku_txn_begin_txn(stxn ? db_txn_struct_i(stxn)->tokutxn : 0, &db_txn_struct_i(result)->tokutxn, env->i->logger);
    if (r != 0) {
        // Nothing has been published yet (not linked to the parent, not in the
        // env's open-txn list), so undo every allocation made above.
        if (db_txn_struct_i(result)->lth) {
            toku_lth_close(db_txn_struct_i(result)->lth);
            db_txn_struct_i(result)->lth = NULL;
        }
#if !TOKUDB_NATIVE_H
        toku_free(db_txn_struct_i(result));
#endif
        toku_free(result);
        return r;
    }
    //Add to the list of children for the parent.
    if (result->parent) {
        assert(!db_txn_struct_i(result->parent)->child);
        db_txn_struct_i(result->parent)->child = result;
    }
    env_add_open_txn(env, result);
    *txn = result;
    return 0;
}

Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
2203
#if 0
// Disabled (compiled out): traces the call site to stderr and logs a commit
// for the txn's underlying tokutxn.
int txn_commit(DB_TXN * txn, u_int32_t flags) {
    fprintf(stderr, "%s:%d\n", __FILE__, __LINE__);
    int r = toku_logger_log_commit(db_txn_struct_i(txn)->tokutxn);
    return r;
}
#endif
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
2209

Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2210
// log_compare is not supported: it prints the call to stderr and then fails
// an assertion. The trailing unlock/return are only reached if the assertion
// does not stop the program.
int log_compare(const DB_LSN * a, const DB_LSN * b) {
    toku_ydb_lock();
    // %p requires a pointer to void, so convert explicitly.
    fprintf(stderr, "%s:%d log_compare(%p,%p)\n", __FILE__, __LINE__, (const void *)a, (const void *)b);
    assert(0);
    toku_ydb_unlock();
    return 0;
}

2218 2219
// Forward declaration: db_close_before_brt() below calls this before its definition appears.
static void env_note_zombie_db_closed(DB_ENV *env, DB *db);

2220 2221
// Final teardown of a DB handle; toku_db_close() hands this function to
// toku_brt_db_delay_closed() as the close callback.
// Steps: remove the (zombie) handle from the env's open_dbs, close the brt,
// drop the reference on the db's lock tree, then free the handle.
// A failure of either the brt close or the lock-tree unref marks the
// environment as panicked. The flags argument is unused.
// Returns the brt-close error if any, else the lock-tree error if any, else
// EINVAL if the environment is panicked, else 0. The handle is freed in all
// cases.
static int
db_close_before_brt(DB *db, u_int32_t UU(flags)) {
    if (db_opened(db) && db->i->dname) {
        // internal (non-user) dictionary has no dname
        env_note_zombie_db_closed(db->dbenv, db);  // tell env that this db is no longer a zombie (it is completely closed)
    }
    char *error_string = 0;
    int r1 = toku_close_brt(db->i->brt, &error_string);
    if (r1) {
        db->dbenv->i->is_panicked = r1; // Panicking the whole environment may be overkill, but I'm not sure what else to do.
        db->dbenv->i->panic_string = error_string; // the env now holds the only pointer to the message
        if (error_string) {
            toku_ydb_do_error(db->dbenv, r1, "%s\n", error_string);
        } else {
            toku_ydb_do_error(db->dbenv, r1, "Closing file\n");
        }
        error_string=0;
    }
    assert(error_string==0);
    int r2 = 0;
    if (db->i->lt) {
        r2 = toku_lt_remove_ref(db->i->lt);
        // Only record this failure if the env is not panicked already;
        // otherwise the panic string stored just above (or earlier) would be
        // overwritten with 0 and lost.
        if (r2 && !toku_env_is_panicked(db->dbenv)) {
            db->dbenv->i->is_panicked = r2; // Panicking the whole environment may be overkill, but I'm not sure what else to do.
            db->dbenv->i->panic_string = 0;
        }
    }
    // Even if panicked, let's close as much as we can.
    int is_panicked = toku_env_is_panicked(db->dbenv); 
    toku_sdbt_cleanup(&db->i->skey);
    toku_sdbt_cleanup(&db->i->sval);
    if (db->i->dname) toku_free(db->i->dname);
    toku_free(db->i);
    toku_free(db);
    ydb_unref();
    if (r1) return r1;
    if (r2) return r2;
    if (is_panicked) return EINVAL;
    return 0;
}

2262 2263 2264 2265 2266 2267 2268 2269
// OMT comparison callback that locates one specific DB handle.
// Ordering key, most significant first: dname (strcmp), zombie-ness
// (non-zombie before zombie), then the handle's address.
// return 0 if v and dbv refer to the same db
// return <0 if v sorts before dbv
// return >0 if v sorts after dbv
static int
find_db_by_db (OMTVALUE v, void *dbv) {
    DB *stored = v;        // DB* that is stored in the omt
    DB *wanted = dbv;      // extra, to be compared to v
    int cmp = strcmp(stored->i->dname, wanted->i->dname);
    if (cmp == 0) {
        cmp = (stored->i->is_zombie != 0) - (wanted->i->is_zombie != 0);
    }
    if (cmp == 0 && stored != wanted) {
        cmp = (stored < wanted) ? -1 : 1;
    }
    return cmp;
}

// Register a newly opened, user-visible db handle in env's open_dbs set.
// The dname need not be unique; the handle itself must not be present yet and
// must not be a zombie. Internal (non-user) dictionaries have no dname and
// must not be passed here.
static void
env_note_db_opened(DB_ENV *env, DB *db) {
    assert(db->i->dname);  // internal (non-user) dictionary has no dname
    assert(!db->i->is_zombie);
    OMTVALUE found;
    uint32_t position;
    int r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &found, &position, NULL);
    assert(r==DB_NOTFOUND); //Must not already be there.
    // position from the failed lookup is used as the insertion index.
    r = toku_omt_insert_at(env->i->open_dbs, db, position);
    assert(r==0);
}

// Remove a user-visible, non-zombie db handle from env's open_dbs set.
// The handle must currently be present.
static void
env_note_db_closed(DB_ENV *env, DB *db) {
    assert(db->i->dname);
    assert(!db->i->is_zombie);
    OMTVALUE found;
    uint32_t position;
    int r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &found, &position, NULL);
    assert(r==0); //Must already be there.
    assert((DB*)found == db);
    r = toku_omt_delete_at(env->i->open_dbs, position);
    assert(r==0);
}

// Register a zombie db handle (is_zombie already set) in env's open_dbs set.
// The handle must have a dname and must not be present yet.
static void
env_note_zombie_db(DB_ENV *env, DB *db) {
    assert(db->i->dname);  // internal (non-user) dictionary has no dname
    assert(db->i->is_zombie);
    OMTVALUE found;
    uint32_t position;
    int r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &found, &position, NULL);
    assert(r==DB_NOTFOUND); //Must not already be there.
    // position from the failed lookup is used as the insertion index.
    r = toku_omt_insert_at(env->i->open_dbs, db, position);
    assert(r==0);
}

// Remove a zombie db handle from env's open_dbs set once it is completely
// closed. The handle must currently be present.
static void
env_note_zombie_db_closed(DB_ENV *env, DB *db) {
    assert(db->i->dname);
    assert(db->i->is_zombie);
    OMTVALUE found;
    uint32_t position;
    int r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &found, &position, NULL);
    assert(r==0); //Must already be there.
    assert((DB*)found == db);
    r = toku_omt_delete_at(env->i->open_dbs, position);
    assert(r==0);
}

static int
2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356
find_zombie_db_by_dname (OMTVALUE v, void *dnamev) {
    DB *db = v;            // DB* that is stored in the omt
    int cmp;
    const char *dname     = db->i->dname;
    const char *dnamefind = dnamev;
    cmp = strcmp(dname, dnamefind);
    if (cmp != 0) return cmp;
    int is_zombie     = db->i->is_zombie != 0;
    int is_zombiefind = 1;
    cmp = is_zombie - is_zombiefind;
    return cmp;
}

// OMT comparison callback: matches any NON-zombie db whose dname equals dnamev.
// Returns the strcmp result when the names differ; otherwise 0 for a
// non-zombie and 1 for a zombie (zombies sort after live handles of the same
// name).
static int
find_open_db_by_dname (OMTVALUE v, void *dnamev) {
    DB *stored = v;                   // DB* that is stored in the omt
    const char *wanted_name = dnamev;
    int cmp = strcmp(stored->i->dname, wanted_name);
    if (cmp != 0) { return cmp; }
    return (stored->i->is_zombie != 0) - 0;
}

// Return TRUE if env has at least one open (non-zombie) db handle with the
// given dname, FALSE otherwise.
static BOOL
env_is_db_with_dname_open(DB_ENV *env, const char *dname) {
    OMTVALUE found;
    uint32_t position;
    int r = toku_omt_find_zero(env->i->open_dbs, find_open_db_by_dname, (void*)dname, &found, &position, NULL);
    if (r != 0) {
        assert(r==DB_NOTFOUND);
        return FALSE;
    }
    DB *db = found;
    assert(strcmp(dname, db->i->dname) == 0);
    assert(!db->i->is_zombie);
    return TRUE;
}

2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409
// Return a zombie db handle with the given dname from env's open_dbs set,
// or NULL if there is none.
static DB*
env_get_zombie_db_with_dname(DB_ENV *env, const char *dname) {
    OMTVALUE found;
    uint32_t position;
    int r = toku_omt_find_zero(env->i->open_dbs, find_zombie_db_by_dname, (void*)dname, &found, &position, NULL);
    if (r != 0) {
        assert(r==DB_NOTFOUND);
        return NULL;
    }
    DB *zombie = found;
    assert(zombie);
    assert(strcmp(dname, zombie->i->dname) == 0);
    assert(zombie->i->is_zombie);
    return zombie;
}

2410
//DB->close()
2411
static int toku_db_close(DB * db, u_int32_t flags) {
2412 2413
    if (db_opened(db) && db->i->dname) {
        // internal (non-user) dictionary has no dname
2414
        env_note_db_closed(db->dbenv, db);  // tell env that this db is no longer in use by the user of this api (user-closed, may still be in use by fractal tree internals)
2415 2416 2417
        db->i->is_zombie = TRUE;
        env_note_zombie_db(db->dbenv, db);  // tell env that this db is a zombie
    }
2418
    //Remove from transaction's list of 'must close' if necessary.
2419 2420
    if (!toku_list_empty(&db->i->dbs_that_must_close_before_abort))
        toku_list_remove(&db->i->dbs_that_must_close_before_abort);
2421

2422 2423 2424 2425 2426
    int r = toku_brt_db_delay_closed(db->i->brt, db, db_close_before_brt, flags);
    return r;
}


2427 2428 2429
// Extract the operation part of a cursor flag word, i.e. the bits covered by
// DB_OPFLAGS_MASK, dropping any modifier bits or'd into it.
static int get_main_cursor_flag(u_int32_t flags) {
    const u_int32_t op_bits = flags & DB_OPFLAGS_MASK;
    return op_bits;
}

2432 2433
// Extract the modifier part of a cursor flag word: every bit that is NOT
// covered by DB_OPFLAGS_MASK.
static int get_nonmain_cursor_flags(u_int32_t flags) {
    const u_int32_t modifier_bits = flags & ~(DB_OPFLAGS_MASK);
    return modifier_bits;
}

2436
// Report whether the brt cursor underlying c is uninitialized, as answered by
// toku_brt_cursor_uninitialized().
static inline BOOL toku_c_uninitialized(DBC* c) {
    BOOL uninitialized = toku_brt_cursor_uninitialized(dbc_struct_i(c)->c);
    return uninitialized;
}
Yoni Fogel's avatar
Yoni Fogel committed
2439

2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450
// Context handed (as the `extra` pointer) to c_get_wrapper_callback so it can
// copy a key/value pair found by a cursor operation into the caller's DBTs.
// Filled in by query_context_wrapped_init().
typedef struct query_context_wrapped_t {
    DBT               *key;   // caller's output DBT for the key
    DBT               *val;   // caller's output DBT for the value
    struct simple_dbt *skey;  // the cursor's skey, passed to toku_dbt_set along with key
    struct simple_dbt *sval;  // the cursor's sval, passed to toku_dbt_set along with val
} *QUERY_CONTEXT_WRAPPED, QUERY_CONTEXT_WRAPPED_S;

// Fill in a wrapped query context: the caller's output DBTs (key, val) plus
// the skey/sval simple_dbt pointers taken from cursor c.
static inline void
query_context_wrapped_init(QUERY_CONTEXT_WRAPPED context, DBC *c, DBT *key, DBT *val) {
    context->skey = dbc_struct_i(c)->skey;
    context->sval = dbc_struct_i(c)->sval;
    context->key  = key;
    context->val  = val;
}

// Cursor callback: copy the found key and then the found value into the
// output DBTs recorded in the QUERY_CONTEXT_WRAPPED passed as extra.
// Returns the first nonzero result of toku_dbt_set (the value is not copied
// if copying the key fails), or 0 on success.
static int
c_get_wrapper_callback(DBT const *key, DBT const *val, void *extra) {
    QUERY_CONTEXT_WRAPPED context = extra;
    int r = toku_dbt_set(key->size, key->data, context->key, context->skey);
    if (r != 0) { return r; }
    return toku_dbt_set(val->size, val->data, context->val, context->sval);
}

// Fetch the pair at the cursor's current position into key/val by running
// toku_c_getf_current_binding() with c_get_wrapper_callback as the callback.
// Returns that call's result.
static int toku_c_get_current_unconditional(DBC* c, u_int32_t flags, DBT* key, DBT* val) {
    QUERY_CONTEXT_WRAPPED_S context;
    query_context_wrapped_init(&context, c, key, val);
    return toku_c_getf_current_binding(c, flags, c_get_wrapper_callback, &context);
}

// Replace the component *get_flag inside the combined word *flag with
// new_flag: the old component is subtracted from *flag, *get_flag is set to
// new_flag, and the new component is added back in.
static inline void toku_swap_flag(u_int32_t* flag, u_int32_t* get_flag,
                                  u_int32_t new_flag) {
    *flag     = *flag - *get_flag;   // strip the old component
    *get_flag = new_flag;
    *flag     = *flag + *get_flag;   // add the new component
}

Yoni Fogel's avatar
Yoni Fogel committed
2479 2480 2481 2482 2483 2484 2485
/*
    Used for partial implementation of nested transactions.
    Work is done by children as normal, but all locking is done by the
    root of the nested txn tree.
    This may hold extra locks, and will not work as expected when
    a node has two non-completed txns at any time.
*/
2486
static inline DB_TXN* toku_txn_ancestor(DB_TXN* txn) {
Yoni Fogel's avatar
Yoni Fogel committed
2487
    while (txn && txn->parent) txn = txn->parent;
Yoni Fogel's avatar
Yoni Fogel committed
2488

Yoni Fogel's avatar
Yoni Fogel committed
2489 2490 2491
    return txn;
}

Yoni Fogel's avatar
Yoni Fogel committed
2492 2493
// Forward declaration so that code below can call this before its definition.
static int toku_txn_add_lt(DB_TXN* txn, toku_lock_tree* lt);

Yoni Fogel's avatar
Yoni Fogel committed
2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504
/* c_get has many subfunctions with lots of parameters
 * this structure exists to simplify it. */
typedef struct {
    DBC*        c;                  // The cursor
    DB*         db;                 // db the cursor is iterating over
    DB_TXN*     txn_anc;            // The (root) ancestor of the transaction
    TXNID       id_anc;
    DBT         tmp_key;            // Temporary key to protect out param
    DBT         tmp_val;            // Temporary val to protect out param
    u_int32_t   flag;               // The c_get flag
    u_int32_t   op;                 // The operation portion of the c_get flag
Yoni Fogel's avatar
Yoni Fogel committed
2505
    u_int32_t   lock_flags;         // The prelock flags.
Yoni Fogel's avatar
Yoni Fogel committed
2506 2507 2508 2509 2510 2511 2512 2513 2514 2515
    BOOL        cursor_is_write;    // Whether op can change position of cursor
    BOOL        key_is_read;        
    BOOL        key_is_write;
    BOOL        val_is_read;
    BOOL        val_is_write;
    BOOL        duplicates;
    BOOL        tmp_key_malloced;
    BOOL        tmp_val_malloced;
} C_GET_VARS;

2516 2517

// Extract the prelock bits (DB_PRELOCKED, DB_PRELOCKED_WRITE) from flags.
// In addition, when db has a dname (per the original note: a user-data
// dictionary rather than an internal one) and txn is a DB_READ_UNCOMMITTED
// or DB_READ_COMMITTED transaction, DB_PRELOCKED is forced on, because such
// transactions 'own' all read locks for user-data dictionaries.
static inline u_int32_t get_prelocked_flags(u_int32_t flags, DB_TXN* txn, DB* db) {
    u_int32_t prelocked = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE);

    // Dictionaries without a dname never get DB_PRELOCKED added here.
    if (db->i->dname && txn) {
        if (db_txn_struct_i(txn)->flags & (DB_READ_UNCOMMITTED | DB_READ_COMMITTED)) {
            prelocked |= DB_PRELOCKED;
        }
    }
    return prelocked;
}

2533 2534 2535 2536
//TRUE when the database's brt flags lack TOKU_DB_DUPSORT (NODUP database),
//FALSE when TOKU_DB_DUPSORT is set.
static BOOL
db_is_nodup(DB *db) {
    unsigned int flags_of_brt;
    int rc = toku_brt_get_flags(db->i->brt, &flags_of_brt);
    assert(rc == 0);
    return (BOOL)((flags_of_brt & TOKU_DB_DUPSORT) == 0);
}
Yoni Fogel's avatar
Yoni Fogel committed
2543

2544 2545 2546 2547
//Cursor flavor of db_is_nodup: answers for the database the cursor belongs to.
static BOOL
c_db_is_nodup(DBC *c) {
    return db_is_nodup(c->dbp);
}

2550 2551
//Legacy (bdb-parity / test-compatibility) cursor get.
//The main operation in flag selects which toku_c_getf_* routine runs; the
//remaining flag bits are forwarded to it.  Results are delivered through
//c_get_wrapper_callback and a wrapped query context.
//When initializing the wrapped context, a NULL key or val means that
//parameter is NOT an output:
//    key and val both outputs:   query_context_wrapped_init(&ctx, c, key,  val);
//    only val is an output:      query_context_wrapped_init(&ctx, c, NULL, val);
//    neither is an output:       query_context_wrapped_init(&ctx, c, NULL, NULL); (DB_GET_BOTH)
//Returns EINVAL when the main operation is not recognized.
static int
toku_c_get(DBC* c, DBT* key, DBT* val, u_int32_t flag) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    const u_int32_t op          = get_main_cursor_flag(flag);
    const u_int32_t extra_flags = get_nonmain_cursor_flags(flag);
    QUERY_CONTEXT_WRAPPED_S wrapped;

    switch (op) {
        case DB_FIRST:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_first(c, extra_flags, c_get_wrapper_callback, &wrapped);
        case DB_LAST:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_last(c, extra_flags, c_get_wrapper_callback, &wrapped);
        case DB_NEXT:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_next(c, extra_flags, c_get_wrapper_callback, &wrapped);
        case DB_NEXT_DUP:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_next_dup(c, extra_flags, c_get_wrapper_callback, &wrapped);
        case DB_NEXT_NODUP:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_next_nodup(c, extra_flags, c_get_wrapper_callback, &wrapped);
        case DB_PREV:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_prev(c, extra_flags, c_get_wrapper_callback, &wrapped);
#ifdef DB_PREV_DUP
        case DB_PREV_DUP:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_prev_dup(c, extra_flags, c_get_wrapper_callback, &wrapped);
#endif
        case DB_PREV_NODUP:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_prev_nodup(c, extra_flags, c_get_wrapper_callback, &wrapped);
        case DB_CURRENT:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_current(c, extra_flags, c_get_wrapper_callback, &wrapped);
        case DB_CURRENT_BINDING:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_current_binding(c, extra_flags, c_get_wrapper_callback, &wrapped);

        case DB_SET:
            //key is input only
            query_context_wrapped_init(&wrapped, c, NULL, val);
            return toku_c_getf_set(c, extra_flags, key, c_get_wrapper_callback, &wrapped);
        case DB_SET_RANGE:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_set_range(c, extra_flags, key, c_get_wrapper_callback, &wrapped);
        case DB_SET_RANGE_REVERSE:
            query_context_wrapped_init(&wrapped, c, key, val);
            return toku_c_getf_set_range_reverse(c, extra_flags, key, c_get_wrapper_callback, &wrapped);

        case DB_GET_BOTH:
            //key and val are both input only
            query_context_wrapped_init(&wrapped, c, NULL, NULL);
            return toku_c_getf_get_both(c, extra_flags, key, val, c_get_wrapper_callback, &wrapped);
        case DB_GET_BOTH_RANGE:
            //On a nodup database DB_GET_BOTH_RANGE is an alias for DB_GET_BOTH,
            //so val is not an output there; otherwise val is an output.
            query_context_wrapped_init(&wrapped, c, NULL, c_db_is_nodup(c) ? NULL : val);
            return toku_c_getf_get_both_range(c, extra_flags, key, val, c_get_wrapper_callback, &wrapped);
        case DB_GET_BOTH_RANGE_REVERSE:
            //On a nodup database DB_GET_BOTH_RANGE_REVERSE is an alias for DB_GET_BOTH,
            //so val is not an output there; otherwise val is an output.
            query_context_wrapped_init(&wrapped, c, NULL, c_db_is_nodup(c) ? NULL : val);
            return toku_c_getf_get_both_range_reverse(c, extra_flags, key, val, c_get_wrapper_callback, &wrapped);

        default:
            break;
    }
    //Unrecognized operation
    return EINVAL;
}

2648 2649
//Runs toku_c_getf_first between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_first(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}

2652 2653 2654
//Runs toku_c_getf_last between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_last(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2655

2656 2657
//Runs toku_c_getf_next between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_next(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}

2660 2661 2662
//Runs toku_c_getf_next_nodup between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_next_nodup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_next_nodup(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2663

2664 2665 2666
//Runs toku_c_getf_next_dup between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_next_dup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_next_dup(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2667

2668 2669
//Runs toku_c_getf_prev between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_prev(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}

2672 2673 2674
//Runs toku_c_getf_prev_nodup between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_prev_nodup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_prev_nodup(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2675

2676 2677 2678
//Runs toku_c_getf_prev_dup between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_prev_dup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_prev_dup(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2679

2680 2681
//Runs toku_c_getf_current between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_current(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}

2684 2685 2686
//Runs toku_c_getf_current_binding between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_current_binding(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2687

2688 2689 2690
//Runs toku_c_getf_set between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_set(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_set(c, flag, key, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2691

2692 2693 2694
//Runs toku_c_getf_set_range between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_set_range(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_set_range(c, flag, key, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2695

2696 2697 2698 2699
//Runs toku_c_getf_set_range_reverse between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_set_range_reverse(c, flag, key, f, extra);
    toku_ydb_unlock();
    return r;
}

2700 2701 2702
//Runs toku_c_getf_get_both between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_get_both(DBC *c, u_int32_t flag, DBT * key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_get_both(c, flag, key, val, f, extra);
    toku_ydb_unlock();
    return r;
}
2703

2704 2705 2706
//Runs toku_c_getf_get_both_range between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_get_both_range(DBC *c, u_int32_t flag, DBT * key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_get_both_range(c, flag, key, val, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
2707

2708 2709 2710 2711
//Runs toku_c_getf_get_both_range_reverse between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int locked_c_getf_get_both_range_reverse(DBC *c, u_int32_t flag, DBT * key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    toku_ydb_lock();
    r = toku_c_getf_get_both_range_reverse(c, flag, key, val, f, extra);
    toku_ydb_unlock();
    return r;
}

2712 2713 2714 2715 2716 2717 2718 2719 2720 2721
//Describes one range lock to acquire: [ (left_key,left_val), (right_key,right_val) ]
//in the lock tree lt of db, on behalf of txn.
typedef struct {
    BOOL            is_read_lock;   // TRUE: read lock, FALSE: write lock
    DB_TXN         *txn;            // Transaction requesting the lock
    DB             *db;             // Database the range belongs to
    toku_lock_tree *lt;             // Lock tree of db
    DBT const      *left_key;       // Left endpoint of the range
    DBT const      *left_val;
    DBT const      *right_key;      // Right endpoint of the range
    DBT const      *right_val;
} RANGE_LOCK_REQUEST_S;
typedef RANGE_LOCK_REQUEST_S *RANGE_LOCK_REQUEST;
Yoni Fogel's avatar
Yoni Fogel committed
2722

2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739
//Fill in every field of *request.  The lock tree is taken from db (db->i->lt).
static void
range_lock_request_init(RANGE_LOCK_REQUEST request,
                        BOOL       is_read_lock,
                        DB_TXN    *txn,
                        DB        *db,
                        DBT const *left_key,
                        DBT const *left_val,
                        DBT const *right_key,
                        DBT const *right_val) {
    RANGE_LOCK_REQUEST_S filled = {
        .is_read_lock = is_read_lock,
        .txn          = txn,
        .db           = db,
        .lt           = db->i->lt,
        .left_key     = left_key,
        .left_val     = left_val,
        .right_key    = right_key,
        .right_val    = right_val,
    };
    *request = filled;
}

Yoni Fogel's avatar
Yoni Fogel committed
2742

2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753
//Initialize *request as a READ lock request over the given range.
static void
read_lock_request_init(RANGE_LOCK_REQUEST request,
                       DB_TXN    *txn,
                       DB        *db,
                       DBT const *left_key,
                       DBT const *left_val,
                       DBT const *right_key,
                       DBT const *right_val) {
    const BOOL want_read_lock = TRUE;
    range_lock_request_init(request, want_read_lock, txn, db, left_key, left_val, right_key, right_val);
}

2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775
//Initialize *request as a WRITE lock request over the given range.
static void
write_lock_request_init(RANGE_LOCK_REQUEST request,
                        DB_TXN    *txn,
                        DB        *db,
                        DBT const *left_key,
                        DBT const *left_val,
                        DBT const *right_key,
                        DBT const *right_val) {
    const BOOL want_read_lock = FALSE;
    range_lock_request_init(request, want_read_lock, txn, db, left_key, left_val, right_key, right_val);
}

//Acquire the range lock described by request.
//The lock is taken in the name of the root ancestor of request->txn: the
//lock tree is first registered with that root txn (toku_txn_add_lt), then a
//read or write range lock is acquired under the root's TXNID.
//Returns 0 on success, otherwise the error from whichever step failed.
static int
grab_range_lock(RANGE_LOCK_REQUEST request) {
    //TODO: (Multithreading) Grab lock protecting lock tree
    DB_TXN *root_txn = toku_txn_ancestor(request->txn);
    int r = toku_txn_add_lt(root_txn, request->lt);
    if (r != 0) {
        //TODO: (Multithreading) Release lock protecting lock tree
        return r;
    }

    TXNID root_id = toku_txn_get_txnid(db_txn_struct_i(root_txn)->tokutxn);
    if (request->is_read_lock) {
        r = toku_lt_acquire_range_read_lock(request->lt, request->db, root_id,
                                            request->left_key,  request->left_val,
                                            request->right_key, request->right_val);
    }
    else {
        r = toku_lt_acquire_range_write_lock(request->lt, request->db, root_id,
                                             request->left_key,  request->left_val,
                                             request->right_key, request->right_val);
    }
    //TODO: (Multithreading) Release lock protecting lock tree
    return r;
}

2790 2791
//This is the user level callback function given to ydb layer functions like
//toku_c_getf_first
Yoni Fogel's avatar
Yoni Fogel committed
2792

2793 2794 2795 2796
//Single-field wrapper used to pass the "is this a write operation" choice
//to query_context_base_init as its own distinct type.
struct __toku_is_write_op {
    BOOL is_write_op;   // TRUE when the query is part of a write operation
};
typedef struct __toku_is_write_op WRITE_OP;

2797 2798 2799 2800 2801 2802
typedef struct query_context_base_t {
    BRT_CURSOR  c;
    DB_TXN     *txn;
    DB         *db;
    void       *f_extra;
    int         r_user_callback;
2803 2804
    BOOL        do_locking;
    BOOL        is_write_op;
2805
} *QUERY_CONTEXT_BASE, QUERY_CONTEXT_BASE_S;
2806

2807 2808 2809 2810
//Query context for operations that take no input key/val.
//base must remain the first member.
struct query_context_t {
    QUERY_CONTEXT_BASE_S  base;
    YDB_CALLBACK_FUNCTION f;      // User callback invoked on the found pair
};
typedef struct query_context_t  QUERY_CONTEXT_S;
typedef QUERY_CONTEXT_S        *QUERY_CONTEXT;
Yoni Fogel's avatar
Yoni Fogel committed
2811

2812 2813 2814 2815 2816 2817
//Query context for operations that also carry the caller's input key/val.
//base must remain the first member.
struct query_context_with_input_t {
    QUERY_CONTEXT_BASE_S  base;
    YDB_CALLBACK_FUNCTION f;          // User callback invoked on the found pair
    DBT                  *input_key;  // Key supplied by the caller
    DBT                  *input_val;  // Val supplied by the caller
};
typedef struct query_context_with_input_t  QUERY_CONTEXT_WITH_INPUT_S;
typedef QUERY_CONTEXT_WITH_INPUT_S        *QUERY_CONTEXT_WITH_INPUT;
Yoni Fogel's avatar
Yoni Fogel committed
2818

2819 2820

static void
2821
query_context_base_init(QUERY_CONTEXT_BASE context, DBC *c, u_int32_t flag, WRITE_OP is_write_op, void *extra) {
2822 2823
    context->c       = dbc_struct_i(c)->c;
    context->txn     = dbc_struct_i(c)->txn;
2824 2825
    context->db      = c->dbp;
    context->f_extra = extra;
2826
    context->is_write_op = is_write_op.is_write_op;
2827
    u_int32_t lock_flags = get_prelocked_flags(flag, dbc_struct_i(c)->txn, c->dbp);
Yoni Fogel's avatar
Yoni Fogel committed
2828
    flag &= ~lock_flags;
2829
    if (context->is_write_op) lock_flags &= DB_PRELOCKED_WRITE; // Only care about whether already locked for write
Yoni Fogel's avatar
Yoni Fogel committed
2830
    assert(flag==0);
2831
    context->do_locking = (BOOL)(context->db->i->lt!=NULL && !lock_flags);
2832
    context->r_user_callback = 0;
Yoni Fogel's avatar
Yoni Fogel committed
2833 2834
}

2835 2836
//Initialize a query context for a read (non-write) operation with user callback f.
static void
query_context_init(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    WRITE_OP read_op = { .is_write_op = FALSE };
    context->f = f;
    query_context_base_init(&context->base, c, flag, read_op, extra);
}

//Initialize a query context for a write operation with user callback f.
static void
query_context_init_write_op(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    WRITE_OP write_op = { .is_write_op = TRUE };
    context->f = f;
    query_context_base_init(&context->base, c, flag, write_op, extra);
}
Yoni Fogel's avatar
Yoni Fogel committed
2848

2849 2850
//Initialize a read (non-write) query context that also records the caller's
//input key and val.
static void
query_context_with_input_init(QUERY_CONTEXT_WITH_INPUT context, DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    WRITE_OP read_op = { .is_write_op = FALSE };
    context->input_key = key;
    context->input_val = val;
    context->f         = f;
    query_context_base_init(&context->base, c, flag, read_op, extra);
}

2858 2859 2860 2861 2862
static int c_del_callback(DBT const *key, DBT const *val, void *extra);

//Delete the pair the cursor currently points at.
//Accepted flags: DB_DELETE_ANY (delete regardless of whether the pair exists
//in the db; forwarded to the brt layer) plus the prelock flags.  Any other
//bit yields EINVAL.
//Unless the db has no lock tree or DB_PRELOCKED_WRITE is given, a write lock
//on the current pair is taken (through c_del_callback) before deleting.
static int
toku_c_del(DBC * c, u_int32_t flags) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    const u_int32_t brt_flag      = flags & DB_DELETE_ANY;
    const u_int32_t prelock_flags = get_prelocked_flags(flags, dbc_struct_i(c)->txn, c->dbp);
    const u_int32_t leftover      = (flags & ~brt_flag) & ~prelock_flags;
    if (leftover != 0) return EINVAL;

    BOOL need_write_lock = (BOOL)(c->dbp->i->lt && !(prelock_flags&DB_PRELOCKED_WRITE));
    if (need_write_lock) {
        QUERY_CONTEXT_S lock_context;
        query_context_init_write_op(&lock_context, c, prelock_flags, NULL, NULL);
        //DB_PRELOCKED: no read lock is needed here, we must already hold it.
        int r = toku_c_getf_current_binding(c, DB_PRELOCKED, c_del_callback, &lock_context);
        if (r != 0) return r;
    }

    //Perform the delete itself.
    TOKUTXN txn = dbc_struct_i(c)->txn ? db_txn_struct_i(dbc_struct_i(c)->txn)->tokutxn : 0;
    return toku_brt_cursor_delete(dbc_struct_i(c)->c, brt_flag, txn);
}
2891

2892 2893 2894 2895 2896
//Callback given by toku_c_del to toku_c_getf_current_binding.
//Acquires a write lock on exactly the pair (key, val) the cursor is bound to:
//    left(key,val) == right(key,val) == (key, val)
//extra is the QUERY_CONTEXT_S that toku_c_del initialized with
//query_context_init_write_op; only its base part is used.
//Returns the result of grab_range_lock (0 on success).
static int
c_del_callback(DBT const *key, DBT const *val, void *extra) {
    //toku_c_del passes a QUERY_CONTEXT_S, so interpret extra as that type
    //(not as the larger QUERY_CONTEXT_WITH_INPUT_S).
    QUERY_CONTEXT      super_context = extra;
    QUERY_CONTEXT_BASE context       = &super_context->base;

    assert(context->do_locking);
    assert(context->is_write_op);
    assert(key!=NULL);
    assert(val!=NULL);

    RANGE_LOCK_REQUEST_S request;
    write_lock_request_init(&request, context->txn, context->db,
                            key, val,
                            key, val);

    //Give brt-layer an error (if any) to return from toku_c_getf_current_binding
    return grab_range_lock(&request);
}

2916 2917 2918 2919 2920
static int c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Position the cursor on the first pair and hand it to f(key, val, extra).
//If f fails, its own return value is returned rather than
//TOKUDB_USER_CALLBACK_ERROR.
static int
toku_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    num_point_queries++;   // query accounting

    QUERY_CONTEXT_S query;
    query_context_init(&query, c, flag, f, extra);
    //On a successful query the brt layer invokes c_getf_first_callback(..., &query)
    int r = toku_brt_cursor_first(dbc_struct_i(c)->c, c_getf_first_callback, &query);
    return (r == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : r;
}

//brt-layer callback for toku_c_getf_first.  key==NULL means nothing was found.
//When locking is enabled, read-locks (-infinity, found pair], or the whole
//range (-infinity, +infinity) when nothing was found.
//Then, if a pair was found and locking succeeded, calls the user callback.
//Returns the locking error, TOKUDB_USER_CALLBACK_ERROR if the user callback
//returned non-zero (its value is kept in r_user_callback), else 0.
static int
c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;
    const int          found = (key != NULL);

    DBT found_key;
    DBT found_val;
    toku_fill_dbt(&found_key, key, keylen);
    toku_fill_dbt(&found_val, val, vallen);

    if (base->do_locking) {
        const DBT *right_key = found ? &found_key : toku_lt_infinity;
        const DBT *right_val = found ? &found_val : toku_lt_infinity;
        RANGE_LOCK_REQUEST_S request;
        read_lock_request_init(&request, base->txn, base->db,
                               toku_lt_neg_infinity, toku_lt_neg_infinity,
                               right_key,            right_val);
        int lock_r = grab_range_lock(&request);
        //Give brt-layer the error to return from toku_brt_cursor_first
        if (lock_r != 0) return lock_r;
    }
    if (!found) return 0;

    //Found and locked: call the application-layer callback.
    base->r_user_callback = query->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

static int c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Position the cursor on the last pair and hand it to f(key, val, extra).
//If f fails, its own return value is returned rather than
//TOKUDB_USER_CALLBACK_ERROR.
static int
toku_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    num_point_queries++;   // query accounting

    QUERY_CONTEXT_S query;
    query_context_init(&query, c, flag, f, extra);
    //On a successful query the brt layer invokes c_getf_last_callback(..., &query)
    int r = toku_brt_cursor_last(dbc_struct_i(c)->c, c_getf_last_callback, &query);
    return (r == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : r;
}

//brt-layer callback for toku_c_getf_last.  key==NULL means nothing was found.
//When locking is enabled, read-locks [found pair, +infinity), or the whole
//range (-infinity, +infinity) when nothing was found.
//Then, if a pair was found and locking succeeded, calls the user callback.
//Returns the locking error, TOKUDB_USER_CALLBACK_ERROR if the user callback
//returned non-zero (its value is kept in r_user_callback), else 0.
static int
c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;
    const int          found = (key != NULL);

    DBT found_key;
    DBT found_val;
    toku_fill_dbt(&found_key, key, keylen);
    toku_fill_dbt(&found_val, val, vallen);

    if (base->do_locking) {
        const DBT *left_key = found ? &found_key : toku_lt_neg_infinity;
        const DBT *left_val = found ? &found_val : toku_lt_neg_infinity;
        RANGE_LOCK_REQUEST_S request;
        read_lock_request_init(&request, base->txn, base->db,
                               left_key,         left_val,
                               toku_lt_infinity, toku_lt_infinity);
        int lock_r = grab_range_lock(&request);
        //Give brt-layer the error to return from toku_brt_cursor_last
        if (lock_r != 0) return lock_r;
    }
    if (!found) return 0;

    //Found and locked: call the application-layer callback.
    base->r_user_callback = query->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

static int c_getf_next_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Advance the cursor to the next pair and hand it to f(key, val, extra).
//Delegates to toku_c_getf_next_nodup for a nodup database and to
//toku_c_getf_first when the cursor is uninitialized.
//If f fails, its own return value is returned rather than
//TOKUDB_USER_CALLBACK_ERROR.
static int
toku_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (c_db_is_nodup(c))        return toku_c_getf_next_nodup(c, flag, f, extra);
    if (toku_c_uninitialized(c)) return toku_c_getf_first(c, flag, f, extra);

    QUERY_CONTEXT_S query;
    num_sequential_queries++;   // query accounting
    query_context_init(&query, c, flag, f, extra);
    //On a successful query the brt layer invokes c_getf_next_callback(..., &query)
    int r = toku_brt_cursor_next(dbc_struct_i(c)->c, c_getf_next_callback, &query);
    return (r == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : r;
}

//brt-layer callback shared by toku_c_getf_next and toku_c_getf_next_nodup.
//key==NULL means nothing was found.
//When locking is enabled, read-locks from the pair reported by
//toku_brt_cursor_peek up to the found pair (or +infinity when nothing was
//found).
//Then, if a pair was found and locking succeeded, calls the user callback.
//Returns the locking error, TOKUDB_USER_CALLBACK_ERROR if the user callback
//returned non-zero (its value is kept in r_user_callback), else 0.
static int
c_getf_next_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;
    const int          found = (key != NULL);

    DBT found_key;
    DBT found_val;
    toku_fill_dbt(&found_key, key, keylen);
    toku_fill_dbt(&found_val, val, vallen);

    if (base->do_locking) {
        const DBT *left_key;
        const DBT *left_val;
        toku_brt_cursor_peek(base->c, &left_key, &left_val);

        RANGE_LOCK_REQUEST_S request;
        read_lock_request_init(&request, base->txn, base->db,
                               left_key, left_val,
                               found ? &found_key : toku_lt_infinity,
                               found ? &found_val : toku_lt_infinity);
        int lock_r = grab_range_lock(&request);
        //Give brt-layer the error to return from toku_brt_cursor_next
        if (lock_r != 0) return lock_r;
    }
    if (!found) return 0;

    //Found and locked: call the application-layer callback.
    base->r_user_callback = query->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

//Advance the cursor with toku_brt_cursor_next_nodup and hand the pair to
//f(key, val, extra).  Delegates to toku_c_getf_first when the cursor is
//uninitialized.
//If f fails, its own return value is returned rather than
//TOKUDB_USER_CALLBACK_ERROR.
static int
toku_c_getf_next_nodup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) return toku_c_getf_first(c, flag, f, extra);

    QUERY_CONTEXT_S query;
    num_sequential_queries++;   // query accounting
    query_context_init(&query, c, flag, f, extra);
    //On a successful query the brt layer invokes c_getf_next_callback(..., &query)
    int r = toku_brt_cursor_next_nodup(dbc_struct_i(c)->c, c_getf_next_callback, &query);
    return (r == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : r;
}

static int c_getf_next_dup_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Advance the cursor with toku_brt_cursor_next_dup and hand the pair to
//f(key, val, extra).  Returns EINVAL when the cursor is uninitialized.
//If f fails, its own return value is returned rather than
//TOKUDB_USER_CALLBACK_ERROR.
static int
toku_c_getf_next_dup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) {
        return EINVAL;
    }

    QUERY_CONTEXT_S query;
    num_sequential_queries++;   // query accounting
    query_context_init(&query, c, flag, f, extra);
    //On a successful query the brt layer invokes c_getf_next_dup_callback(..., &query)
    int r = toku_brt_cursor_next_dup(dbc_struct_i(c)->c, c_getf_next_dup_callback, &query);
    return (r == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : r;
}

//brt-layer callback for toku_c_getf_next_dup.  key==NULL means nothing was found.
//When locking is enabled, read-locks within the key reported by
//toku_brt_cursor_peek: from that pair's val up to the found val (or +infinity
//when nothing was found).  The peeked key is used for both endpoints because
//the found key is the same key in this case.
//Then, if a pair was found and locking succeeded, calls the user callback.
//Returns the locking error, TOKUDB_USER_CALLBACK_ERROR if the user callback
//returned non-zero (its value is kept in r_user_callback), else 0.
static int
c_getf_next_dup_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;
    const int          found = (key != NULL);

    DBT found_key;
    DBT found_val;
    toku_fill_dbt(&found_key, key, keylen);
    toku_fill_dbt(&found_val, val, vallen);

    if (base->do_locking) {
        const DBT *cur_key;
        const DBT *cur_val;
        toku_brt_cursor_peek(base->c, &cur_key, &cur_val);

        RANGE_LOCK_REQUEST_S request;
        read_lock_request_init(&request, base->txn, base->db,
                               cur_key, cur_val,
                               cur_key, found ? &found_val : toku_lt_infinity);
        int lock_r = grab_range_lock(&request);
        //Give brt-layer the error to return from toku_brt_cursor_next_dup
        if (lock_r != 0) return lock_r;
    }
    if (!found) return 0;

    //Found and locked: call the application-layer callback.
    base->r_user_callback = query->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

static int c_getf_prev_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Step the cursor to the previous row and hand it to f.
//  - nodup databases are delegated to toku_c_getf_prev_nodup;
//  - an uninitialized cursor is delegated to toku_c_getf_last;
//  - otherwise toku_brt_cursor_prev is run with c_getf_prev_callback.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in the
// query context's r_user_callback is returned instead.
static int
toku_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    if (c_db_is_nodup(c))        return toku_c_getf_prev_nodup(c, flag, f, extra);
    if (toku_c_uninitialized(c)) return toku_c_getf_last(c, flag, f, extra);

    QUERY_CONTEXT_S query;      // per-call query state shared with the callback
    num_sequential_queries++;   // accountability
    query_context_init(&query, c, flag, f, extra);

    int rc = toku_brt_cursor_prev(dbc_struct_i(c)->c, c_getf_prev_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208
// Callback handed to toku_brt_cursor_prev / toku_brt_cursor_prev_nodup.
// key==NULL means the brt layer found no row.
// When locking is enabled, a read lock is requested on the range
//   left(key,val)  = found ? (found_key, found_val) : (-infinity, -infinity)
//   right(key,val) = the pair reported by toku_brt_cursor_peek
// The application callback is invoked only when a row was found and the
// lock (if any) was obtained.
static int
c_getf_prev_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    int rc = 0;
    if (base->do_locking) {
        RANGE_LOCK_REQUEST_S lock_req;
        const DBT *cur_key;
        const DBT *cur_val;
        const DBT *lower_key = toku_lt_neg_infinity;
        const DBT *lower_val = toku_lt_neg_infinity;
        if (key != NULL) {
            lower_key = &key_dbt;
            lower_val = &val_dbt;
        }

        toku_brt_cursor_peek(base->c, &cur_key, &cur_val);
        read_lock_request_init(&lock_req, base->txn, base->db,
                               lower_key, lower_val,
                               cur_key,   cur_val);
        rc = grab_range_lock(&lock_req);
    }

    //Nothing to report to the application: lock failure or no row.
    if (rc != 0 || key == NULL) return rc;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

3211 3212
// Step the cursor to the previous row using the nodup brt traversal and hand
// the row to f.  An uninitialized cursor is delegated to toku_c_getf_last.
// toku_brt_cursor_prev_nodup is run with c_getf_prev_callback; if it reports
// TOKUDB_USER_CALLBACK_ERROR, the value saved in r_user_callback is returned.
static int
toku_c_getf_prev_nodup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    if (toku_c_uninitialized(c)) return toku_c_getf_last(c, flag, f, extra);

    QUERY_CONTEXT_S query;      // per-call query state shared with the callback
    num_sequential_queries++;   // accountability
    query_context_init(&query, c, flag, f, extra);

    int rc = toku_brt_cursor_prev_nodup(dbc_struct_i(c)->c, c_getf_prev_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}
Yoni Fogel's avatar
Yoni Fogel committed
3227

3228 3229 3230 3231 3232
static int c_getf_prev_dup_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Run toku_brt_cursor_prev_dup on the cursor with c_getf_prev_dup_callback
// and hand the resulting row to f.
// Returns EINVAL when the cursor is uninitialized.  If the brt layer reports
// TOKUDB_USER_CALLBACK_ERROR, the value saved in r_user_callback is returned.
static int
toku_c_getf_prev_dup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) {
        return EINVAL;
    }

    QUERY_CONTEXT_S query;      // per-call query state shared with the callback
    num_sequential_queries++;   // accountability
    query_context_init(&query, c, flag, f, extra);

    int rc = toku_brt_cursor_prev_dup(dbc_struct_i(c)->c, c_getf_prev_dup_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_prev_dup.
// key==NULL means the brt layer found no row.
// When locking is enabled, a read lock is requested on the range
//   left(key,val)  = (peeked key, found ? found_val : -infinity)
//   right(key,val) = the pair reported by toku_brt_cursor_peek
// The application callback is invoked only when a row was found and the
// lock (if any) was obtained.
static int
c_getf_prev_dup_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    int rc = 0;
    if (base->do_locking) {
        RANGE_LOCK_REQUEST_S lock_req;
        const DBT *cur_key;
        const DBT *cur_val;
        const DBT *lower_val;
        if (key == NULL) lower_val = toku_lt_neg_infinity;
        else             lower_val = &val_dbt;

        toku_brt_cursor_peek(base->c, &cur_key, &cur_val);
        //The found key equals the peeked key in the dup case, so the peeked key bounds both sides.
        read_lock_request_init(&lock_req, base->txn, base->db,
                               cur_key, lower_val,
                               cur_key, cur_val);
        rc = grab_range_lock(&lock_req);
    }

    //Nothing to report to the application: lock failure or no row.
    if (rc != 0 || key == NULL) return rc;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

static int c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Run toku_brt_cursor_current in DB_CURRENT mode with
// c_getf_current_callback and hand the resulting row to f.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in
// r_user_callback is returned instead.
static int
toku_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_S query;      // per-call query state shared with the callback
    num_sequential_queries++;   // accountability
    query_context_init(&query, c, flag, f, extra);

    int rc = toku_brt_cursor_current(dbc_struct_i(c)->c, DB_CURRENT, c_getf_current_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_current.
// key==NULL means the brt layer found no row; in that case 0 is returned
// without calling the application.  No range lock is requested here.
// Returns TOKUDB_USER_CALLBACK_ERROR when the application callback returned
// non-zero (its value is saved in r_user_callback).
static int
c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    if (key == NULL) return 0;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

// Same as toku_c_getf_current, except that toku_brt_cursor_current is run in
// DB_CURRENT_BINDING mode.  c_getf_current_callback delivers the row to f.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in
// r_user_callback is returned instead.
static int
toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_S query;      // per-call query state shared with the callback
    num_sequential_queries++;   // accountability
    query_context_init(&query, c, flag, f, extra);

    int rc = toku_brt_cursor_current(dbc_struct_i(c)->c, DB_CURRENT_BINDING, c_getf_current_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

static int c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Position the cursor via toku_brt_cursor_set(key, no value) and hand the
// resulting row to f through c_getf_set_callback.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in
// r_user_callback is returned instead.
static int
toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S query;   // per-call query state (carries the input key)
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, NULL, f, extra);

    int rc = toku_brt_cursor_set(dbc_struct_i(c)->c, key, NULL, c_getf_set_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_set by toku_c_getf_set.
// key==NULL means the brt layer found no row.
// When locking is enabled, a read lock is requested on the range
//   left(key,val)  = (input_key, -infinity)
//   right(key,val) = (input_key, found ? found_val : infinity)
// The application callback is invoked only when a row was found and the
// lock (if any) was obtained.
static int
c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    int rc = 0;
    if (base->do_locking) {
        RANGE_LOCK_REQUEST_S lock_req;
        const DBT *upper_val = (key != NULL) ? &val_dbt : toku_lt_infinity;
        read_lock_request_init(&lock_req, base->txn, base->db,
                               query->input_key, toku_lt_neg_infinity,
                               query->input_key, upper_val);
        rc = grab_range_lock(&lock_req);
    }

    //Nothing to report to the application: lock failure or no row.
    if (rc != 0 || key == NULL) return rc;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

static int c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Position the cursor via toku_brt_cursor_set_range(key) and hand the
// resulting row to f through c_getf_set_range_callback.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in
// r_user_callback is returned instead.
static int
toku_c_getf_set_range(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S query;   // per-call query state (carries the input key)
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, NULL, f, extra);

    int rc = toku_brt_cursor_set_range(dbc_struct_i(c)->c, key, c_getf_set_range_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_set_range.
// key==NULL means the brt layer found no row.
// When locking is enabled, a read lock is requested on the range
//   left(key,val)  = (input_key, -infinity)
//   right(key,val) = found ? (found_key, found_val) : (infinity, infinity)
// The application callback is invoked only when a row was found and the
// lock (if any) was obtained.
static int
c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    int rc = 0;
    if (base->do_locking) {
        RANGE_LOCK_REQUEST_S lock_req;
        const DBT *upper_key = toku_lt_infinity;
        const DBT *upper_val = toku_lt_infinity;
        if (key != NULL) {
            upper_key = &key_dbt;
            upper_val = &val_dbt;
        }
        read_lock_request_init(&lock_req, base->txn, base->db,
                               query->input_key, toku_lt_neg_infinity,
                               upper_key,        upper_val);
        rc = grab_range_lock(&lock_req);
    }

    //Nothing to report to the application: lock failure or no row.
    if (rc != 0 || key == NULL) return rc;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

3451 3452 3453 3454 3455 3456 3457 3458
static int c_getf_set_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Position the cursor via toku_brt_cursor_set_range_reverse(key) and hand the
// resulting row to f through c_getf_set_range_reverse_callback.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in
// r_user_callback is returned instead.
static int
toku_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S query;   // per-call query state (carries the input key)
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, NULL, f, extra);

    int rc = toku_brt_cursor_set_range_reverse(dbc_struct_i(c)->c, key, c_getf_set_range_reverse_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_set_range_reverse.
// key==NULL means the brt layer found no row.
// When locking is enabled, a read lock is requested on the range
//   left(key,val)  = found ? (found_key, found_val) : (-infinity, -infinity)
//   right(key,val) = (input_key, infinity)
// The application callback is invoked only when a row was found and the
// lock (if any) was obtained.
static int
c_getf_set_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    int rc = 0;
    if (base->do_locking) {
        RANGE_LOCK_REQUEST_S lock_req;
        const DBT *lower_key = toku_lt_neg_infinity;
        const DBT *lower_val = toku_lt_neg_infinity;
        if (key != NULL) {
            lower_key = &key_dbt;
            lower_val = &val_dbt;
        }
        read_lock_request_init(&lock_req, base->txn, base->db,
                               lower_key,        lower_val,
                               query->input_key, toku_lt_infinity);
        rc = grab_range_lock(&lock_req);
    }

    //Nothing to report to the application: lock failure or no row.
    if (rc != 0 || key == NULL) return rc;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

3510 3511 3512 3513 3514
static int c_getf_get_both_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Position the cursor on the exact (key, val) pair via
// toku_brt_cursor_set(key, val) and hand the resulting row to f through
// c_getf_get_both_callback.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in
// r_user_callback is returned instead.
static int
toku_c_getf_get_both(DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S query;   // per-call query state (carries the input key and val)
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, val, f, extra);

    int rc = toku_brt_cursor_set(dbc_struct_i(c)->c, key, val, c_getf_get_both_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_set by toku_c_getf_get_both.
// key==NULL means the brt layer found no row.
// When locking is enabled, a read lock is requested on the single point
//   left(key,val) = right(key,val) = (input_key, input_val)
// whether or not a row was found.
// The application callback is invoked only when a row was found and the
// lock (if any) was obtained.
static int
c_getf_get_both_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    int rc = 0;
    if (base->do_locking) {
        RANGE_LOCK_REQUEST_S lock_req;
        read_lock_request_init(&lock_req, base->txn, base->db,
                               query->input_key, query->input_val,
                               query->input_key, query->input_val);
        rc = grab_range_lock(&lock_req);
    }

    //Nothing to report to the application: lock failure or no row.
    if (rc != 0 || key == NULL) return rc;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

static int c_getf_get_both_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Range lookup on (key, val).  nodup databases are delegated to
// toku_c_getf_get_both; otherwise toku_brt_cursor_get_both_range is run with
// c_getf_get_both_range_callback, which hands the row to f.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in
// r_user_callback is returned instead.
static int
toku_c_getf_get_both_range(DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    if (c_db_is_nodup(c)) return toku_c_getf_get_both(c, flag, key, val, f, extra);

    QUERY_CONTEXT_WITH_INPUT_S query;   // per-call query state (carries the input key and val)
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, val, f, extra);

    int rc = toku_brt_cursor_get_both_range(dbc_struct_i(c)->c, key, val, c_getf_get_both_range_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_get_both_range.
// key==NULL means the brt layer found no row.
// When locking is enabled, a read lock is requested on the range
//   left(key,val)  = (input_key, input_val)
//   right(key,val) = (input_key, found ? found_val : infinity)
// The application callback is invoked only when a row was found and the
// lock (if any) was obtained.
static int
c_getf_get_both_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    int rc = 0;
    if (base->do_locking) {
        RANGE_LOCK_REQUEST_S lock_req;
        const DBT *upper_val = (key != NULL) ? &val_dbt : toku_lt_infinity;
        read_lock_request_init(&lock_req, base->txn, base->db,
                               query->input_key, query->input_val,
                               query->input_key, upper_val);
        rc = grab_range_lock(&lock_req);
    }

    //Nothing to report to the application: lock failure or no row.
    if (rc != 0 || key == NULL) return rc;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}

3622 3623 3624 3625 3626 3627 3628 3629 3630 3631
static int c_getf_get_both_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

// Reverse range lookup on (key, val).  nodup databases are delegated to
// toku_c_getf_get_both; otherwise toku_brt_cursor_get_both_range_reverse is
// run with c_getf_get_both_range_reverse_callback, which hands the row to f.
// If the brt layer reports TOKUDB_USER_CALLBACK_ERROR, the value saved in
// r_user_callback is returned instead.
static int
toku_c_getf_get_both_range_reverse(DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    if (c_db_is_nodup(c)) return toku_c_getf_get_both(c, flag, key, val, f, extra);

    QUERY_CONTEXT_WITH_INPUT_S query;   // per-call query state (carries the input key and val)
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, val, f, extra);

    int rc = toku_brt_cursor_get_both_range_reverse(dbc_struct_i(c)->c, key, val, c_getf_get_both_range_reverse_callback, &query);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_get_both_range_reverse.
// key==NULL means the brt layer found no row.
// When locking is enabled, a read lock is requested on the range
//   left(key,val)  = (input_key, found ? found_val : -infinity)
//   right(key,val) = (input_key, input_val)
// The application callback is invoked only when a row was found and the
// lock (if any) was obtained.
static int
c_getf_get_both_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;

    DBT key_dbt;
    DBT val_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    toku_fill_dbt(&val_dbt, val, vallen);

    int rc = 0;
    if (base->do_locking) {
        RANGE_LOCK_REQUEST_S lock_req;
        const DBT *lower_val = (key != NULL) ? &val_dbt : toku_lt_neg_infinity;
        read_lock_request_init(&lock_req, base->txn, base->db,
                               query->input_key, lower_val,
                               query->input_key, query->input_val);
        rc = grab_range_lock(&lock_req);
    }

    //Nothing to report to the application: lock failure or no row.
    if (rc != 0 || key == NULL) return rc;

    base->r_user_callback = query->f(&key_dbt, &val_dbt, base->f_extra);
    return base->r_user_callback ? TOKUDB_USER_CALLBACK_ERROR : 0;
}
3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696

// Wrapper around toku_c_getf_heaviside that holds the ydb lock
// (toku_ydb_lock / toku_ydb_unlock) for the duration of the call.
static int
locked_c_getf_heaviside(DBC *c, u_int32_t flags,
                        YDB_HEAVISIDE_CALLBACK_FUNCTION f, void *extra_f,
                        YDB_HEAVISIDE_FUNCTION h, void *extra_h, int direction) {
    int rc;
    toku_ydb_lock();
    rc = toku_c_getf_heaviside(c, flags, f, extra_f, h, extra_h, direction);
    toku_ydb_unlock();
    return rc;
}

// Query context for heaviside cursor lookups; passed as the 'extra' argument
// to c_getf_heaviside_callback.
typedef struct {
    QUERY_CONTEXT_BASE_S            base;    // common query state, filled in by query_context_base_init
    YDB_HEAVISIDE_CALLBACK_FUNCTION f;       // application callback invoked when a row is found
    HEAVI_WRAPPER                   wrapper; // heaviside wrapper; its direction and r_h fields are read by the callback
} *QUERY_CONTEXT_HEAVISIDE, QUERY_CONTEXT_HEAVISIDE_S;

// Fill in a heaviside query context: initialize the shared base part as a
// non-write operation, then record the application callback and the wrapper.
static void
query_context_heaviside_init(QUERY_CONTEXT_HEAVISIDE context, DBC *c, u_int32_t flag, YDB_HEAVISIDE_CALLBACK_FUNCTION f, void *extra, HEAVI_WRAPPER wrapper) {
    WRITE_OP not_a_write = {FALSE};

    query_context_base_init(&context->base, c, flag, not_a_write, extra);
    context->wrapper = wrapper;
    context->f       = f;
}

// Fill in a heaviside wrapper with the heaviside function, its extra
// argument and the search direction.
static void
heavi_wrapper_init(HEAVI_WRAPPER wrapper, int (*h)(const DBT *key, const DBT *value, void *extra_h), void *extra_h, int direction) {
    wrapper->direction = direction;
    wrapper->r_h       = direction; //Default value of r_h (may be set to 0 later).
    wrapper->extra_h   = extra_h;
    wrapper->h         = h;
}

static int c_getf_heaviside_callback(ITEMLEN found_keylen, bytevec found_key, ITEMLEN found_vallen, bytevec found_val,
                                     ITEMLEN next_keylen,  bytevec next_key,  ITEMLEN next_vallen,  bytevec next_val,
                                     void *extra);

// Heaviside cursor lookup.
//   f / extra_f : application callback and its extra argument
//   h / extra_h : heaviside function and its extra argument
//   direction   : search direction stored in the wrapper
// Builds the wrapper and query context, then runs toku_brt_cursor_heaviside
// with c_getf_heaviside_callback.  If the brt layer reports
// TOKUDB_USER_CALLBACK_ERROR, the value saved in r_user_callback is returned.
static int
toku_c_getf_heaviside(DBC *c, u_int32_t flag,
                      YDB_HEAVISIDE_CALLBACK_FUNCTION f, void *extra_f,
                      YDB_HEAVISIDE_FUNCTION h, void *extra_h,
                      int direction) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    num_point_queries++;   // accountability

    HEAVI_WRAPPER_S heavi;
    heavi_wrapper_init(&heavi, h, extra_h, direction);

    QUERY_CONTEXT_HEAVISIDE_S query;    // per-call query state shared with the callback
    query_context_heaviside_init(&query, c, flag, f, extra_f, &heavi);

    int rc = toku_brt_cursor_heaviside(dbc_struct_i(c)->c, c_getf_heaviside_callback, &query, &heavi);
    return (rc == TOKUDB_USER_CALLBACK_ERROR) ? query.base.r_user_callback : rc;
}

// Callback handed to toku_brt_cursor_heaviside.
// A NULL bytevec means "not found": found_keyvec==NULL -> no matching row,
// next_keyvec==NULL -> no neighbouring row.
// When locking is enabled, a read lock is requested on the range spanned by
// the found row and the neighbouring ("next") row; a missing endpoint is
// replaced by -infinity (left) or +infinity (right).  With direction<0 the
// found row is the left endpoint and the next row the right one; otherwise
// the roles are swapped.
// When BRT_LEVEL_STRADDLE_CALLBACK_LOGIC_NOT_READY is defined, the next row is
// looked up here with a temporary cursor instead of being taken from the
// arguments.
// The application callback runs only if a row was found and the lock (if any)
// was obtained.  Returns the first non-zero of: lock/lookup error or
// TOKUDB_USER_CALLBACK_ERROR, then the temporary cursor's close result.
static int
c_getf_heaviside_callback(ITEMLEN found_keylen, bytevec found_keyvec, ITEMLEN found_vallen, bytevec found_valvec,
                          ITEMLEN next_keylen,  bytevec next_keyvec,  ITEMLEN next_vallen,  bytevec next_valvec,
                          void *extra) {
    QUERY_CONTEXT_HEAVISIDE query = extra;
    QUERY_CONTEXT_BASE      base  = &query->base;

    int rc       = 0;   // result handed back to the brt layer
    int close_rc = 0;   // result of closing the temporary cursor (only set in the #ifdef path)

    DBT found_key;
    DBT found_val;
    toku_fill_dbt(&found_key, found_keyvec, found_keylen);
    toku_fill_dbt(&found_val, found_valvec, found_vallen);

    if (base->do_locking) {
        const DBT *left_key  = toku_lt_neg_infinity;
        const DBT *left_val  = toku_lt_neg_infinity;
        const DBT *right_key = toku_lt_infinity;
        const DBT *right_val = toku_lt_infinity;
        int found_is_left;
        int have_left;
        int have_right;
        RANGE_LOCK_REQUEST_S lock_req;
#ifdef  BRT_LEVEL_STRADDLE_CALLBACK_LOGIC_NOT_READY
        BOOL have_match = (BOOL)(found_keyvec != NULL);
        DBC *scratch; //Temporary cursor used to locate 'next_key/next_val'
        DBT scratch_key;
        DBT scratch_val;
        u_int32_t step_op;  //operation that moves from the found row to its neighbour
        u_int32_t edge_op;  //operation used when nothing was found
        toku_init_dbt(&scratch_key);
        toku_init_dbt(&scratch_val);
        rc = toku_db_cursor(base->db, base->txn, &scratch, 0, 0);
        if (rc != 0) goto tmp_cleanup;
        //All relevant range locking is done below, so the sub-queries must not lock:
        //every one of them is issued with DB_PRELOCKED.
        if (query->wrapper->direction < 0) {
            step_op = DB_NEXT;
            edge_op = DB_FIRST;
        }
        else {
            step_op = DB_PREV;
            edge_op = DB_LAST;
        }
        if (have_match) {
            //Put the temporary cursor on the found pair, then step to its neighbour.
            rc = toku_c_getf_get_both(scratch, DB_PRELOCKED, &found_key, &found_val, ydb_getf_do_nothing, NULL);
            if (rc == 0) {
                rc = toku_c_get(scratch, &scratch_key, &scratch_val, step_op|DB_PRELOCKED);
                if (rc == DB_NOTFOUND) rc = 0;
            }
        }
        else {
            //Nothing found: the neighbour is the first (or last) row.
            rc = toku_c_get(scratch, &scratch_key, &scratch_val, edge_op|DB_PRELOCKED);
            if (rc == DB_NOTFOUND) rc = 0;
        }
        if (rc != 0) goto temp_cursor_cleanup;
        next_keyvec = scratch_key.data;
        next_keylen = scratch_key.size;
        next_valvec = scratch_val.data;
        next_vallen = scratch_val.size;
#endif
        DBT next_key;
        DBT next_val;
        toku_fill_dbt(&next_key, next_keyvec, next_keylen);
        toku_fill_dbt(&next_val, next_valvec, next_vallen);

        //Decide which row supplies which endpoint; absent rows keep the infinity defaults.
        found_is_left = (query->wrapper->direction < 0);
        have_left     = found_is_left ? (found_keyvec != NULL) : (next_keyvec  != NULL);
        have_right    = found_is_left ? (next_keyvec  != NULL) : (found_keyvec != NULL);
        if (have_left) {
            left_key = found_is_left ? &found_key : &next_key;
            left_val = found_is_left ? &found_val : &next_val;
        }
        if (have_right) {
            right_key = found_is_left ? &next_key : &found_key;
            right_val = found_is_left ? &next_val : &found_val;
        }
        read_lock_request_init(&lock_req, base->txn, base->db,
                               left_key,   left_val,
                               right_key,  right_val);
        rc = grab_range_lock(&lock_req);
#ifdef  BRT_LEVEL_STRADDLE_CALLBACK_LOGIC_NOT_READY
temp_cursor_cleanup:
        //Release the temporary cursor.
        close_rc = toku_c_close(scratch);
#endif
    }

    //Call application-layer callback if found and locks were successfully obtained.
    if (rc == 0 && found_keyvec != NULL) {
        base->r_user_callback = query->f(&found_key, &found_val, base->f_extra, query->wrapper->r_h);
        if (base->r_user_callback) rc = TOKUDB_USER_CALLBACK_ERROR;
    }

#ifdef  BRT_LEVEL_STRADDLE_CALLBACK_LOGIC_NOT_READY
tmp_cleanup:
#endif
    //Give brt-layer an error (if any) to return from toku_brt_cursor_heaviside
    return rc ? rc : close_rc;
}

3862
// Close a cursor.
// After the panic / parent-txn checks pass, the underlying brt cursor is closed,
// the cursor's private key/value buffers (skey_s, sval_s) are cleaned up, and the
// cursor memory is freed.
// Returns the result of toku_brt_cursor_close(); the memory is released either way.
static int toku_c_close(DBC * c) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    int close_result = toku_brt_cursor_close(dbc_struct_i(c)->c);

    toku_sdbt_cleanup(&dbc_struct_i(c)->skey_s);
    toku_sdbt_cleanup(&dbc_struct_i(c)->sval_s);
#if !TOKUDB_NATIVE_H
    // the internal struct was allocated separately in this configuration
    toku_free(dbc_struct_i(c));
#endif
    toku_free(c);

    return close_result;
}

3875 3876 3877 3878 3879
// Returns nonzero iff a and b compare equal under the key comparison
// function of the DB that cursor c belongs to.
static inline int keyeq(DBC *c, DBT *a, DBT *b) {
    DB *owner = c->dbp;
    return 0 == owner->i->brt->compare_fun(owner, a, b);
}

3880 3881 3882 3883
// Count the entries whose key matches the key the cursor currently points to.
// The count is produced by positioning a second, private cursor on that key and
// stepping through its duplicates.
// Only prelocked flags are accepted; any other flag yields EINVAL.
// On success *count holds the number of matching entries (0 if the key can no
// longer be found) and 0 is returned.
static int 
toku_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags) {
    HANDLE_PANICKED_DB(cursor->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(cursor);
    int r;
    DBC *scan = 0;
    DBT currentkey;

    init_dbt_realloc(&currentkey);
    u_int32_t lock_flags = get_prelocked_flags(flags, dbc_struct_i(cursor)->txn, cursor->dbp);
    flags &= ~lock_flags;
    if (flags != 0) {
        r = EINVAL;
        goto finish;
    }

    // fetch the key at the current position
    r = toku_c_get_current_unconditional(cursor, lock_flags, &currentkey, NULL);
    if (r != 0) goto finish;

    //TODO: Optimization
    //if (do_locking) {
    //   do a lock from currentkey,-infinity to currentkey,infinity
    //   lock_flags |= DB_PRELOCKED
    //}

    r = toku_db_cursor(cursor->dbp, dbc_struct_i(cursor)->txn, &scan, 0, 0);
    if (r != 0) goto finish;

    *count = 0;
    r = toku_c_getf_set(scan, lock_flags, &currentkey, ydb_getf_do_nothing, NULL);
    if (r != 0) {
        /* success, the current key must be deleted and there are no more */
        r = 0;
        goto finish;
    }

    // one entry was found by the 'set'; every successful next_dup adds another
    do {
        *count += 1;
        r = toku_c_getf_next_dup(scan, lock_flags, ydb_getf_do_nothing, NULL);
    } while (r == 0);
    r = 0; /* success, we found at least one before the end */

finish:
    if (currentkey.data) toku_free(currentkey.data);
    if (scan) {
        int rr = toku_c_close(scan);
        assert(rr == 0);
    }
    return r;
}

Yoni Fogel's avatar
 
Yoni Fogel committed
3929

3930 3931
///////////
//db_getf_XXX is equivalent to c_getf_XXX, without a persistent cursor
Yoni Fogel's avatar
 
Yoni Fogel committed
3932

3933 3934
// Cursor-less form of c_getf_set: opens a temporary cursor on db, runs
// toku_c_getf_set() with it and closes the cursor again.
// Returns the first nonzero result among cursor creation, the lookup and the close.
static int
db_getf_set(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    DBC *tmp;
    int r = toku_db_cursor(db, txn, &tmp, 0, 1);
    if (r != 0) return r;

    r = toku_c_getf_set(tmp, flags, key, f, extra);
    int close_r = toku_c_close(tmp);
    return r ? r : close_r;
}

3947 3948
// Cursor-less form of c_getf_get_both: opens a temporary cursor on db, runs
// toku_c_getf_get_both() with it and closes the cursor again.
// Returns the first nonzero result among cursor creation, the lookup and the close.
static int
db_getf_get_both(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    DBC *tmp;
    int r = toku_db_cursor(db, txn, &tmp, 0, 1);
    if (r != 0) return r;

    r = toku_c_getf_get_both(tmp, flags, key, val, f, extra);
    int close_r = toku_c_close(tmp);
    return r ? r : close_r;
}
3960
////////////
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
3961

3962 3963
// Delete 'key' from db.
// Accepted flags: DB_DELETE_ANY plus the prelocked flags; anything else is EINVAL.
// Unless DB_DELETE_ANY is given, the key is looked up first and the lookup's
// error is returned if it fails.
// Every call bumps num_deletes; every failing call also bumps num_deletes_fail.
static int
toku_db_del(DB *db, DB_TXN *txn, DBT *key, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    num_deletes++;       // accountability 

    int r = 0;
    //DB_DELETE_ANY means delete regardless of whether it exists in the db.
    BOOL must_exist = (BOOL)((flags & DB_DELETE_ANY) == 0);
    u_int32_t lock_flags = get_prelocked_flags(flags, txn, db);
    // whatever remains after removing the supported flags is unsupported
    u_int32_t leftover = flags & ~DB_DELETE_ANY & ~lock_flags;
    BOOL need_lock = (BOOL)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE));

    if (leftover != 0) {
        r = EINVAL;
        goto done;
    }
    if (must_exist) {
        //Check if the key exists in the db.
        r = db_getf_set(db, txn, lock_flags, key, ydb_getf_do_nothing, NULL);
        if (r != 0) goto done;
    }
    if (need_lock) {
        //Write-lock every value under this key: (key,-infinity) .. (key,+infinity)
        RANGE_LOCK_REQUEST_S request;
        write_lock_request_init(&request, txn, db,
                                key, toku_lt_neg_infinity,
                                key, toku_lt_infinity);
        r = grab_range_lock(&request);
        if (r != 0) goto done;
    }
    //Do the actual deleting.
    r = toku_brt_delete(db->i->brt, key, txn ? db_txn_struct_i(txn)->tokutxn : 0);

done:
    if (r)
        num_deletes_fail++;
    return r;
}

3998
static int
3999
env_del_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *key, const DBT *val, uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array, void *extra) {
4000 4001 4002 4003 4004 4005 4006 4007
    int r;
    uint32_t lock_flags[num_dbs];
    uint32_t remaining_flags[num_dbs];
    BRT brts[num_dbs];
    if (!txn || !num_dbs) {
        r = EINVAL;
        goto cleanup;
    }
4008
    if (!env->i->generate_row_for_del) {
4009 4010 4011 4012 4013 4014
        r = EINVAL;
        goto cleanup;
    }

    uint32_t which_db;
    for (which_db = 0; which_db < num_dbs; which_db++) {
4015 4016 4017 4018 4019 4020
        DB *db = db_array[which_db];
        //Generate the row
        r = env->i->generate_row_for_del(db, src_db, &keys[which_db], key, val, extra);
        if (r!=0) goto cleanup;
        lock_flags[which_db] = get_prelocked_flags(flags_array[which_db], txn, db);
        remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db];
4021 4022 4023 4024 4025 4026 4027 4028

        if (remaining_flags[which_db] & ~DB_DELETE_ANY) {
            r = EINVAL;
            goto cleanup;
        }
        BOOL error_if_missing = (BOOL)(!(remaining_flags[which_db]&DB_DELETE_ANY));
        if (error_if_missing) {
            //Check if the key exists in the db.
4029
            r = db_getf_set(db, txn, lock_flags[which_db], &keys[which_db], ydb_getf_do_nothing, NULL);
4030 4031 4032 4033 4034 4035 4036 4037 4038
            if (r!=0) goto cleanup;
        }

        //Do locking if necessary.
        if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) {
            //Needs locking
            RANGE_LOCK_REQUEST_S request;
            //Left end of range == right end of range (point lock)
            write_lock_request_init(&request, txn, db,
4039 4040
                                    &keys[which_db], toku_lt_neg_infinity,
                                    &keys[which_db], toku_lt_infinity);
4041 4042 4043 4044 4045 4046
            r = grab_range_lock(&request);
            if (r!=0) goto cleanup;
        }
        brts[which_db] = db->i->brt;
    }
    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
4047 4048
    BRT src_brt  = src_db ? src_db->i->brt : NULL;
    r = toku_brt_log_del_multiple(ttxn, src_brt, brts, num_dbs, key, val);
4049 4050
    if (r!=0) goto cleanup;
    for (which_db = 0; which_db < num_dbs; which_db++) {
4051 4052 4053
        DB *db = db_array[which_db];
        num_deletes++;
        r = toku_brt_maybe_delete(db->i->brt, &keys[which_db], ttxn, FALSE, ZERO_LSN, FALSE);
4054 4055 4056 4057 4058 4059 4060 4061
        if (r!=0) goto cleanup;
    }

cleanup:
    return r;
}


Rich Prohaska's avatar
Rich Prohaska committed
4062
// Wrapper installed as the cursor's c_get method: runs toku_c_get() while
// holding the ydb lock.
static int locked_c_get(DBC * c, DBT * key, DBT * data, u_int32_t flag) {
    int r;
    toku_ydb_lock();
    r = toku_c_get(c, key, data, flag);
    toku_ydb_unlock();
    return r;
}

// Wrapper installed as the cursor's c_close method: runs toku_c_close() while
// holding the ydb lock.
static int locked_c_close(DBC * c) {
    int r;
    toku_ydb_lock();
    r = toku_c_close(c);
    toku_ydb_unlock();
    return r;
}

// Wrapper installed as the cursor's c_count method: runs toku_c_count() while
// holding the ydb lock.
static int locked_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags) {
    int r;
    toku_ydb_lock();
    r = toku_c_count(cursor, count, flags);
    toku_ydb_unlock();
    return r;
}

// Wrapper installed as the cursor's c_del method: runs toku_c_del() while
// holding the ydb lock.
static int locked_c_del(DBC * c, u_int32_t flags) {
    int r;
    toku_ydb_lock();
    r = toku_c_del(c, flags);
    toku_ydb_unlock();
    return r;
}

4081
// Create a cursor on db inside txn (txn may be NULL) and store it in *c.
//
// flags must be 0 (EINVAL otherwise).
// is_temporary_cursor selects where fetched keys/values are buffered: a
// temporary cursor borrows the DB's shared skey/sval buffers, a regular cursor
// uses its own skey_s/sval_s.
//
// Returns 0 on success, ENOMEM if an allocation fails, or the error from
// toku_brt_cursor(). On failure nothing is left allocated and *c is not written.
static int toku_db_cursor(DB * db, DB_TXN * txn, DBC ** c, u_int32_t flags, int is_temporary_cursor) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    if (flags != 0)
        return EINVAL;
    size_t result_size = sizeof(DBC)+sizeof(struct __toku_dbc_internal); // internal stuff stuck on the end
    DBC *result = toku_malloc(result_size);
    if (result == 0)
        return ENOMEM;
    memset(result, 0, result_size);

    // Install the locked_* wrappers as the cursor's method table.
#define SCRS(name) result->name = locked_ ## name
    SCRS(c_get);
    SCRS(c_close);
    SCRS(c_del);
    SCRS(c_count);
    SCRS(c_getf_first);
    SCRS(c_getf_last);
    SCRS(c_getf_next);
    SCRS(c_getf_next_nodup);
    SCRS(c_getf_next_dup);
    SCRS(c_getf_prev);
    SCRS(c_getf_prev_nodup);
    SCRS(c_getf_prev_dup);
    SCRS(c_getf_current);
    SCRS(c_getf_current_binding);
    SCRS(c_getf_heaviside);
    SCRS(c_getf_set);
    SCRS(c_getf_set_range);
    SCRS(c_getf_set_range_reverse);
    SCRS(c_getf_get_both);
    SCRS(c_getf_get_both_range);
    SCRS(c_getf_get_both_range_reverse);
#undef SCRS

#if !TOKUDB_NATIVE_H
    MALLOC(result->i); // otherwise it is allocated as part of result->ii
    if (result->i == 0) {
        // report allocation failure the same way as for 'result' above
        toku_free(result);
        return ENOMEM;
    }
#endif
    result->dbp = db;
    dbc_struct_i(result)->txn = txn;
    dbc_struct_i(result)->skey_s = (struct simple_dbt){0,0};
    dbc_struct_i(result)->sval_s = (struct simple_dbt){0,0};
    if (is_temporary_cursor) {
        dbc_struct_i(result)->skey = &db->i->skey;
        dbc_struct_i(result)->sval = &db->i->sval;
    } else {
        dbc_struct_i(result)->skey = &dbc_struct_i(result)->skey_s;
        dbc_struct_i(result)->sval = &dbc_struct_i(result)->sval_s;
    }

    // The brt cursor is created with the id and READ_COMMITTED setting of the
    // txn returned by toku_txn_ancestor(), not of txn itself.
    DB_TXN* txn_anc = NULL;
    TXNID txn_anc_id = TXNID_NONE;
    BOOL is_read_committed = FALSE;
    if (txn) {
        txn_anc = toku_txn_ancestor(txn);
        txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn);
        is_read_committed = ((db_txn_struct_i(txn_anc)->flags & DB_READ_COMMITTED) != 0);
    }
    int r = toku_brt_cursor(
        db->i->brt, 
        &dbc_struct_i(result)->c, 
        db->dbenv->i->logger, 
        txn_anc_id, 
        is_read_committed
        );
    if (r != 0) {
        // hand the error to the caller instead of aborting; undo the allocations
#if !TOKUDB_NATIVE_H
        toku_free(result->i);
#endif
        toku_free(result);
        return r;
    }
    *c = result;
    return 0;
}

4150 4151
// Delete the exact (key, val) pair from db.
// Accepted flags: DB_DELETE_ANY plus the prelocked flags; anything else is EINVAL.
// Unless DB_DELETE_ANY is given, the pair is looked up first and the lookup's
// error is returned if it fails.
// Every call bumps num_deletes; every failing call also bumps num_deletes_fail
// (same accounting as toku_db_del).
static int
toku_db_delboth(DB *db, DB_TXN *txn, DBT *key, DBT *val, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    num_deletes++;   // accountability 
    u_int32_t unchecked_flags = flags;
    //DB_DELETE_ANY means delete regardless of whether it exists in the db.
    BOOL error_if_missing = (BOOL)(!(flags&DB_DELETE_ANY));
    unchecked_flags &= ~DB_DELETE_ANY;
    u_int32_t lock_flags = get_prelocked_flags(flags, txn, db);
    unchecked_flags &= ~lock_flags;
    BOOL do_locking = (BOOL)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE));
    int r = 0;
    if (unchecked_flags!=0) r = EINVAL;
    if (r==0 && error_if_missing) {
        //Check if the key exists in the db.
        r = db_getf_get_both(db, txn, lock_flags, key, val, ydb_getf_do_nothing, NULL);
    }
    if (r==0 && do_locking) {
        //Do locking if necessary.
        RANGE_LOCK_REQUEST_S request;
        //Left end of range == right end of range (point lock)
        write_lock_request_init(&request, txn, db,
                                key, val,
                                key, val);
        r = grab_range_lock(&request);
    }
    if (r==0) {
        //Do the actual deleting.
        r = toku_brt_delete_both(db->i->brt, key, val, txn ? db_txn_struct_i(txn)->tokutxn : NULL);
    }
    // keep the failure counter in step with num_deletes
    if (r)
        num_deletes_fail++;
    return r;
}

Rich Prohaska's avatar
Rich Prohaska committed
4184 4185 4186 4187
// Returns 1 when dbt has none of DB_DBT_MALLOC, DB_DBT_REALLOC or
// DB_DBT_USERMEM set, 0 when at least one of them is set.
static inline int db_thread_need_flags(DBT *dbt) {
    if (dbt->flags & (DB_DBT_MALLOC+DB_DBT_REALLOC+DB_DBT_USERMEM))
        return 0;
    return 1;
}

4188
// Look up 'key' in db through a temporary cursor and return the value in 'data'.
// Besides prelocked flags, the only accepted flag is DB_GET_BOTH; with no flag
// the lookup is a DB_SET.
// On a DB opened with DB_THREAD, 'data' must carry one of the memory-management
// flags checked by db_thread_need_flags(), otherwise EINVAL is returned.
// Returns the lookup error if there is one, else the result of closing the cursor.
static int toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    if ((db->i->open_flags & DB_THREAD) && db_thread_need_flags(data))
        return EINVAL;

    u_int32_t lock_flags = get_prelocked_flags(flags, txn, db);
    u_int32_t op_flags = flags & ~lock_flags;

    // We aren't ready to handle flags such as DB_READ_COMMITTED or DB_READ_UNCOMMITTED or DB_RMW
    u_int32_t c_get_flags;
    if (op_flags == 0)
        c_get_flags = DB_SET;
    else if (op_flags == DB_GET_BOTH)
        c_get_flags = DB_GET_BOTH;
    else
        return EINVAL;

    DBC *dbc;
    int r = toku_db_cursor(db, txn, &dbc, 0, 1);
    if (r!=0) return r;

    r = toku_c_get(dbc, key, data, c_get_flags | lock_flags);
    int close_r = toku_c_close(dbc);
    if (r == 0) r = close_r;
    return r;
}

Rich Prohaska's avatar
Rich Prohaska committed
4210
#if 0
// Compiled out. Placeholder for DB->key_range: after the handle checks it
// calls toku_ydb_barf() and aborts.
static int toku_db_key_range(DB * db, DB_TXN * txn, DBT * dbt, DB_KEY_RANGE * kr, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    // parameters are intentionally unused
    (void)txn;
    (void)dbt;
    (void)kr;
    (void)flags;
    toku_ydb_barf();
    abort();
}
#endif
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
4219

Yoni Fogel's avatar
Yoni Fogel committed
4220
// Put the environment that owns db into the panicked state with error r (r != 0).
// The panic string is the lock tree's own message for negative r and a generic
// locktree message otherwise.
// Returns what toku_ydb_do_error() returns for (env, r, panic string).
static int toku_db_lt_panic(DB* db, int r) {
    assert(r!=0);
    assert(db && db->i && db->dbenv && db->dbenv->i);
    DB_ENV* env = db->dbenv;

    env->i->is_panicked = r;
    if (r >= 0) {
        env->i->panic_string = toku_strdup("Error in locktree.\n");
    }
    else {
        env->i->panic_string = toku_strdup(toku_lt_strerror((TOKU_LT_ERROR)r));
    }

    return toku_ydb_do_error(env, r, "%s", env->i->panic_string);
}

Yoni Fogel's avatar
Yoni Fogel committed
4232
// Record lock tree lt in txn's table of lock trees (its lth).
// If lt is already recorded nothing changes; otherwise it is inserted and a
// reference is taken on it.
// Returns 0 on success or the error from toku_lth_insert().
static int toku_txn_add_lt(DB_TXN* txn, toku_lock_tree* lt) {
    assert(txn && lt);
    toku_lth* lth = db_txn_struct_i(txn)->lth;
    assert(lth);

    toku_lock_tree* existing = toku_lth_find(lth, lt);
    if (existing) {
        assert(existing == lt);
        return 0;
    }

    int r = toku_lth_insert(lth, lt);
    if (r == 0) {
        // the txn now holds a reference to the lock tree
        toku_lt_add_ref(lt);
    }
    return r;
}

Yoni Fogel's avatar
Yoni Fogel committed
4253 4254 4255
// Accessor: the key comparison function stored in db's brt.
static toku_dbt_cmp toku_db_get_compare_fun(DB* db) {
    toku_dbt_cmp key_cmp = db->i->brt->compare_fun;
    return key_cmp;
}
Yoni Fogel's avatar
Yoni Fogel committed
4256

Yoni Fogel's avatar
Yoni Fogel committed
4257 4258
// Accessor: the duplicate (value) comparison function stored in db's brt.
static toku_dbt_cmp toku_db_get_dup_compare(DB* db) {
    toku_dbt_cmp val_cmp = db->i->brt->dup_compare;
    return val_cmp;
}

4261
/***** TODO 2216 delete this 
static int toku_db_fd(DB *db, int *fdp) {
    HANDLE_PANICKED_DB(db);
    if (!db_opened(db)) return EINVAL;
    return toku_brt_get_fd(db->i->brt, fdp);
}
*******/
4268

4269 4270 4271 4272 4273
// Open a sub-database: joins fname and dbname into the single name
// "fname/dbname" and opens that through toku_db_open() with no dbname.
// Returns EINVAL if either name is NULL, otherwise the result of toku_db_open().
static int
db_open_subdb(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    if (fname == NULL || dbname == NULL)
        return EINVAL;

    // sizeof("/") is 2: one byte for the separator, one for the terminating NUL
    char combined[strlen(fname) + sizeof("/") + strlen(dbname)];
    int written = snprintf(combined, sizeof(combined), "%s/%s", fname, dbname);
    assert(written==(int)sizeof(combined)-1);

    return toku_db_open(db, txn, combined, NULL, dbtype, flags, mode);
}
4282

4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297
// Rotate a letter 13 places within its own case ('a'->'n', 'N'->'A').
// c must be a letter: anything that is neither upper nor lower case trips the assert.
static inline char
rot13(char c) {
    // <ctype.h> classifiers are only defined for values representable as
    // unsigned char (or EOF); a negative plain char must not be passed directly.
    unsigned char uc = (unsigned char)c;
    char r;
    char a;
    if (isupper(uc)) {
        a = 'A';
    }
    else {
        assert(islower(uc));
        a = 'a';
    }
    r = (c - a + 13) % 26 + a;
    return r;
}

4298 4299 4300 4301 4302 4303 4304 4305
// Build a file-name hint from dname and store it, NUL-terminated, in hint.
//Requires: size of hint array must be > strlen(dname)
//Alphanumeric characters are copied, with letters passed through rot13().
//Each run of non-alphanumeric characters is replaced by a single underscore.
static void
create_iname_hint(const char *dname, char *hint) {
    BOOL underscored = FALSE;
    while (*dname) {
        // <ctype.h> classifiers require an unsigned char value; plain char may be negative
        unsigned char uc = (unsigned char)*dname;
        if (isalnum(uc)) {
            char c = *dname++;
            if (isupper(uc) || islower(uc)) {
                c = rot13(c);
            }
            *hint++ = c;
            underscored = FALSE;
        }
        else {
            if (!underscored)
                *hint++ = '_';
            dname++;
            underscored = TRUE;
        }
    }
    *hint = '\0';
}

4323 4324 4325

// n >= 0 means to include "_L_" with hex value of n in iname
// (intended for use by loader, which will create many inames using one txnid).
4326
static char *
4327 4328
create_iname(DB_ENV *env, u_int64_t id, char *hint, int n) {
    int bytes;
4329
    char inamebase[strlen(hint) +
4330 4331 4332 4333 4334
		   8 +  // hex file format version
		   16 + // hex id (normally the txnid)
		   8  + // hex value of n if non-neg
		   sizeof("_L___.tokudb")]; // extra pieces
    if (n < 0)
4335 4336 4337
	bytes = snprintf(inamebase, sizeof(inamebase),
                         "%s_%"PRIx64"_%"PRIx32            ".tokudb",
                         hint, id, BRT_LAYOUT_VERSION);
4338
    else
4339 4340 4341
	bytes = snprintf(inamebase, sizeof(inamebase),
                         "%s_%"PRIx64"_%"PRIx32"_L_%"PRIx32".tokudb",
                         hint, id, BRT_LAYOUT_VERSION, n);
4342 4343 4344 4345
    assert(bytes>0);
    assert(bytes<=(int)sizeof(inamebase)-1);
    char *rval;
    if (env->i->data_dir)
Yoni Fogel's avatar
Yoni Fogel committed
4346
        rval = toku_construct_full_name(2, env->i->data_dir, inamebase);
4347
    else
Yoni Fogel's avatar
Yoni Fogel committed
4348
        rval = toku_construct_full_name(1, inamebase);
4349 4350
    assert(rval);
    return rval;
4351 4352
}

4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365

static int db_open_iname(DB * db, DB_TXN * txn, const char *iname, u_int32_t flags, int mode);


// inames are created here.
// algorithm:
//  begin txn
//  convert dname to iname (possibly creating new iname)
//  open file (toku_brt_open() will handle logging)
//  close txn
//  if created a new iname, take full range lock
//
// A non-NULL dbname is handled by db_open_subdb(), which folds (fname, dbname)
// into a single dname and calls back into this function.
// Returns EINVAL for bad parameters (including a NULL name), ENOENT when the
// dname is unknown and DB_CREATE was not given, EEXIST when it is known and
// DB_EXCL was given, otherwise the result of the directory update / open.
static int 
toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    if (dbname!=NULL) 
        return db_open_subdb(db, txn, fname, dbname, dbtype, flags, mode);

    // at this point fname is the dname
    //This code ONLY supports single-db files.
    assert(dbname==NULL);
    const char * dname = fname;  // db_open_subdb() converts (fname, dbname) to dname

    ////////////////////////////// do some level of parameter checking.
    // dname is handed to strlen() below, so a NULL name must be rejected here
    // (db_open_subdb() rejects NULL names the same way).
    if (dname==NULL) return EINVAL;

    u_int32_t unused_flags = flags;
    int using_txns = db->dbenv->i->open_flags & DB_INIT_TXN;
    int r;
    if (dbtype!=DB_BTREE && dbtype!=DB_UNKNOWN) return EINVAL;
    int is_db_excl    = flags & DB_EXCL;    unused_flags&=~DB_EXCL;
    int is_db_create  = flags & DB_CREATE;  unused_flags&=~DB_CREATE;

    //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided.
                                            unused_flags&=~DB_READ_UNCOMMITTED;
                                            unused_flags&=~DB_READ_COMMITTED;
    if (unused_flags & ~DB_THREAD) return EINVAL; // unknown flags

    if (is_db_excl && !is_db_create) return EINVAL;
    if (dbtype==DB_UNKNOWN && is_db_excl) return EINVAL;

    /* tokudb supports no duplicates and sorted duplicates only */
    unsigned int tflags;
    r = toku_brt_get_flags(db->i->brt, &tflags);
    if (r != 0) 
        return r;
    if ((tflags & TOKU_DB_DUP) && !(tflags & TOKU_DB_DUPSORT))
        return EINVAL;

    if (db_opened(db))
        return EINVAL;              /* It was already open. */
    //////////////////////////////

    DB_TXN *child = NULL;
    // begin child (unless transactionless)
    if (using_txns) {
        r = toku_txn_begin(db->dbenv, txn, &child, DB_TXN_NOSYNC, 1);
        assert(r==0);
    }

    // convert dname to iname
    //  - look up dname, get iname
    //  - if dname does not exist, create iname and make entry in directory
    DBT dname_dbt;  // holds dname
    DBT iname_dbt;  // holds iname_in_env
    toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
    init_dbt_realloc(&iname_dbt);  // sets iname_dbt.data = NULL
    r = toku_db_get(db->dbenv->i->directory, child, &dname_dbt, &iname_dbt, 0);  // allocates memory for iname
    char *iname = iname_dbt.data;
    if (r==DB_NOTFOUND && !is_db_create)
        r = ENOENT;
    else if (r==0 && is_db_excl) {
        r = EEXIST;
    }
    else if (r==DB_NOTFOUND) {
        char hint[strlen(dname) + 1];

        // create iname and make entry in directory
        u_int64_t id = 0;

        if (using_txns)
            id = toku_txn_get_txnid(db_txn_struct_i(child)->tokutxn);
        create_iname_hint(dname, hint);
        iname = create_iname(db->dbenv, id, hint, -1);  // allocated memory for iname
        toku_fill_dbt(&iname_dbt, iname, strlen(iname) + 1);
        r = toku_db_put(db->dbenv->i->directory, child, &dname_dbt, &iname_dbt, DB_YESOVERWRITE);  // DB_YESOVERWRITE for performance only, avoid unnecessary query
    }

    // we now have an iname
    if (r == 0) {
        r = db_open_iname(db, child, iname, flags, mode);
        if (r==0) {
            db->i->dname = toku_xstrdup(dname);
            env_note_db_opened(db->dbenv, db);  // tell env that a new db handle is open (using dname)
        }
    }

    // free string holding iname
    if (iname) toku_free(iname);

    if (using_txns) {
        // close txn
        if (r == 0) {  // commit
            r = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL);
            assert(r==0);  // TODO panic
        }
        else {         // abort
            int r2 = toku_txn_abort(child, NULL, NULL);
            assert(r2==0);  // TODO panic
        }
    }

    return r;
}

static int 
4467
db_open_iname(DB * db, DB_TXN * txn, const char *iname_in_env, u_int32_t flags, int mode) {
4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483
    int r;

    //Set comparison functions if not yet set.
    if (!db->i->key_compare_was_set && db->dbenv->i->bt_compare) {
        r = toku_brt_set_bt_compare(db->i->brt, db->dbenv->i->bt_compare);
        assert(r==0);
        db->i->key_compare_was_set = TRUE;
    }
    if (!db->i->val_compare_was_set && db->dbenv->i->dup_compare) {
        r = toku_brt_set_dup_compare(db->i->brt, db->dbenv->i->dup_compare);
        assert(r==0);
        db->i->val_compare_was_set = TRUE;
    }
    BOOL need_locktree = (BOOL)((db->dbenv->i->open_flags & DB_INIT_LOCK) &&
                                (db->dbenv->i->open_flags & DB_INIT_TXN));

4484 4485
    int is_db_excl    = flags & DB_EXCL;    flags&=~DB_EXCL;
    int is_db_create  = flags & DB_CREATE;  flags&=~DB_CREATE;
4486
    //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided.
4487
                                            flags&=~DB_READ_UNCOMMITTED;
4488
                                            flags&=~DB_READ_COMMITTED;
4489
    if (flags & ~DB_THREAD) return EINVAL; // unknown flags
4490 4491

    if (is_db_excl && !is_db_create) return EINVAL;
4492

4493 4494 4495 4496 4497 4498 4499 4500
    /* tokudb supports no duplicates and sorted duplicates only */
    unsigned int tflags;
    r = toku_brt_get_flags(db->i->brt, &tflags);
    if (r != 0) 
        return r;
    if ((tflags & TOKU_DB_DUP) && !(tflags & TOKU_DB_DUPSORT))
        return EINVAL;

4501
    if (db_opened(db))
4502
        return EINVAL;              /* It was already open. */
Yoni Fogel's avatar
Yoni Fogel committed
4503
    
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
4504 4505
    db->i->open_flags = flags;
    db->i->open_mode = mode;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4506

Yoni Fogel's avatar
Yoni Fogel committed
4507
    r = toku_brt_open(db->i->brt, iname_in_env,
4508
		      is_db_create, is_db_excl,
4509
		      db->dbenv->i->cachetable,
4510
		      txn ? db_txn_struct_i(txn)->tokutxn : NULL_TXN,
4511
		      db);
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4512 4513 4514
    if (r != 0)
        goto error_cleanup;

4515
    db->i->opened = 1;
Yoni Fogel's avatar
Yoni Fogel committed
4516
    if (need_locktree) {
Yoni Fogel's avatar
Yoni Fogel committed
4517 4518 4519
        unsigned int brtflags;
        BOOL dups;
        toku_brt_get_flags(db->i->brt, &brtflags);
4520
        dups = (BOOL)((brtflags & TOKU_DB_DUPSORT || brtflags & TOKU_DB_DUP));
4521 4522
	db->i->dict_id = toku_brt_get_dictionary_id(db->i->brt);
        r = toku_ltm_get_lt(db->dbenv->i->ltm, &db->i->lt, dups, db->i->dict_id);
Yoni Fogel's avatar
Yoni Fogel committed
4523
        if (r!=0) { goto error_cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
4524
    }
4525 4526 4527
    //Add to transaction's list of 'must close' if necessary.
    if (txn) {
        //Do last so we don't have to undo.
4528
        toku_list_push(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort,
4529 4530
                  &db->i->dbs_that_must_close_before_abort);
    }
Yoni Fogel's avatar
Yoni Fogel committed
4531

Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
4532
    return 0;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4533 4534
 
error_cleanup:
4535
    db->i->dict_id = DICTIONARY_ID_NONE;
4536
    db->i->opened = 0;
Yoni Fogel's avatar
Yoni Fogel committed
4537
    if (db->i->lt) {
Yoni Fogel's avatar
Yoni Fogel committed
4538
        toku_lt_remove_ref(db->i->lt);
Yoni Fogel's avatar
Yoni Fogel committed
4539 4540
        db->i->lt = NULL;
    }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4541
    return r;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
4542
}
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
4543

4544 4545 4546 4547 4548
// Check that a proposed (key, val) pair respects the size limits of db.
//   db:  dictionary the pair would be inserted into
//   key: proposed key (only ->size is examined)
//   val: proposed value (only ->size is examined)
// Returns 0 if the insertion is legal; otherwise returns the result of
// toku_ydb_do_error(..., EINVAL, ...).
// Limits, with limit = nodesize / BRT_FANOUT:
//   dupsort dictionary: key->size + val->size must not exceed limit
//   nodup dictionary:   key->size must not exceed limit and
//                       val->size must not exceed nodesize
static int
db_put_check_size_constraints(DB *db, DBT *key, DBT *val) {
    BOOL dupsort = (BOOL)(!db_is_nodup(db));

    //Check limits on size of key and val.
    unsigned int nodesize;
    int r = toku_brt_get_nodesize(db->i->brt, &nodesize);
    assert(r == 0);
    u_int32_t limit = nodesize / BRT_FANOUT;

    if (dupsort) {
        // Compare without forming key->size + val->size: that sum can wrap
        // around in 32-bit arithmetic and let an oversized row through.
        if (key->size > limit || val->size > limit - key->size)
            r = toku_ydb_do_error(db->dbenv, EINVAL, "The largest row (key + val) allowed is %u bytes", limit);
    } else {
        if (key->size > limit)
            r = toku_ydb_do_error(db->dbenv, EINVAL, "The largest key allowed is %u bytes", limit);
        else if (val->size > nodesize)
            r = toku_ydb_do_error(db->dbenv, EINVAL, "The largest value allowed is %u bytes", nodesize);
    }
    return r;
}

4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591
// Report whether a key of `size` bytes paired with an empty value passes
// db_put_check_size_constraints() for db.
// Returns 0 if supported, ERANGE if out of range.
static int
db_row_size_supported(DB *db, u_int32_t size) {
    DBT probe_key;
    DBT probe_val;

    // Only the sizes matter for the check, so no data is attached.
    toku_fill_dbt(&probe_key, NULL, size);
    toku_fill_dbt(&probe_val, NULL, 0);

    if (db_put_check_size_constraints(db, &probe_key, &probe_val) != 0)
        return ERANGE;
    return 0;
}

// Run db_row_size_supported() between toku_ydb_lock() and toku_ydb_unlock().
// Returns whatever db_row_size_supported() returns.
static int
locked_db_row_size_supported(DB *db, u_int32_t size) {
    int result;

    toku_ydb_lock();
    result = db_row_size_supported(db, size);
    toku_ydb_unlock();

    return result;
}

4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614
// Decide whether an insert of `key` is permitted under `overwrite_flag`.
//   DB_YESOVERWRITE:          always permitted.
//   DB_NOOVERWRITE:           permitted only if db_getf_set() reports
//                             DB_NOTFOUND for key; DB_KEYEXIST if the lookup
//                             succeeds; any other lookup error is returned.
//   0:                        alias for DB_YESOVERWRITE in a nodup db,
//                             an error in a dupsort db.
//   DB_NOOVERWRITE_NO_ERROR:  always permitted.
//   anything else:            EINVAL (not yet supported).
// Returns 0 if the insert is legal.
static int
db_put_check_overwrite_constraint(DB *db, DB_TXN *txn, DBT *key, DBT *UU(val),
                                  u_int32_t lock_flags, u_int32_t overwrite_flag) {
    if (overwrite_flag == DB_YESOVERWRITE)
        return 0;

    if (overwrite_flag == DB_NOOVERWRITE) {
        // Look for (key, anything) in the dictionary.
        int lookup = db_getf_set(db, txn, lock_flags, key, ydb_getf_do_nothing, NULL);
        if (lookup == DB_NOTFOUND)
            return 0;
        if (lookup == 0)
            return DB_KEYEXIST;
        return lookup;
    }

    if (overwrite_flag == 0) {
        if (db_is_nodup(db))
            return 0;
        return toku_ydb_do_error(db->dbenv, EINVAL, "Tokudb requires that db->put specify DB_YESOVERWRITE or DB_NOOVERWRITE on DB_DUPSORT databases");
    }

    if (overwrite_flag == DB_NOOVERWRITE_NO_ERROR)
        return 0;

    return EINVAL;
}

4626 4627
// Insert (key, val) into db on behalf of txn.
// Steps, each skipped once an earlier one has failed:
//   1. size check (db_put_check_size_constraints)
//   2. overwrite check (db_put_check_overwrite_constraint)
//   3. point write lock on (key, val), unless the db has no lock tree or
//      the caller passed DB_PRELOCKED_WRITE
//   4. the insert itself (toku_brt_maybe_insert)
// num_inserts is bumped on every call that gets past the HANDLE_* checks;
// num_inserts_fail is bumped when the call returns non-zero.
// Returns 0 on success, otherwise the first error encountered.
static int
toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    num_inserts++;

    // Split the caller's flags into prelock flags and everything else.
    u_int32_t lock_flags = get_prelocked_flags(flags, txn, db);
    u_int32_t put_flags  = flags & ~lock_flags;
    BOOL needs_lock = (BOOL)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE));

    int r = db_put_check_size_constraints(db, key, val);
    if (r != 0) goto done;

    //Do any checking required by the flags.
    r = db_put_check_overwrite_constraint(db, txn, key, val, lock_flags, put_flags);
    if (r != 0) goto done;

    if (needs_lock) {
        //Left end of range == right end of range (point lock)
        RANGE_LOCK_REQUEST_S request;
        write_lock_request_init(&request, txn, db,
                                key, val,
                                key, val);
        r = grab_range_lock(&request);
        if (r != 0) goto done;
    }

    {
        //Insert into the brt.
        TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL;
        enum brt_msg_type type =
            (put_flags==DB_NOOVERWRITE_NO_ERROR) ? BRT_INSERT_NO_OVERWRITE : BRT_INSERT;
        r = toku_brt_maybe_insert(db->i->brt, key, val, ttxn, FALSE, ZERO_LSN, TRUE, type);
    }

done:
    if (r != 0)
        num_inserts_fail++;
    return r;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4663

4664
static int
4665
env_put_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *key, const DBT *val, uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array, void *extra) {
4666 4667 4668 4669 4670 4671 4672 4673
    int r;
    uint32_t lock_flags[num_dbs];
    uint32_t remaining_flags[num_dbs];
    BRT brts[num_dbs];
    if (!txn || !num_dbs) {
        r = EINVAL;
        goto cleanup;
    }
4674
    if (!env->i->generate_row_for_put) {
4675 4676 4677 4678 4679 4680
        r = EINVAL;
        goto cleanup;
    }

    uint32_t which_db;
    for (which_db = 0; which_db < num_dbs; which_db++) {
4681 4682 4683 4684 4685 4686
        DB *db = db_array[which_db];
        //Generate the row
        r = env->i->generate_row_for_put(db, src_db, &keys[which_db], &vals[which_db], key, val, extra);
        if (r!=0) goto cleanup;
        lock_flags[which_db] = get_prelocked_flags(flags_array[which_db], txn, db);
        remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db];
4687 4688
        //Check overwrite constraints
        r = db_put_check_overwrite_constraint(db, txn,
4689
                                              &keys[which_db],      &vals[which_db],
4690 4691
                                              lock_flags[which_db], remaining_flags[which_db]);
        if (r!=0) goto cleanup;
4692 4693 4694 4695 4696
        if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) {
            //put_multiple does not support delaying the no error, since we would
            //have to log the flag in the put_multiple.
            r = EINVAL; goto cleanup;
        }
4697 4698 4699 4700 4701 4702
        //Do locking if necessary.
        if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) {
            //Needs locking
            RANGE_LOCK_REQUEST_S request;
            //Left end of range == right end of range (point lock)
            write_lock_request_init(&request, txn, db,
4703 4704
                                    &keys[which_db], &vals[which_db],
                                    &keys[which_db], &vals[which_db]);
4705 4706 4707 4708 4709 4710
            r = grab_range_lock(&request);
            if (r!=0) goto cleanup;
        }
        brts[which_db] = db->i->brt;
    }
    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
4711 4712
    BRT src_brt  = src_db ? src_db->i->brt : NULL;
    r = toku_brt_log_put_multiple(ttxn, src_brt, brts, num_dbs, key, val);
4713 4714
    if (r!=0) goto cleanup;
    for (which_db = 0; which_db < num_dbs; which_db++) {
4715
        DB *db = db_array[which_db];
4716
        num_inserts++;
4717
        r = toku_brt_maybe_insert(db->i->brt, &keys[which_db], &vals[which_db], ttxn, FALSE, ZERO_LSN, FALSE, BRT_INSERT);
4718 4719 4720 4721 4722 4723 4724 4725
        if (r!=0) goto cleanup;
    }

cleanup:
    return r;
}


4726 4727
// Forward declaration; toku_db_remove() is defined further down in this file.
static int toku_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags);

4728 4729

//We do not (yet?) support deleting subdbs by deleting the enclosing 'fname'
4730
static int
4731 4732 4733 4734
env_dbremove_subdb(DB_ENV * env, DB_TXN * txn, const char *fname, const char *dbname, int32_t flags) {
    int r;
    if (!fname || !dbname) r = EINVAL;
    else {
4735 4736 4737 4738
        char subdb_full_name[strlen(fname) + sizeof("/") + strlen(dbname)];
        int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s", fname, dbname);
        assert(bytes==(int)sizeof(subdb_full_name)-1);
        const char *null_subdbname = NULL;
4739
        r = toku_env_dbremove(env, txn, subdb_full_name, null_subdbname, flags);
4740
    }
4741 4742
    return r;
}
4743

4744 4745 4746 4747

//Called during committing an fdelete ONLY IF you still have an fd AND it is not connected to /dev/null
//Called during aborting an fcreate (harmless to do, and definitely correct)
static void
4748
finalize_file_removal(DICTIONARY_ID dict_id, void * extra) {
4749 4750 4751
    toku_ltm *ltm = (toku_ltm*) extra;
    if (ltm) {
        //Poison the lock tree to prevent a future file from re-using it.
4752
        toku_ltm_invalidate_lt(ltm, dict_id);
4753 4754 4755
    }
}

4756
//static int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn);
4757

4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783
// Remove a dictionary from the environment.
//   env:    opened environment
//   txn:    parent transaction (may be NULL)
//   fname:  dictionary name; required
//   dbname: sub-dictionary name; when non-NULL the call is forwarded to
//           env_dbremove_subdb(), which converts (fname, dbname) to a dname
//   flags:  must be 0
// The (dname, iname) pair is looked up in and deleted from env->i->directory.
// With DB_INIT_TXN the work runs in a child transaction: the file is removed
// on commit, and the child is committed on success or aborted on failure.
// Without DB_INIT_TXN the file is removed immediately.
// Returns 0 on success, EINVAL for bad arguments or an open handle on the
// dictionary, ENOENT if the dictionary is not in the directory, otherwise
// the first error encountered.
static int
toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags) {
    int r;
    HANDLE_PANICKED_ENV(env);
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);
    if (!env_opened(env)) return EINVAL;
    if (dbname!=NULL)
        return env_dbremove_subdb(env, txn, fname, dbname, flags);
    // env_dbremove_subdb() converts (fname, dbname) to dname

    // A NULL name would be passed to strlen() below; reject it the same way
    // env_dbremove_subdb() does.
    if (fname == NULL) return EINVAL;

    const char * dname = fname;
    assert(dbname == NULL);

    if (flags!=0) return EINVAL;
    if (env_is_db_with_dname_open(env, dname))
        return toku_ydb_do_error(env, EINVAL, "Cannot remove dictionary with an open handle.\n");

    DBT dname_dbt;
    DBT iname_dbt;
    toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
    init_dbt_realloc(&iname_dbt);  // sets iname_dbt.data = NULL

    int using_txns = env->i->open_flags & DB_INIT_TXN;
    DB_TXN *child = NULL;
    // begin child (unless transactionless)
    if (using_txns) {
        r = toku_txn_begin(env, txn, &child, DB_TXN_NOSYNC, 1);
        assert(r==0);
    }

    // get iname
    r = toku_db_get(env->i->directory, child, &dname_dbt, &iname_dbt, 0);  // allocates memory for iname
    char *iname = iname_dbt.data;
    if (r==DB_NOTFOUND)
        r = ENOENT;
    else if (r==0) {
        // remove (dname,iname) from directory
        r = toku_db_del(env->i->directory, child, &dname_dbt, DB_DELETE_ANY);
        if (r == 0) {
            if (using_txns) {
                r = toku_brt_remove_on_commit(db_txn_struct_i(child)->tokutxn, &iname_dbt);
                assert(r==0);
                //Now that we have a writelock on dname, verify that there are still no handles open. (to prevent race conditions)
                if (r==0 && env_is_db_with_dname_open(env, dname))
                    r = toku_ydb_do_error(env, EINVAL, "Cannot remove dictionary with an open handle.\n");
                if (r==0) {
                    DB* zombie = env_get_zombie_db_with_dname(env, dname);
                    if (zombie)
                        r = toku_db_pre_acquire_table_lock(zombie, child, TRUE);
                    if (r!=0)
                        toku_ydb_do_error(env, r, "Cannot remove dictionary.\n");
                }
            }
            else {
                r = toku_brt_remove_now(env->i->cachetable, &iname_dbt);
                assert(r==0);
            }
        }
    }

    if (using_txns) {
        // close txn
        if (r == 0) {  // commit
            r = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL);
            assert(r==0);  // TODO panic
        }
        else {         // abort
            int r2 = toku_txn_abort(child, NULL, NULL);
            assert(r2==0);  // TODO panic
        }
    }

    if (iname) toku_free(iname);
    return r;
}

4835 4836 4837

// Remove the dictionary (fname, dbname) from db's environment, then close
// the db handle. The handle is closed whether or not the removal succeeds.
// Returns the removal error if there is one, otherwise the close result.
static int
toku_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);

    // The removal is issued without a parent transaction.
    int remove_result = toku_env_dbremove(db->dbenv, NULL, fname, dbname, flags);
    int close_result  = toku_db_close(db, 0);

    return (remove_result != 0) ? remove_result : close_result;
}

// Rename the sub-dictionary `dbname` of `fname` to `newname`.
// Builds the combined names "fname/dbname" (old) and "fname/newname" (new)
// and hands them to toku_env_dbrename() with no sub-dictionary name.
// Returns EINVAL if fname, dbname or newname is NULL, otherwise the result
// of toku_env_dbrename().
static int
env_dbrename_subdb(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) {
    int r;
    if (!fname || !dbname || !newname) r = EINVAL;
    else {
        // sizeof("/") counts the separator plus the terminating NUL.
        char subdb_full_name[strlen(fname) + sizeof("/") + strlen(dbname)];
        {
            int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s", fname, dbname);
            assert(bytes==(int)sizeof(subdb_full_name)-1);
        }
        // The target name must be derived from newname, not dbname;
        // otherwise the rename targets the dictionary's own old name.
        char new_full_name[strlen(fname) + sizeof("/") + strlen(newname)];
        {
            int bytes = snprintf(new_full_name, sizeof(new_full_name), "%s/%s", fname, newname);
            assert(bytes==(int)sizeof(new_full_name)-1);
        }
        const char *null_subdbname = NULL;
        r = toku_env_dbrename(env, txn, subdb_full_name, null_subdbname, new_full_name, flags);
    }
    return r;
}


// Rename a dictionary within the environment.
//   env:     opened environment
//   txn:     parent transaction (may be NULL)
//   fname:   current dictionary name; required
//   dbname:  sub-dictionary name; when non-NULL the call is forwarded to
//            env_dbrename_subdb(), which converts (fname, dbname) to dname
//            and (fname, newname) to the new dname
//   newname: new dictionary name; required
//   flags:   must be 0
// In env->i->directory the (dname, iname) entry is deleted and
// (newname, iname) is inserted. With DB_INIT_TXN the work runs in a child
// transaction that is committed on success and aborted on failure.
// Returns 0 on success, EINVAL for bad arguments or an open handle on either
// name, ENOENT if dname is not in the directory, EEXIST if newname already
// is, otherwise the first error encountered.
static int
toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) {
    int r;
    HANDLE_PANICKED_ENV(env);
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);
    if (!env_opened(env)) return EINVAL;
    if (dbname!=NULL)
        return env_dbrename_subdb(env, txn, fname, dbname, newname, flags);
    // env_dbrename_subdb() converts (fname, dbname) to dname and (fname, newname) to newdname

    // NULL names would be passed to strlen() below; reject them the same way
    // env_dbrename_subdb() does.
    if (fname == NULL || newname == NULL) return EINVAL;

    const char * dname = fname;
    assert(dbname == NULL);

    if (flags!=0) return EINVAL;
    if (env_is_db_with_dname_open(env, dname))
        return toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary with an open handle.\n");
    if (env_is_db_with_dname_open(env, newname))
        return toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary; Dictionary with target name has an open handle.\n");

    DBT old_dname_dbt;
    DBT new_dname_dbt;
    DBT iname_dbt;
    toku_fill_dbt(&old_dname_dbt, dname, strlen(dname)+1);
    toku_fill_dbt(&new_dname_dbt, newname, strlen(newname)+1);
    init_dbt_realloc(&iname_dbt);  // sets iname_dbt.data = NULL

    int using_txns = env->i->open_flags & DB_INIT_TXN;
    DB_TXN *child = NULL;
    // begin child (unless transactionless)
    if (using_txns) {
        r = toku_txn_begin(env, txn, &child, DB_TXN_NOSYNC, 1);
        assert(r==0);
    }

    r = toku_db_get(env->i->directory, child, &old_dname_dbt, &iname_dbt, 0);  // allocates memory for iname
    char *iname = iname_dbt.data;
    if (r==DB_NOTFOUND)
        r = ENOENT;
    else if (r==0) {
        // verify that newname does not already exist
        r = db_getf_set(env->i->directory, child, 0, &new_dname_dbt, ydb_getf_do_nothing, NULL);
        if (r == 0)
            r = EEXIST;
        else if (r == DB_NOTFOUND) {
            // remove old (dname,iname) and insert (newname,iname) in directory
            r = toku_db_del(env->i->directory, child, &old_dname_dbt, DB_DELETE_ANY);
            if (r == 0)
                r = toku_db_put(env->i->directory, child, &new_dname_dbt, &iname_dbt, DB_YESOVERWRITE);
            //Now that we have writelocks on both dnames, verify that there are still no handles open. (to prevent race conditions)
            if (r==0 && env_is_db_with_dname_open(env, dname))
                r = toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary with an open handle.\n");
            DB* zombie = NULL;
            if (r==0) {
                zombie = env_get_zombie_db_with_dname(env, dname);
                if (zombie)
                    r = toku_db_pre_acquire_table_lock(zombie, child, TRUE);
                if (r!=0)
                    toku_ydb_do_error(env, r, "Cannot rename dictionary.\n");
            }
            if (r==0 && env_is_db_with_dname_open(env, newname))
                r = toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary; Dictionary with target name has an open handle.\n");
            if (r==0 && zombie) {
                //Update zombie in list if exists.
                env_note_zombie_db_closed(env, zombie);  // tell env that this db is no longer a zombie (it is completely closed)
                toku_free(zombie->i->dname);
                zombie->i->dname = toku_xstrdup(newname);
                env_note_zombie_db(env, zombie);  // tell env that this db is a zombie
            }
        }
    }

    if (using_txns) {
        // close txn
        if (r == 0) {  // commit
            r = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL);
            assert(r==0);  // TODO panic
        }
        else {         // abort
            int r2 = toku_txn_abort(child, NULL, NULL);
            assert(r2==0);  // TODO panic
        }
    }

    if (iname) toku_free(iname);
    return r;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4955

4956 4957
// Rename the dictionary (fname, dbname) to newname in db's environment, then
// close the db handle. The handle is closed whether or not the rename
// succeeds.
// Returns the rename error if there is one, otherwise the close result.
static int
toku_db_rename(DB * db, const char *fname, const char *dbname, const char *newname, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);

    // The rename is issued without a parent transaction.
    int rename_result = toku_env_dbrename(db->dbenv, NULL, fname, dbname, newname, flags);
    int close_result  = toku_db_close(db, 0);

    return (rename_result != 0) ? rename_result : close_result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4965

4966 4967 4968
// Install a user-provided key comparison function on db (pre-empting the
// environment key comparison function).
// Fails with the result of toku_ydb_do_error(..., EINVAL, ...) when the db
// is already open, bt_compare is NULL, or a key comparison function was
// already set. On success records that fact in db->i->key_compare_was_set.
// Returns 0 on success.
static int
toku_db_set_bt_compare(DB * db, int (*bt_compare) (DB *, const DBT *, const DBT *)) {
    HANDLE_PANICKED_DB(db);

    if (db_opened(db))
        return toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be set after DB open.\n");
    if (!bt_compare)
        return toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be NULL.\n");
    if (db->i->key_compare_was_set)
        return toku_ydb_do_error(db->dbenv, EINVAL, "Key comparison function already set.\n");

    int status = toku_brt_set_bt_compare(db->i->brt, bt_compare);
    if (status == 0)
        db->i->key_compare_was_set = TRUE;
    return status;
}

4985 4986 4987
// Install a user-provided value comparison function on db (pre-empting the
// environment val comparison function).
// Fails with the result of toku_ydb_do_error(..., EINVAL, ...) when the db
// is already open, dup_compare is NULL, or a val comparison function was
// already set. On success records that fact in db->i->val_compare_was_set.
// Returns 0 on success.
static int
toku_db_set_dup_compare(DB *db, int (*dup_compare)(DB *, const DBT *, const DBT *)) {
    HANDLE_PANICKED_DB(db);

    if (db_opened(db))
        return toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be set after DB open.\n");
    if (!dup_compare)
        return toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be NULL.\n");
    if (db->i->val_compare_was_set)
        return toku_ydb_do_error(db->dbenv, EINVAL, "Val comparison function already set.\n");

    int status = toku_brt_set_dup_compare(db->i->brt, dup_compare);
    if (status == 0)
        db->i->val_compare_was_set = TRUE;
    return status;
}

5004
// Attach a descriptor to a db that has not been opened yet.
// Returns EINVAL if the db is already open, descriptor is NULL, or the
// descriptor claims a non-zero size but has no data; otherwise returns the
// result of toku_brt_set_descriptor().
static int toku_db_set_descriptor(DB *db, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) {
    HANDLE_PANICKED_DB(db);

    if (db_opened(db))
        return EINVAL;
    if (!descriptor)
        return EINVAL;
    if (descriptor->size>0 && !descriptor->data)
        return EINVAL;

    return toku_brt_set_descriptor(db->i->brt, version, descriptor, dbt_userformat_upgrade);
}

Rich Prohaska's avatar
Rich Prohaska committed
5014
// Translate DB_DUP / DB_DUPSORT into the brt's TOKU_DB_DUP /
// TOKU_DB_DUPSORT bits and OR them into the brt's current flags.
// Returns EINVAL if the db is already open and flags is non-zero (this
// matches BDB); otherwise the result of getting/setting the brt flags.
static int toku_db_set_flags(DB *db, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);

    /* the following matches BDB */
    if (flags != 0 && db_opened(db))
        return EINVAL;

    u_int32_t brt_flags;
    int status = toku_brt_get_flags(db->i->brt, &brt_flags);
    if (status != 0)
        return status;

    if (flags & DB_DUP) {
        brt_flags |= TOKU_DB_DUP;
    }
    if (flags & DB_DUPSORT) {
        brt_flags |= TOKU_DB_DUPSORT;
    }

    return toku_brt_set_flags(db->i->brt, brt_flags);
}

5032
// Report db's flags in user-visible form: TOKU_DB_DUP / TOKU_DB_DUPSORT on
// the brt are translated to DB_DUP / DB_DUPSORT in *pflags.
// The internal TOKU_DB_KEYCMP_BUILTIN / TOKU_DB_VALCMP_BUILTIN bits are
// ignored; any other bit left on the brt trips an assert.
// Returns EINVAL if pflags is NULL, the toku_brt_get_flags() error if it
// fails, 0 otherwise.
static int toku_db_get_flags(DB *db, u_int32_t *pflags) {
    HANDLE_PANICKED_DB(db);
    if (!pflags)
        return EINVAL;

    u_int32_t brt_flags;
    u_int32_t user_flags = 0;
    int status = toku_brt_get_flags(db->i->brt, &brt_flags);
    if (status != 0)
        return status;

    // Move each recognized bit from brt_flags over to user_flags.
    if (brt_flags & TOKU_DB_DUP) {
        user_flags |= DB_DUP;
        brt_flags  &= ~TOKU_DB_DUP;
    }
    if (brt_flags & TOKU_DB_DUPSORT) {
        user_flags |= DB_DUPSORT;
        brt_flags  &= ~TOKU_DB_DUPSORT;
    }

    // ignore internal flags
    brt_flags &= ~TOKU_DB_KEYCMP_BUILTIN;
    brt_flags &= ~TOKU_DB_VALCMP_BUILTIN;

    // Nothing unrecognized may remain.
    assert(brt_flags == 0);
    *pflags = user_flags;
    return 0;
}

5056
// Set the page size of db by forwarding pagesize to toku_brt_set_nodesize().
// Returns the result of toku_brt_set_nodesize().
static int toku_db_set_pagesize(DB *db, u_int32_t pagesize) {
    HANDLE_PANICKED_DB(db);
    return toku_brt_set_nodesize(db->i->brt, pagesize);
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
5061

5062
// Fill *s with 64-bit statistics for db obtained from toku_brt_stat64().
//   db:  dictionary to inspect
//   txn: transaction to run under; may be NULL
//   s:   output; bt_nkeys, bt_ndata, bt_dsize and bt_fsize are written on
//        success and left untouched on failure
// Returns the result of toku_brt_stat64().
static int toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    struct brtstat64_s brtstat;
    // Only dereference txn when there is one, as toku_db_put() does.
    TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL;
    int r = toku_brt_stat64(db->i->brt, ttxn, &brtstat);
    if (r==0) {
        s->bt_nkeys = brtstat.nkeys;
        s->bt_ndata = brtstat.ndata;
        s->bt_dsize = brtstat.dsize;
        s->bt_fsize = brtstat.fsize;
    }
    return r;
}
// Run toku_db_stat64() between toku_ydb_lock() and toku_ydb_unlock().
// Returns whatever toku_db_stat64() returns.
static int locked_db_stat64 (DB *db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
    int result;

    toku_ydb_lock();
    result = toku_db_stat64(db, txn, s);
    toku_ydb_unlock();

    return result;
}

Yoni Fogel's avatar
Yoni Fogel committed
5082
// Estimate how many keys in db are less than, equal to, and greater than
// `key`, by forwarding to toku_brt_keyrange().
// On success *is_exact is set to 0, because toku_brt_keyrange() has no such
// parameter yet. On failure *is_exact is left untouched.
// Returns the result of toku_brt_keyrange().
static int toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    // toku_brt_keyrange does not take a txn (to be fixed later); for now
    // this is ok because the caller, locked_db_keyrange, holds the ydb lock.
    int status = toku_brt_keyrange(db->i->brt, key, less, equal, greater);
    if (status == 0) {
        *is_exact = 0;
    }
    return status;
}

5098
// Acquire, ahead of time, a read lock on the range
// [(key_left, val_left), (key_right, val_right)] of db for txn.
// Returns EINVAL if db has no lock tree or txn is NULL; 0 without locking
// for DB_READ_UNCOMMITTED / DB_READ_COMMITTED transactions; otherwise the
// result of grab_range_lock().
static int toku_db_pre_acquire_read_lock(DB *db, DB_TXN *txn, const DBT *key_left, const DBT *val_left, const DBT *key_right, const DBT *val_right) {
    HANDLE_PANICKED_DB(db);
    if (!db->i->lt || !txn)
        return EINVAL;

    //READ_UNCOMMITTED and READ_COMMITTED transactions do not need read locks.
    if (db_txn_struct_i(txn)->flags&(DB_READ_UNCOMMITTED|DB_READ_COMMITTED))
        return 0;

    RANGE_LOCK_REQUEST_S request;
    read_lock_request_init(&request, txn, db,
                           key_left,  val_left,
                           key_right, val_right);
    return grab_range_lock(&request);
}

5115 5116
//static int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) {
// needed by loader.c
5117
// Acquire a write lock covering the whole table (-infinity .. +infinity)
// of db for txn, and optionally try to suppress logging for the table.
//   db:        dictionary to lock; must have a lock tree
//   txn:       transaction taking the lock; required
//   just_lock: when TRUE only the lock is taken
// When just_lock is FALSE, the lock was granted, recovery logging is not
// already suppressed, the brt is empty and the db is nodup, a loader is
// created and immediately closed inside a child transaction in order to
// suppress both rollback and recovery logs. The ydb lock is released while
// the loader runs and re-acquired afterwards. The child is committed if the
// loader succeeded and aborted otherwise.
// Returns EINVAL for a missing lock tree or txn, the grab_range_lock()
// error if the lock is not granted, or the loader-creation error unless it
// is DB_LOCK_NOTGRANTED (which only means the optimization is unavailable).
// NOTE(review): a failure of loader->close() aborts the child but is not
// reported to the caller; that matches the previous behavior.
int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn, BOOL just_lock) {
    HANDLE_PANICKED_DB(db);
    if (!db->i->lt || !txn) return EINVAL;

    int r;

    {
        RANGE_LOCK_REQUEST_S request;
        write_lock_request_init(&request, txn, db,
                                toku_lt_neg_infinity, toku_lt_neg_infinity,
                                toku_lt_infinity,     toku_lt_infinity);
        r = grab_range_lock(&request);
    }

    if (r==0 && !just_lock &&
        !toku_brt_is_recovery_logging_suppressed(db->i->brt) &&
        toku_brt_is_empty(db->i->brt) &&
        db_is_nodup(db) //TODO: Remove this check once we kill dupsort support.
    ) {
        //Try to suppress both rollback and recovery logs
        DB_LOADER *loader;
        DB *dbs[1] = {db};
        uint32_t db_flags[1]  = {DB_NOOVERWRITE};
        uint32_t dbt_flags[1] = {0};
        uint32_t loader_flags = DB_PRELOCKED_WRITE; //Don't recursively prelock
        DB_ENV *env = db->dbenv;
        DB_TXN *child = NULL;

        {
            // begin child
            int rt = toku_txn_begin(env, txn, &child, DB_TXN_NOSYNC, 1);
            assert(rt==0);
        }

        toku_ydb_unlock(); //Cannot hold ydb lock when creating loader

        int r_loader = env->create_loader(env, child, &loader, NULL, 1, dbs, db_flags, dbt_flags, loader_flags);
        if (r_loader==0) {
            r_loader = loader->set_error_callback(loader, NULL, NULL);
            assert(r_loader==0);
            r_loader = loader->set_poll_function(loader, NULL, NULL);
            assert(r_loader==0);
            // close the loader
            r_loader = loader->close(loader);
            if (r_loader==0) {
                toku_brt_suppress_recovery_logs(db->i->brt, db_txn_struct_i(child)->tokutxn);
            }
        }
        else if (r_loader != DB_LOCK_NOTGRANTED) {
            //Lock not granted is not an error.
            //It just means we cannot use the loader optimization.
            assert(r==0);
            r = r_loader;
        }

        // The commit/abort status goes into its own variable so that it
        // cannot overwrite the loader error stored in r just above.
        if (r_loader == 0) { // commit
            int r_commit = locked_txn_commit(child, 0);
            assert(r_commit==0);
            logsuppress++;
        }
        else {  // abort
            int r_abort = locked_txn_abort(child);
            assert(r_abort==0);
            logsuppressfail++;
        }
        toku_ydb_lock(); //Reacquire ydb lock.
    }

    return r;
}
Zardosht Kasheff's avatar
Zardosht Kasheff committed
5186

Yoni Fogel's avatar
Yoni Fogel committed
5187 5188 5189
//TODO: DB_AUTO_COMMIT.
//TODO: Nowait only conditionally?
//TODO: NOSYNC change to SYNC if DB_ENV has something in set_flags
5190
// Begin an automatic transaction when the caller supplied none and the
// environment was opened with DB_INIT_TXN.
//   db:                dictionary the operation targets
//   txn:               in/out; *txn receives the new transaction if one is begun
//   changed:           out; TRUE if a transaction was begun, FALSE if none
//                      was needed. Left unwritten when toku_txn_begin() fails.
//   force_auto_commit: when FALSE and the environment lacks DB_AUTO_COMMIT,
//                      the transaction is begun with DB_TXN_NOSYNC
// The transaction is always begun with DB_TXN_NOWAIT.
// Returns 0 on success, otherwise the toku_txn_begin() error.
static inline int toku_db_construct_autotxn(DB* db, DB_TXN **txn, BOOL* changed,
                                            BOOL force_auto_commit) {
    assert(db && txn && changed);
    DB_ENV* env = db->dbenv;

    BOOL env_has_txns = (BOOL)((env->i->open_flags & DB_INIT_TXN) != 0);
    if (*txn || !env_has_txns) {
        // Nothing to construct.
        *changed = FALSE;
        return 0;
    }

    u_int32_t begin_flags = DB_TXN_NOWAIT;
    if (!force_auto_commit && !(env->i->open_flags & DB_AUTO_COMMIT))
        begin_flags |= DB_TXN_NOSYNC;

    int status = toku_txn_begin(env, NULL, txn, begin_flags, 1);
    if (status == 0)
        *changed = TRUE;
    return status;
}

5206
// Finish a transaction begun by toku_db_construct_autotxn().
//   txn:     the transaction
//   r:       result of the operation that ran under txn
//   changed: TRUE if txn was begun automatically; when FALSE nothing is done
// If changed, the transaction is committed when r == 0 and aborted otherwise
// (the abort result is ignored).
// Returns the commit result when a commit is performed, otherwise r.
static inline int toku_db_destruct_autotxn(DB_TXN *txn, int r, BOOL changed) {
    if (changed) {
        if (r != 0) {
            toku_txn_abort(txn, NULL, NULL);
        } else {
            r = toku_txn_commit(txn, 0, NULL, NULL);
        }
    }
    return r;
}

Rich Prohaska's avatar
Rich Prohaska committed
5213
// Run toku_db_close() between toku_ydb_lock() and toku_ydb_unlock().
// Returns whatever toku_db_close() returns.
static int locked_db_close(DB * db, u_int32_t flags) {
    int result;

    toku_ydb_lock();
    result = toku_db_close(db, flags);
    toku_ydb_unlock();

    return result;
}

5217
// Open a cursor on db via toku_db_cursor().
// No transaction is created automatically: in an environment opened with
// DB_INIT_TXN a NULL txn is rejected through toku_ydb_do_error(..., EINVAL, ...).
// Returns that error, or the result of toku_db_cursor().
static inline int autotxn_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) {
    BOOL env_has_txns = (BOOL)((db->dbenv->i->open_flags & DB_INIT_TXN) != 0);
    if (txn == NULL && env_has_txns) {
        return toku_ydb_do_error(db->dbenv, EINVAL,
              "Cursors in a transaction environment must have transactions.\n");
    }
    return toku_db_cursor(db, txn, c, flags, 0);
}

Rich Prohaska's avatar
Rich Prohaska committed
5225
// Run autotxn_db_cursor() between toku_ydb_lock() and toku_ydb_unlock().
// Returns whatever autotxn_db_cursor() returns.
static int locked_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) {
    int result;

    toku_ydb_lock();
    result = autotxn_db_cursor(db, txn, c, flags);
    toku_ydb_unlock();

    return result;
}

5229
static inline int autotxn_db_del(DB* db, DB_TXN* txn, DBT* key,
Yoni Fogel's avatar
Yoni Fogel committed
5230 5231 5232 5233 5234 5235 5236 5237
                                 u_int32_t flags) {
    BOOL changed; int r;
    r = toku_db_construct_autotxn(db, &txn, &changed, FALSE);
    if (r!=0) return r;
    r = toku_db_del(db, txn, key, flags);
    return toku_db_destruct_autotxn(txn, r, changed);
}

Rich Prohaska's avatar
Rich Prohaska committed
5238
// Run autotxn_db_del() between toku_ydb_lock() and toku_ydb_unlock().
// Returns whatever autotxn_db_del() returns.
static int locked_db_del(DB * db, DB_TXN * txn, DBT * key, u_int32_t flags) {
    int result;

    toku_ydb_lock();
    result = autotxn_db_del(db, txn, key, flags);
    toku_ydb_unlock();

    return result;
}

Yoni Fogel's avatar
Yoni Fogel committed
5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254
// Run toku_db_delboth() under txn, or under an automatic transaction
// obtained from toku_db_construct_autotxn() when one is needed.
// Returns the construct error if the transaction cannot be begun, otherwise
// the result of toku_db_destruct_autotxn().
static inline int autotxn_db_delboth(DB* db, DB_TXN* txn, DBT* key, DBT* val,
                                 u_int32_t flags) {
    BOOL began_txn;
    int status = toku_db_construct_autotxn(db, &txn, &began_txn, FALSE);
    if (status != 0)
        return status;

    status = toku_db_delboth(db, txn, key, val, flags);
    return toku_db_destruct_autotxn(txn, status, began_txn);
}

// Run autotxn_db_delboth() between toku_ydb_lock() and toku_ydb_unlock().
// Returns whatever autotxn_db_delboth() returns.
static int locked_db_delboth(DB *db, DB_TXN *txn, DBT *key,  DBT *val, u_int32_t flags) {
    int result;

    toku_ydb_lock();
    result = autotxn_db_delboth(db, txn, key, val, flags);
    toku_ydb_unlock();

    return result;
}

5255
static inline int autotxn_db_get(DB* db, DB_TXN* txn, DBT* key, DBT* data,
Yoni Fogel's avatar
Yoni Fogel committed
5256 5257 5258 5259 5260 5261
                                 u_int32_t flags) {
    BOOL changed; int r;
    r = toku_db_construct_autotxn(db, &txn, &changed, FALSE);
    if (r!=0) return r;
    r = toku_db_get(db, txn, key, data, flags);
    return toku_db_destruct_autotxn(txn, r, changed);
Rich Prohaska's avatar
Rich Prohaska committed
5262 5263 5264
}

// DB->get entry point: runs autotxn_db_get() while holding the ydb lock.
static int locked_db_get(DB *db, DB_TXN *txn, DBT *key, DBT *data, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_get(db, txn, key, data, flags);
    toku_ydb_unlock();
    return rc;
}

5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291
// Runs db_getf_set() between toku_db_construct_autotxn() and
// toku_db_destruct_autotxn().  A construction failure is returned directly;
// otherwise the result of toku_db_destruct_autotxn() is returned.
static inline int autotxn_db_getf_set(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    BOOL txn_changed;
    int rc = toku_db_construct_autotxn(db, &txn, &txn_changed, FALSE);
    if (rc == 0) {
        rc = db_getf_set(db, txn, flags, key, f, extra);
        rc = toku_db_destruct_autotxn(txn, rc, txn_changed);
    }
    return rc;
}

// DB->getf_set entry point: runs autotxn_db_getf_set() while holding the ydb lock.
static int locked_db_getf_set(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_getf_set(db, txn, flags, key, f, extra);
    toku_ydb_unlock();
    return rc;
}

// Runs db_getf_get_both() between toku_db_construct_autotxn() and
// toku_db_destruct_autotxn().  A construction failure is returned directly;
// otherwise the result of toku_db_destruct_autotxn() is returned.
static inline int autotxn_db_getf_get_both(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    BOOL txn_changed;
    int rc = toku_db_construct_autotxn(db, &txn, &txn_changed, FALSE);
    if (rc == 0) {
        rc = db_getf_get_both(db, txn, flags, key, val, f, extra);
        rc = toku_db_destruct_autotxn(txn, rc, txn_changed);
    }
    return rc;
}

// DB->getf_get_both entry point: runs autotxn_db_getf_get_both() while holding
// the ydb lock.
static int locked_db_getf_get_both(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_getf_get_both(db, txn, flags, key, val, f, extra);
    toku_ydb_unlock();
    return rc;
}

5292
// DB->pre_acquire_read_lock entry point: forwards the (key, val) range
// endpoints to toku_db_pre_acquire_read_lock() while holding the ydb lock.
static int locked_db_pre_acquire_read_lock(DB *db, DB_TXN *txn, const DBT *key_left, const DBT *val_left, const DBT *key_right, const DBT *val_right) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_pre_acquire_read_lock(db, txn, key_left, val_left, key_right, val_right);
    toku_ydb_unlock();
    return rc;
}

5299
// DB->pre_acquire_table_lock entry point: calls
// toku_db_pre_acquire_table_lock(db, txn, FALSE) while holding the ydb lock.
static int locked_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_pre_acquire_table_lock(db, txn, FALSE);
    toku_ydb_unlock();
    return rc;
}

5306 5307 5308 5309
// Truncate a database.
// Effect: remove all of the rows from a database.
//   flags:     only DB_TRUNCATE_WITHCURSORS is accepted; any other bit yields EINVAL.
//   row_count: always set to 0 once the argument checks (and the table lock,
//              when txn is non-NULL) have succeeded.
// Returns EINVAL if the brt still has cursors and DB_TRUNCATE_WITHCURSORS was
// not given, the table-lock error if one occurs, else the result of
// toku_brt_truncate().
static int toku_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    const int cursors_allowed = (flags & DB_TRUNCATE_WITHCURSORS) != 0;
    const u_int32_t unsupported = flags & ~DB_TRUNCATE_WITHCURSORS;

    // No other flags are supported (yet).
    if (unsupported != 0) {
        return EINVAL;
    }
    // Open cursors are refused unless the caller explicitly allowed them.
    if (!cursors_allowed && toku_brt_get_cursor_count(db->i->brt) > 0) {
        return EINVAL;
    }

    // Acquire a table lock when running inside a transaction.
    if (txn != NULL) {
        int lock_rc = toku_db_pre_acquire_table_lock(db, txn, TRUE);
        if (lock_rc != 0) {
            return lock_rc;
        }
    }

    *row_count = 0;
    return toku_brt_truncate(db->i->brt);
}

5341
// Runs toku_db_open() between toku_db_construct_autotxn() and
// toku_db_destruct_autotxn().  Whether DB_AUTO_COMMIT is set in `flags` is
// passed as the last argument of toku_db_construct_autotxn(); the bit is
// stripped from the flags handed to toku_db_open().
static inline int autotxn_db_open(DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    BOOL txn_changed;
    BOOL auto_commit = (BOOL)((flags & DB_AUTO_COMMIT) != 0);
    int rc = toku_db_construct_autotxn(db, &txn, &txn_changed, auto_commit);
    if (rc == 0) {
        rc = toku_db_open(db, txn, fname, dbname, dbtype, flags & ~DB_AUTO_COMMIT, mode);
        rc = toku_db_destruct_autotxn(txn, rc, txn_changed);
    }
    return rc;
}

// DB->open entry point.  Takes the multi-operation client lock (so a
// checkpoint cannot begin) and then the ydb lock around autotxn_db_open();
// the locks are released in the reverse order.
static int locked_db_open(DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    int rc;
    toku_multi_operation_client_lock();   // cannot begin checkpoint
    toku_ydb_lock();
    rc = autotxn_db_open(db, txn, fname, dbname, dbtype, flags, mode);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); // can now begin checkpoint
    return rc;
}

5356
static inline int autotxn_db_put(DB* db, DB_TXN* txn, DBT* key, DBT* data,
Yoni Fogel's avatar
Yoni Fogel committed
5357
                                 u_int32_t flags) {
5358
    //{ unsigned i; printf("put %p keylen=%d key={", db, key->size); for(i=0; i<key->size; i++) printf("%d,", ((char*)key->data)[i]); printf("} datalen=%d data={", data->size); for(i=0; i<data->size; i++) printf("%d,", ((char*)data->data)[i]); printf("}\n"); }
Yoni Fogel's avatar
Yoni Fogel committed
5359 5360 5361 5362 5363 5364 5365
    BOOL changed; int r;
    r = toku_db_construct_autotxn(db, &txn, &changed, FALSE);
    if (r!=0) return r;
    r = toku_db_put(db, txn, key, data, flags);
    return toku_db_destruct_autotxn(txn, r, changed);
}

Rich Prohaska's avatar
Rich Prohaska committed
5366
// DB->put entry point.  env_check_avail_fs_space() is consulted first, before
// the ydb lock is taken; a nonzero result is returned without attempting the
// put.  Otherwise autotxn_db_put() runs under the ydb lock.
static int locked_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *data, u_int32_t flags) {
    int rc = env_check_avail_fs_space(db->dbenv);
    if (rc != 0) {
        return rc;
    }
    toku_ydb_lock();
    rc = autotxn_db_put(db, txn, key, data, flags);
    toku_ydb_unlock();
    return rc;
}

// DB->remove entry point.  Holds the multi-operation client lock (no
// checkpoint can begin) and the ydb lock around toku_db_remove().
static int locked_db_remove(DB *db, const char *fname, const char *dbname, u_int32_t flags) {
    int rc;
    toku_multi_operation_client_lock();   // cannot begin checkpoint
    toku_ydb_lock();
    rc = toku_db_remove(db, fname, dbname, flags);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); // can now begin checkpoint
    return rc;
}

// DB->rename entry point.  Holds the multi-operation client lock (no
// checkpoint can begin) and the ydb lock around toku_db_rename().
static int locked_db_rename(DB *db, const char *namea, const char *nameb, const char *namec, u_int32_t flags) {
    int rc;
    toku_multi_operation_client_lock();   // cannot begin checkpoint
    toku_ydb_lock();
    rc = toku_db_rename(db, namea, nameb, namec, flags);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); // can now begin checkpoint
    return rc;
}

// DB->set_bt_compare entry point: runs toku_db_set_bt_compare() while holding
// the ydb lock.
static int locked_db_set_bt_compare(DB *db, int (*bt_compare)(DB *, const DBT *, const DBT *)) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_bt_compare(db, bt_compare);
    toku_ydb_unlock();
    return rc;
}

// DB->set_dup_compare entry point: runs toku_db_set_dup_compare() while
// holding the ydb lock.
static int locked_db_set_dup_compare(DB *db, int (*dup_compare)(DB *, const DBT *, const DBT *)) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_dup_compare(db, dup_compare);
    toku_ydb_unlock();
    return rc;
}

5402 5403 5404 5405 5406
// DB->set_descriptor entry point: runs toku_db_set_descriptor() while holding
// the ydb lock.
static int locked_db_set_descriptor(DB *db, u_int32_t version, const DBT *descriptor, toku_dbt_upgradef dbt_userformat_upgrade) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_descriptor(db, version, descriptor, dbt_userformat_upgrade);
    toku_ydb_unlock();
    return rc;
}

Rich Prohaska's avatar
Rich Prohaska committed
5409 5410 5411 5412 5413
// DB->set_errfile entry point: delegates to the owning environment's
// set_errfile method.  This wrapper does not take the ydb lock itself.
static void locked_db_set_errfile(DB *db, FILE *errfile) {
    DB_ENV *env = db->dbenv;
    env->set_errfile(env, errfile);
}

// DB->set_flags entry point: runs toku_db_set_flags() while holding the ydb lock.
static int locked_db_set_flags(DB *db, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_flags(db, flags);
    toku_ydb_unlock();
    return rc;
}

// DB->get_flags entry point: runs toku_db_get_flags() while holding the ydb lock.
static int locked_db_get_flags(DB *db, u_int32_t *flags) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_get_flags(db, flags);
    toku_ydb_unlock();
    return rc;
}

// DB->set_pagesize entry point: runs toku_db_set_pagesize() while holding the
// ydb lock.
static int locked_db_set_pagesize(DB *db, u_int32_t pagesize) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_pagesize(db, pagesize);
    toku_ydb_unlock();
    return rc;
}

5425 5426 5427 5428 5429 5430 5431
// TODO 2216 delete this
// DB->fd entry point, currently a stub: both arguments are ignored, *fdp is
// NOT written, and 0 is always returned.  (The former implementation, a
// locked call to toku_db_fd(db, fdp), is disabled.)
static int locked_db_fd(DB * UU(db), int * UU(fdp)) {
    const int rc = 0;
    return rc;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
5434

Yoni Fogel's avatar
Yoni Fogel committed
5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446
// DB->key_range64 entry point: runs toku_db_key_range64() while holding the
// ydb lock.
static int locked_db_key_range64(DB *db, DB_TXN *txn, DBT *dbt, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_key_range64(db, txn, dbt, less, equal, greater, is_exact);
    toku_ydb_unlock();
    return rc;
}

// Accessor installed as DB->dbt_pos_infty: returns toku_lt_infinity.
static __attribute__((pure)) const DBT *toku_db_dbt_pos_infty(void) {
    return toku_lt_infinity;
}

// Accessor installed as DB->dbt_neg_infty: returns toku_lt_neg_infinity.
static __attribute__((pure)) const DBT *toku_db_dbt_neg_infty(void) {
    return toku_lt_neg_infinity;
}

5449
// DB->truncate entry point.  Holds the checkpoint-safe client lock and then
// the ydb lock around toku_db_truncate(); releases them in reverse order.
static int locked_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) {
    int rc;
    toku_checkpoint_safe_client_lock();
    toku_ydb_lock();
    rc = toku_db_truncate(db, txn, row_count, flags);
    toku_ydb_unlock();
    toku_checkpoint_safe_client_unlock();
    return rc;
}

5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478
// Flatten the brt behind `db`.  The logger passed to toku_brt_flatten() is
// obtained via toku_txn_logger() from txn's tokutxn, or from NULL when no
// transaction is supplied.
static int
toku_db_flatten(DB *db, DB_TXN *txn) {
    HANDLE_PANICKED_DB(db);
    TOKUTXN tokutxn = NULL;
    if (txn) {
        tokutxn = db_txn_struct_i(txn)->tokutxn;
    }
    TOKULOGGER logger = toku_txn_logger(tokutxn);
    return toku_brt_flatten(db->i->brt, logger);
}

// Runs toku_db_flatten() between toku_db_construct_autotxn() and
// toku_db_destruct_autotxn().  A construction failure is returned directly;
// otherwise the result of toku_db_destruct_autotxn() is returned.
static inline int autotxn_db_flatten(DB *db, DB_TXN *txn) {
    BOOL txn_changed;
    int rc = toku_db_construct_autotxn(db, &txn, &txn_changed, FALSE);
    if (rc == 0) {
        rc = toku_db_flatten(db, txn);
        rc = toku_db_destruct_autotxn(txn, rc, txn_changed);
    }
    return rc;
}


// DB->flatten entry point: runs autotxn_db_flatten() while holding the ydb lock.
static int locked_db_flatten(DB *db, DB_TXN *txn) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_flatten(db, txn);
    toku_ydb_unlock();
    return rc;
}

5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497
// Fill `report` with the fragmentation data of db's brt.
// A DB that is not open is rejected with EINVAL (reported through
// toku_ydb_do_error()); otherwise returns the result of
// toku_brt_get_fragmentation().
static int
db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) {
    HANDLE_PANICKED_DB(db);
    if (!db_opened(db)) {
        return toku_ydb_do_error(db->dbenv, EINVAL, "Fragmentation report available only on open DBs.\n");
    }
    return toku_brt_get_fragmentation(db->i->brt, report);
}

// DB->get_fragmentation entry point: runs db_get_fragmentation() while holding
// the ydb lock.
static int
locked_db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) {
    int rc;
    toku_ydb_lock();
    rc = db_get_fragmentation(db, report);
    toku_ydb_unlock();
    return rc;
}

Rich Prohaska's avatar
Rich Prohaska committed
5498
// Allocate and initialize a new DB handle bound to `env`.
//   db:    receives the new handle on success; left untouched on failure.
//   env:   must be non-NULL and already opened, else EINVAL.
//   flags: must be 0, else EINVAL.
// Returns 0 on success, ENOMEM if either allocation fails, or the error from
// toku_brt_create().  Everything allocated here is freed on a failure path.
// On success ydb_add_ref() is called once for the new handle.
static int toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags) {
    if (flags != 0 || env == NULL) {
        return EINVAL;
    }
    if (!env_opened(env)) {
        return EINVAL;
    }

    DB *MALLOC(handle);
    if (handle == NULL) {
        return ENOMEM;
    }
    memset(handle, 0, sizeof *handle);
    handle->dbenv = env;

    // Install the lock-taking wrappers as the handle's method table.
#define SET_DB_METHOD(name) handle->name = locked_db_ ## name
    SET_DB_METHOD(key_range64);
    SET_DB_METHOD(close);
    SET_DB_METHOD(cursor);
    SET_DB_METHOD(del);
    SET_DB_METHOD(delboth);
    SET_DB_METHOD(get);
    //    SET_DB_METHOD(key_range);
    SET_DB_METHOD(open);
    SET_DB_METHOD(put);
    SET_DB_METHOD(remove);
    SET_DB_METHOD(rename);
    SET_DB_METHOD(set_bt_compare);
    SET_DB_METHOD(set_dup_compare);
    SET_DB_METHOD(set_descriptor);
    SET_DB_METHOD(set_errfile);
    SET_DB_METHOD(set_pagesize);
    SET_DB_METHOD(set_flags);
    SET_DB_METHOD(get_flags);
    SET_DB_METHOD(stat64);
    SET_DB_METHOD(fd);
    SET_DB_METHOD(pre_acquire_read_lock);
    SET_DB_METHOD(pre_acquire_table_lock);
    SET_DB_METHOD(truncate);
    SET_DB_METHOD(row_size_supported);
    SET_DB_METHOD(getf_set);
    SET_DB_METHOD(getf_get_both);
    SET_DB_METHOD(flatten);
    SET_DB_METHOD(get_fragmentation);
#undef SET_DB_METHOD
    handle->dbt_pos_infty = toku_db_dbt_pos_infty;
    handle->dbt_neg_infty = toku_db_dbt_neg_infty;

    // Internal (private) part of the handle.
    MALLOC(handle->i);
    if (handle->i == NULL) {
        toku_free(handle);
        return ENOMEM;
    }
    memset(handle->i, 0, sizeof *handle->i);
    handle->i->dict_id = DICTIONARY_ID_NONE;
    handle->i->db = handle;
    handle->i->freed = 0;
    handle->i->opened = 0;
    handle->i->open_flags = 0;
    handle->i->open_mode = 0;
    handle->i->brt = 0;
    toku_list_init(&handle->i->dbs_that_must_close_before_abort);

    int r = toku_brt_create(&handle->i->brt);
    if (r != 0) {
        toku_free(handle->i);
        toku_free(handle);
        return r;
    }

    ydb_add_ref();
    *db = handle;
    return 0;
}
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
5569

5570
// Public creation entry point (its name is supplied by the DB_CREATE_FUN
// macro): runs toku_db_create() while holding the ydb lock.
int DB_CREATE_FUN (DB ** db, DB_ENV * env, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_create(db, env, flags);
    toku_ydb_unlock();
    return rc;
}

/* need db_strerror_r for multiple threads */

Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
5576 5577 5578 5579 5580 5581 5582 5583
char *db_strerror(int error) {
    char *errorstr;
    if (error >= 0) {
        errorstr = strerror(error);
        if (errorstr)
            return errorstr;
    }
    
5584 5585 5586
    if (error==DB_BADFORMAT) {
	return "Database Bad Format (probably a corrupted database)";
    }
5587 5588 5589
    if (error==DB_NOTFOUND) {
	return "Not found";
    }
5590

Rich Prohaska's avatar
Rich Prohaska committed
5591
    static char unknown_result[100];    // Race condition if two threads call this at the same time. However even in a bad case, it should be some sort of null-terminated string.
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603
    errorstr = unknown_result;
    snprintf(errorstr, sizeof unknown_result, "Unknown error code: %d", error);
    return errorstr;
}

const char *db_version(int *major, int *minor, int *patch) {
    if (major)
        *major = DB_VERSION_MAJOR;
    if (minor)
        *minor = DB_VERSION_MINOR;
    if (patch)
        *patch = DB_VERSION_PATCH;
5604 5605 5606 5607 5608
#if defined(TOKUDB_REVISION)
#define xstr(X) str(X)
#define str(X) #X
    return "tokudb " xstr(DB_VERSION_MAJOR) "." xstr(DB_VERSION_MINOR) "." xstr(DB_VERSION_PATCH) " build " xstr(TOKUDB_REVISION);
#else
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
5609
    return DB_VERSION_STRING;
5610
#endif
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
5611
}
5612 5613 5614 5615
 
// Forward a replacement fsync function to toku_set_func_fsync() and return
// its result.
int db_env_set_func_fsync (int (*fsync_function)(int)) {
    int rc = toku_set_func_fsync(fsync_function);
    return rc;
}
Yoni Fogel's avatar
Yoni Fogel committed
5616

5617
// Forward a replacement pwrite function to toku_set_func_pwrite() and return
// its result.
int db_env_set_func_pwrite (ssize_t (*pwrite_function)(int, const void *, size_t, toku_off_t)) {
    int rc = toku_set_func_pwrite(pwrite_function);
    return rc;
}
// Forward a replacement write function to toku_set_func_write() and return
// its result.
int db_env_set_func_write (ssize_t (*write_function)(int, const void *, size_t)) {
    int rc = toku_set_func_write(write_function);
    return rc;
}

5624 5625 5626 5627 5628 5629
// Hand a replacement fwrite function to the loader via
// brtloader_set_os_fwrite().
void
db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) {
    brtloader_set_os_fwrite(fwrite_fun);
    return;
}


5630 5631 5632 5633 5634 5635
// Forward a replacement malloc to toku_set_func_malloc() and return its result.
int db_env_set_func_malloc (void *(*f)(size_t)) {
    int rc = toku_set_func_malloc(f);
    return rc;
}
// Forward a replacement realloc to toku_set_func_realloc() and return its result.
int db_env_set_func_realloc (void *(*f)(void*, size_t)) {
    int rc = toku_set_func_realloc(f);
    return rc;
}
5636
// Forward a replacement free to toku_set_func_free() and return its result.
int db_env_set_func_free (void (*f)(void*)) {
    int rc = toku_set_func_free(f);
    return rc;
}
5639

5640 5641 5642 5643 5644 5645
// Install dlmalloc/dlrealloc/dlfree as the allocator functions.
// This reference to dlmalloc is also what gets it included in the build;
// without a call it would not be included.
// The setters' return values are ignored.
void setup_dlmalloc (void) {
    (void) db_env_set_func_malloc(dlmalloc);
    (void) db_env_set_func_realloc(dlrealloc);
    (void) db_env_set_func_free(dlfree);
}
5646 5647

// For test purposes only.
5648
// With this interface, all checkpoint users get the same callbacks and the same extras.
5649
void db_env_set_checkpoint_callback (void (*callback_f)(void*), void* extra) {
5650
    toku_checkpoint_safe_client_lock();
5651 5652
    checkpoint_callback_f = callback_f;
    checkpoint_callback_extra = extra;
5653
    toku_checkpoint_safe_client_unlock();
5654
    //printf("set callback = %p, extra = %p\n", callback_f, extra);
5655
}
5656 5657 5658 5659 5660 5661 5662
// Second checkpoint callback slot: stores the callback and its argument in
// checkpoint_callback2_f and checkpoint_callback2_extra while holding the
// checkpoint-safe client lock.
void db_env_set_checkpoint_callback2 (void (*callback_f)(void*), void* extra) {
    toku_checkpoint_safe_client_lock();
    {
        checkpoint_callback2_f = callback_f;
        checkpoint_callback2_extra = extra;
    }
    toku_checkpoint_safe_client_unlock();
}
Yoni Fogel's avatar
Yoni Fogel committed
5663

5664 5665 5666 5667 5668 5669 5670 5671
// Forward a recovery callback and its argument to toku_recover_set_callback().
void db_env_set_recover_callback (void (*callback_f)(void*), void* extra) {
    toku_recover_set_callback(callback_f, extra);
    return;
}

// Forward a second recovery callback and its argument to
// toku_recover_set_callback2().
void db_env_set_recover_callback2 (void (*callback_f)(void*), void* extra) {
    toku_recover_set_callback2(callback_f, extra);
    return;
}

5672 5673 5674 5675 5676 5677 5678 5679
// Forward the loader size factor to toku_brtloader_set_size_factor().
void db_env_set_loader_size_factor (uint32_t factor) {
    toku_brtloader_set_size_factor(factor);
    return;
}





Yoni Fogel's avatar
Yoni Fogel committed
5680 5681 5682 5683 5684 5685 5686 5687 5688 5689
/* HACK: exists only so that toku_pthread_yield gets included in the .so.
 *  - non-static would require a prototype in a header;
 *  - static (since unused) would give a warning;
 *  - static + unused would not actually help toku_pthread_yield get in the .so;
 *  - static + used avoids all the warnings and makes sure toku_pthread_yield
 *    is in the .so.
 */
static void __attribute__((__used__))
include_toku_pthread_yield (void) {
    toku_pthread_yield();
    return;
}

5690 5691 5692 5693

// For test purposes only, translate dname to iname.
// Looks dname_dbt up in the environment's directory DB (no caller
// transaction, DB_PRELOCKED) under the ydb lock; the lookup allocates memory
// for the iname returned in iname_dbt.
static int
env_get_iname(DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_get(env->i->directory, NULL, dname_dbt, iname_dbt, DB_PRELOCKED); // allocates memory for iname
    toku_ydb_unlock();
    return rc;
}

5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715
/* Following functions (ydb_load_xxx()) are used by loader:
 */


// When the loader is created, it makes this call.
// For each dictionary to be loaded, replace old iname in directory
// with a newly generated iname.  This will also take a write lock
// on the directory entries.  The write lock will be released when
// the transaction of the loader is completed.
// If the transaction commits, the new inames are in place.
// If the transaction aborts, the old inames will be restored.
// The new inames are returned to the caller.  
// It is the caller's responsibility to free them.
// Return 0 on success (could fail if write lock not available).
int
5716
ydb_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[N], char * new_inames_in_env[N], LSN *load_lsn) {
5717 5718 5719 5720 5721
    int rval;
    int i;
    
    int using_txns = env->i->open_flags & DB_INIT_TXN;
    DB_TXN * child = NULL;
5722
    TXNID xid = 0;
5723 5724 5725
    DBT dname_dbt;  // holds dname
    DBT iname_dbt;  // holds new iname
    
5726 5727 5728 5729
    for (i=0; i<N; i++) {
	new_inames_in_env[i] = NULL;
    }

5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741
    // begin child (unless transactionless)
    if (using_txns) {
	rval = toku_txn_begin(env, txn, &child, DB_TXN_NOSYNC, 1);
	assert(rval == 0);
	xid = toku_txn_get_txnid(db_txn_struct_i(child)->tokutxn);
    }
    for (i = 0; i < N; i++) {
	char * dname = dbs[i]->i->dname;
	toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
	// now create new iname
	char hint[strlen(dname) + 1];
	create_iname_hint(dname, hint);
5742
	char * new_iname = create_iname(env, xid, hint, i);               // allocates memory for iname_in_env
5743
	new_inames_in_env[i] = new_iname;
5744
        toku_fill_dbt(&iname_dbt, new_iname, strlen(new_iname) + 1);      // iname_in_env goes in directory
5745 5746
        rval = toku_db_put(env->i->directory, child, &dname_dbt, &iname_dbt, DB_YESOVERWRITE);  // DB_YESOVERWRITE necessary
	if (rval) break;
Yoni Fogel's avatar
Yoni Fogel committed
5747 5748 5749 5750 5751 5752
    }

    // Generate load log entries.
    if (!rval && using_txns) {
        TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
        int do_fsync = 0;
5753
        LSN *get_lsn = NULL;
Yoni Fogel's avatar
Yoni Fogel committed
5754 5755 5756
        for (i = 0; i < N; i++) {
            BRT brt  = dbs[i]->i->brt;
            //Fsync is necessary for the last one only.
5757 5758 5759 5760 5761
            if (i==N-1) {
                do_fsync = 1; //We only need a single fsync of logs.
                get_lsn  = load_lsn; //Set pointer to capture the last lsn.
            }
            rval = toku_brt_load(brt, ttxn, new_inames_in_env[i], do_fsync, get_lsn);
Yoni Fogel's avatar
Yoni Fogel committed
5762 5763
            if (rval) break;
        }
5764 5765 5766 5767 5768
    }
	
    if (using_txns) {
	// close txn
	if (rval == 0) {  // all well so far, commit child
5769
	    rval = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL);
5770 5771 5772
	    assert(rval==0);
	}
	else {         // abort child
5773
	    int r2 = toku_txn_abort(child, NULL, NULL);
5774 5775
	    assert(r2==0);
	    for (i=0; i<N; i++) {
5776 5777 5778 5779
		if (new_inames_in_env[i]) {
		    toku_free(new_inames_in_env[i]);
		    new_inames_in_env[i] = NULL;
		}
5780 5781 5782 5783 5784 5785 5786
	    }
	}
    }

    return rval;
}

5787
int
5788
locked_ydb_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[N], char * new_inames_in_env[N], LSN *load_lsn) {
5789
    toku_ydb_lock();
5790
    int r = ydb_load_inames(env, txn, N, dbs, new_inames_in_env, load_lsn);
5791 5792 5793 5794
    toku_ydb_unlock();
    return r;
}

5795 5796 5797 5798
// TODO 2216:  Patch out this (dangerous) function when loader is working and 
//             we don't need to test the low-level redirect anymore.
// for use by test programs only, just a wrapper around brt call:
int
5799
toku_test_db_redirect_dictionary(DB * db, char * dname_of_new_file, DB_TXN *dbtxn) {
5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813
    int r;
    DBT dname_dbt;
    DBT iname_dbt;
    char * new_iname_in_env;

    BRT brt = db->i->brt;
    TOKUTXN tokutxn = db_txn_struct_i(dbtxn)->tokutxn;

    toku_fill_dbt(&dname_dbt, dname_of_new_file, strlen(dname_of_new_file)+1);
    init_dbt_realloc(&iname_dbt);  // sets iname_dbt.data = NULL
    r = toku_db_get(db->dbenv->i->directory, dbtxn, &dname_dbt, &iname_dbt, 0);  // allocates memory for iname
    assert(r==0);
    new_iname_in_env = iname_dbt.data;

Yoni Fogel's avatar
Yoni Fogel committed
5814
    r = toku_dictionary_redirect(new_iname_in_env, brt, tokutxn);
5815 5816 5817 5818

    toku_free(new_iname_in_env);
    return r;
}
5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829

// Test-only function.
// Returns the lsn field of toku_logger_last_lsn() for env's logger, or the
// lsn field of ZERO_LSN when env is NULL or has no logger.
uint64_t
toku_test_get_latest_lsn(DB_ENV *env) {
    if (!env || !env->i->logger) {
        LSN none = ZERO_LSN;
        return none.lsn;
    }
    LSN latest = toku_logger_last_lsn(env->i->logger);
    return latest.lsn;
}