ydb.c 256 KB
Newer Older
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
1
/* -*- mode: C; c-basic-offset: 4 -*- */
2
#ident "Copyright (c) 2007-2009 Tokutek Inc.  All rights reserved."
Zardosht Kasheff's avatar
Zardosht Kasheff committed
3
 
4 5 6
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

const char *toku_patent_string = "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it.";
7
const char *toku_copyright_string = "Copyright (c) 2007-2009 Tokutek Inc.  All rights reserved.";
8

9
#include <toku_portability.h>
10
#include <toku_pthread.h>
11
#include <ctype.h>
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
12 13
#include <errno.h>
#include <limits.h>
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
14 15 16
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
17
#include <fcntl.h>
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
18
#include <sys/stat.h>
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
19 20
#include <sys/types.h>
#include <unistd.h>
21
#include <db.h>
22
#include "toku_assert.h"
23
#include "ydb.h"
24
#include "ydb-internal.h"
25
#include "brt-internal.h"
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
26
#include "cachetable.h"
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
27 28
#include "log.h"
#include "memory.h"
29
#include "dlmalloc.h"
30
#include "checkpoint.h"
31
#include "key.h"
32
#include "loader.h"
33
#include "indexer.h"
34
#include "ydb_load.h"
35
#include "brtloader.h"
Rich Prohaska's avatar
Rich Prohaska committed
36
#include "log_header.h"
Yoni Fogel's avatar
Yoni Fogel committed
37

38

39 40 41 42 43 44 45 46 47 48
// Choose the symbol names used for the env/db constructor functions.
// With TOKUTRACE defined they carry a _toku10 suffix; otherwise they use the
// plain db_env_create/db_create names, and the trace-file entry points are
// defined here as stubs that do nothing and return 0.
#ifdef TOKUTRACE
 #define DB_ENV_CREATE_FUN db_env_create_toku10
 #define DB_CREATE_FUN db_create_toku10
#else
 #define DB_ENV_CREATE_FUN db_env_create
 #define DB_CREATE_FUN db_create
 int toku_set_trace_file (char *fname __attribute__((__unused__))) { return 0; }
 int toku_close_trace_file (void) { return 0; } 
#endif

49
#define DB_ISOLATION_FLAGS (DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT | DB_SERIALIZABLE | DB_INHERIT_ISOLATION)
50

51 52 53 54 55 56 57 58 59 60 61 62 63 64
// Set when env is panicked, never cleared.
static int env_is_panicked = 0;

// Put the environment into the panicked state.
//   env   - environment to mark as panicked
//   cause - panic code; 0 is recorded as -1 so the panic is never lost
//   msg   - description; NULL is replaced by a generic message
// The cause is stored both in the file-level env_is_panicked flag and in the
// environment, which also receives a toku_strdup'd copy of the message.
static void
env_panic(DB_ENV * env, int cause, char * msg) {
    int recorded_cause = (cause != 0) ? cause : -1;  // if unknown cause, at least guarantee panic
    char *text = (msg != NULL) ? msg : "Unknown cause in env_panic\n";

    env_is_panicked = recorded_cause;
    env->i->is_panicked = recorded_cause;
    env->i->panic_string = toku_strdup(text);
}

65 66
// Accountability: operation counters available for debugging and for "show engine status"
static u_int64_t num_inserts;
67
static u_int64_t num_inserts_fail;
68
static u_int64_t num_deletes;
69
static u_int64_t num_deletes_fail;
70 71
static u_int64_t num_updates;
static u_int64_t num_updates_fail;
72 73
static u_int64_t num_updates_broadcast;
static u_int64_t num_updates_broadcast_fail;
74 75 76 77 78 79
static u_int64_t num_multi_inserts;
static u_int64_t num_multi_inserts_fail;
static u_int64_t num_multi_deletes;
static u_int64_t num_multi_deletes_fail;
static u_int64_t num_multi_updates;
static u_int64_t num_multi_updates_fail;
80 81
static u_int64_t num_point_queries;
static u_int64_t num_sequential_queries;
82 83 84 85
static u_int64_t num_db_open;
static u_int64_t num_db_close;
static u_int64_t num_open_dbs;
static u_int64_t max_open_dbs; 
86 87 88 89 90 91

static u_int64_t directory_read_locks;        /* total directory read locks taken */ 
static u_int64_t directory_read_locks_fail;   /* total directory read locks unable to be taken */ 
static u_int64_t directory_write_locks;       /* total directory write locks taken */ 
static u_int64_t directory_write_locks_fail;  /* total directory write locks unable to be taken */ 

92 93 94
static u_int64_t logsuppress;                // number of times logs are suppressed for empty table (2440)
static u_int64_t logsuppressfail;            // number of times unable to suppress logs for empty table (2440)
static time_t    startuptime;                // timestamp of system startup
95
// Most recently opened env, used for engine status on crash.
// Note there are likely to be races on this if you have multiple threads
// creating and closing environments in parallel.  It is declared volatile
// since at least that helps make sure the compiler doesn't optimize away
// certain code (e.g., if while debugging you write code that spins on
// most_recent_env, you'd like the compiler not to optimize your code away).
static DB_ENV * volatile most_recent_env;
96

97 98
static uint32_t  engine_status_enable = 1;   // if zero, suppress engine status output on failed assert, for test programs only

99 100 101 102 103 104
static void
init_status_info(void) {
    num_inserts = 0;
    num_inserts_fail = 0;
    num_deletes = 0;
    num_deletes_fail = 0;
105 106
    num_updates = 0;
    num_updates_fail = 0;
107 108
    num_updates_broadcast = 0;
    num_updates_broadcast_fail = 0;
109 110 111 112 113 114
    num_multi_inserts = 0;
    num_multi_inserts_fail = 0;
    num_multi_deletes = 0;
    num_multi_deletes_fail = 0;
    num_multi_updates = 0;
    num_multi_updates_fail = 0;
115 116
    num_point_queries = 0;
    num_sequential_queries = 0;
117 118 119 120
    directory_read_locks = 0;
    directory_read_locks_fail = 0;
    directory_write_locks = 0;
    directory_write_locks_fail = 0;
121 122 123 124
    logsuppress = 0;
    logsuppressfail = 0;
    startuptime = time(NULL);
}
125

126 127 128
const char * environmentdictionary = "tokudb.environment";
const char * fileopsdirectory = "tokudb.directory";

129
static int env_get_iname(DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt);
130 131 132
static int toku_maybe_get_engine_status_text (char* buff, int buffsize);  // for use by toku_assert
static void toku_maybe_set_env_panic(int code, char * msg);               // for use by toku_assert

133

Yoni Fogel's avatar
Yoni Fogel committed
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
static const char single_process_lock_file[] = "/__tokudb_lock_dont_delete_me_";

// Take the single-process lock file for one directory.
//   lock_dir - directory that holds the lock file; NULL yields ENOENT
//   which    - suffix appended to the lock file name (also used in the
//              diagnostic message)
//   lockfd   - out: result of toku_os_lock_file() (negative on failure)
// Returns 0 on success, otherwise the errno left by toku_os_lock_file().
static int
single_process_lock(const char *lock_dir, const char *which, int *lockfd) {
    if (lock_dir == NULL)
        return ENOENT;

    // sizeof(single_process_lock_file) already counts the terminating NUL.
    int name_chars = strlen(lock_dir) + strlen(which);
    char lockfname[name_chars + sizeof(single_process_lock_file)];
    int written = snprintf(lockfname, sizeof(lockfname), "%s%s%s", lock_dir, single_process_lock_file, which);
    assert(written+1 == (signed)(sizeof(lockfname)));

    *lockfd = toku_os_lock_file(lockfname);
    if (*lockfd >= 0)
        return 0;

    int e = errno;  // capture before fprintf can change it
    fprintf(stderr, "Couldn't start tokudb because some other tokudb process is using the same directory [%s] for [%s]\n", lock_dir, which);
    return e;
}

// Release a lock taken by single_process_lock().
// *lockfd is always reset to -1.  If it held a non-negative descriptor, that
// descriptor is passed to toku_os_unlock_file().
// Returns 0 on success (or if nothing was held), otherwise errno.
static int
single_process_unlock(int *lockfd) {
    const int held_fd = *lockfd;
    *lockfd = -1;
    if (held_fd < 0)
        return 0;
    return (toku_os_unlock_file(held_fd) == 0) ? 0 : errno;
}
165

Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
166
/** The default maximum number of persistent locks in a lock tree  */
167
const u_int32_t __toku_env_default_max_locks = 0x7FFFFFFF;
168
const uint64_t __toku_env_default_max_lock_memory = 1000*1024;
Rich Prohaska's avatar
Rich Prohaska committed
169

170 171 172 173 174 175 176
// Zero every byte of *dbt, then set its flags to DB_DBT_REALLOC.
// Returns dbt so the call can be used inline as an argument.
static inline DBT*
init_dbt_realloc(DBT *dbt) {
    DBT *cleared = memset(dbt, 0, sizeof *dbt);
    cleared->flags = DB_DBT_REALLOC;
    return cleared;
}

177 178 179 180 181 182
// Callback used for redirecting dictionaries: stores brt as the BRT of db.
// It is handed to toku_brt_init() by toku_ydb_init().
static void
ydb_set_brt(DB *db, BRT brt) {
    db->i->brt = brt;
}

183 184
int 
toku_ydb_init(void) {
Yoni Fogel's avatar
Yoni Fogel committed
185 186 187
    int r = 0;
    //Lower level must be initialized first.
    if (r==0) 
188
        r = toku_brt_init(toku_ydb_lock, toku_ydb_unlock, ydb_set_brt);
Yoni Fogel's avatar
Yoni Fogel committed
189 190 191
    if (r==0) 
        r = toku_ydb_lock_init();
    return r;
192 193
}

194
// Do not clean up resources if env is panicked, just exit ugly
195 196
int 
toku_ydb_destroy(void) {
Yoni Fogel's avatar
Yoni Fogel committed
197
    int r = 0;
198
    if (env_is_panicked == 0) {
Yoni Fogel's avatar
Yoni Fogel committed
199
        r = toku_ydb_lock_destroy();
200 201 202 203
	//Lower level must be cleaned up last.
	if (r==0)
	    r = toku_brt_destroy();
    }
Yoni Fogel's avatar
Yoni Fogel committed
204
    return r;
205 206
}

207 208 209 210 211
// Callback that ignores its key, value and extra arguments and returns 0.
static int
ydb_getf_do_nothing(DBT const* UU(key), DBT const* UU(val), void* UU(extra)) {
    return 0;
}

Rich Prohaska's avatar
Rich Prohaska committed
212 213
/* env methods */
static int toku_env_close(DB_ENV *env, u_int32_t flags);
214 215 216
static int toku_env_set_data_dir(DB_ENV * env, const char *dir);
static int toku_env_set_lg_dir(DB_ENV * env, const char *dir);
static int toku_env_set_tmp_dir(DB_ENV * env, const char *tmp_dir);
Rich Prohaska's avatar
Rich Prohaska committed
217

218 219
// Returns non-zero iff the environment has a cachetable, which this file
// treats as the mark of an opened environment.
static inline int
env_opened(DB_ENV *env) {
    return env->i->cachetable != 0;
}

223 224
// Initialize the environment's list of open transactions.
static void
env_init_open_txn(DB_ENV *env) {
    toku_list_init(&env->i->open_txns);
}

228
// add a txn to the list of open txn's
229 230
static void 
env_add_open_txn(DB_ENV *env, DB_TXN *txn) {
231
    toku_list_push(&env->i->open_txns, (struct toku_list *) (void *) &txn->open_txns);
232 233 234
}

// remove a txn from the list of open txn's
235 236
static void 
env_remove_open_txn(DB_ENV *UU(env), DB_TXN *txn) {
237
    toku_list_remove((struct toku_list *) (void *) &txn->open_txns);
Rich Prohaska's avatar
Rich Prohaska committed
238 239
}

240 241
static int toku_txn_abort(DB_TXN * txn, TXN_PROGRESS_POLL_FUNCTION, void*,
			  bool release_multi_operation_client_lock);
242

243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275
// Write a timestamped "space is low" warning to stderr and flush it.
// The env argument is unused.
static void
env_fs_report_in_yellow(DB_ENV *UU(env)) {
    time_t tnow = time(NULL);
    char tbuf[26];  // buffer handed to ctime_r
    const char *stamp = ctime_r(&tnow, tbuf);
    // %.24s prints the timestamp without ctime's trailing newline
    fprintf(stderr, "%.24s Tokudb file system space is low\n", stamp);
    fflush(stderr);
}

// Write a timestamped "space is really low, access restricted" warning to
// stderr and flush it.  The env argument is unused.
static void
env_fs_report_in_red(DB_ENV *UU(env)) {
    time_t tnow = time(NULL);
    char tbuf[26];  // buffer handed to ctime_r
    const char *stamp = ctime_r(&tnow, tbuf);
    // %.24s prints the timestamp without ctime's trailing newline
    fprintf(stderr, "%.24s Tokudb file system space is really low and access is restricted\n", stamp);
    fflush(stderr);
}

// Size of the red zone for a file system of `total` bytes:
// env->i->redzone percent of total (integer arithmetic, multiply then divide).
static inline uint64_t
env_fs_redzone(DB_ENV *env, uint64_t total) {
    uint64_t scaled = total * env->i->redzone;
    return scaled / 100;
}

#define ZONEREPORTLIMIT 12
// Check the available space in the file systems used by tokudb and erect barriers when available space gets low.
static int
env_fs_poller(void *arg) {
    if (0) printf("%s:%d %p\n", __FUNCTION__, __LINE__, arg);
 
    DB_ENV *env = (DB_ENV *) arg;
    int r;
#if 0
    // get the cachetable size limit (not yet needed)
    uint64_t cs = toku_cachetable_get_size_limit(env->i->cachetable);
#endif

    int in_yellow; // set true to issue warning to user
276
    int in_red;    // set true to prevent certain operations (returning ENOSPC)
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310

    // get the fs sizes for the home dir
    uint64_t avail_size, total_size;
    r = toku_get_filesystem_sizes(env->i->dir, &avail_size, NULL, &total_size);
    assert(r == 0);
    if (0) fprintf(stderr, "%s %"PRIu64" %"PRIu64"\n", env->i->dir, avail_size, total_size);
    in_yellow = (avail_size < 2 * env_fs_redzone(env, total_size));
    in_red = (avail_size < env_fs_redzone(env, total_size));
    
    // get the fs sizes for the data dir if different than the home dir
    if (strcmp(env->i->dir, env->i->real_data_dir) != 0) {
        r = toku_get_filesystem_sizes(env->i->real_data_dir, &avail_size, NULL, &total_size);
        assert(r == 0);
        if (0) fprintf(stderr, "%s %"PRIu64" %"PRIu64"\n", env->i->real_data_dir, avail_size, total_size);
        in_yellow += (avail_size < 2 * env_fs_redzone(env, total_size));
        in_red += (avail_size < env_fs_redzone(env, total_size));
    }

    // get the fs sizes for the log dir if different than the home dir and data dir
    if (strcmp(env->i->dir, env->i->real_log_dir) != 0 && strcmp(env->i->real_data_dir, env->i->real_log_dir) != 0) {
        r = toku_get_filesystem_sizes(env->i->real_log_dir, &avail_size, NULL, &total_size);
        assert(r == 0);
        if (0) fprintf(stderr, "%s %"PRIu64" %"PRIu64"\n", env->i->real_log_dir, avail_size, total_size);
        in_yellow += (avail_size < 2 * env_fs_redzone(env, total_size));
        in_red += (avail_size < env_fs_redzone(env, total_size));
    }

    
    env->i->fs_seq++;                    // how many times through this polling loop?
    uint64_t now = env->i->fs_seq;

    // Don't issue report if we have not been out of this fs_state for a while, unless we're at system startup
    switch (env->i->fs_state) {
    case FS_RED:
Barry Perlman's avatar
Barry Perlman committed
311
        if (!in_red) {
312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
	    if (in_yellow) {
		env->i->fs_state = FS_YELLOW;
	    } else {
		env->i->fs_state = FS_GREEN;
	    }
	}
        break;
    case FS_YELLOW:
        if (in_red) {
	    if ((now - env->i->last_seq_entered_red > ZONEREPORTLIMIT) || (now < ZONEREPORTLIMIT))
		env_fs_report_in_red(env);
            env->i->fs_state = FS_RED;
	    env->i->last_seq_entered_red = now;
        } else if (!in_yellow) {
            env->i->fs_state = FS_GREEN;
        }
        break;
    case FS_GREEN:
        if (in_red) {
	    if ((now - env->i->last_seq_entered_red > ZONEREPORTLIMIT) || (now < ZONEREPORTLIMIT))
		env_fs_report_in_red(env);
            env->i->fs_state = FS_RED;
	    env->i->last_seq_entered_red = now;
        } else if (in_yellow) {
	    if ((now - env->i->last_seq_entered_yellow > ZONEREPORTLIMIT) || (now < ZONEREPORTLIMIT))
		env_fs_report_in_yellow(env);
            env->i->fs_state = FS_YELLOW;
	    env->i->last_seq_entered_yellow = now;
        }
        break;
    }
    return 0;
}
#undef ZONEREPORTLIMIT

// Set the file-system-space monitoring defaults for a new environment:
// state starts GREEN, the poller is marked not yet initialized, the poll
// period is 5 seconds and the red zone is 5 percent of total space.
static void
env_fs_init(DB_ENV *env) {
    env->i->fs_poller_is_init = FALSE;
    env->i->fs_state = FS_GREEN;
    env->i->redzone = 5;       // percent of total space
    env->i->fs_poll_time = 5;  // seconds
}

// Initialize the minicron that polls file system space.
// Registers env_fs_poller with env as its argument, using the environment's
// fs_poll_time as the period, and marks the poller as initialized.
// Setup failure trips an assert, so the value returned is 0.
static int
env_fs_init_minicron(DB_ENV *env) {
    int setup_result = toku_minicron_setup(&env->i->fs_poller, env->i->fs_poll_time, env_fs_poller, env);
    assert(setup_result == 0);
    env->i->fs_poller_is_init = TRUE;
    return setup_result;
}

// Destroy the file system space minicron.
// Does nothing if the poller was never initialized; otherwise shuts it down
// (asserting success) and clears the initialized flag.
static void
env_fs_destroy(DB_ENV *env) {
    if (!env->i->fs_poller_is_init)
        return;
    int shutdown_result = toku_minicron_shutdown(&env->i->fs_poller);
    assert(shutdown_result == 0);
    env->i->fs_poller_is_init = FALSE;
}

// Check if the available file system space is less than the reserve.
// Returns ENOSPC if the environment's fs_state is FS_RED (not enough space),
// otherwise 0.  Each ENOSPC result also increments enospc_redzone_ctr.
static inline int
env_check_avail_fs_space(DB_ENV *env) {
    if (env->i->fs_state != FS_RED)
        return 0;
    env->i->enospc_redzone_ctr++;
    return ENOSPC;
}

383 384 385 386 387 388
// Externally visible wrapper around env_check_avail_fs_space():
// returns ENOSPC when the environment is in the red zone, otherwise 0.
int
toku_ydb_check_avail_fs_space(DB_ENV *env) {
    return env_check_avail_fs_space(env);
}

Rich Prohaska's avatar
Rich Prohaska committed
389 390
/* db methods */
// Returns non-zero iff the db's internal `opened` field is non-zero.
static inline int db_opened(DB *db) {
    return db->i->opened != 0;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
394

Rich Prohaska's avatar
Rich Prohaska committed
395
static int toku_db_put(DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags);
396 397
static int toku_db_update(DB *db, DB_TXN *txn, const DBT *key, const DBT *update_function_extra, u_int32_t flags);
static int toku_db_update_broadcast(DB *db, DB_TXN *txn, const DBT *update_function_extra, u_int32_t flags);
Rich Prohaska's avatar
Rich Prohaska committed
398
static int toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags);
399
static int toku_db_cursor(DB *db, DB_TXN * txn, DBC **c, u_int32_t flags, int is_temporary_cursor);
Rich Prohaska's avatar
Rich Prohaska committed
400 401 402

/* txn methods */

403
/* lightweight cursor methods. */
404 405 406
static int toku_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);

static int toku_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
Yoni Fogel's avatar
Yoni Fogel committed
407

408
static int toku_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
Yoni Fogel's avatar
Yoni Fogel committed
409

410
static int toku_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
Yoni Fogel's avatar
Yoni Fogel committed
411

412 413
static int toku_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra);
Yoni Fogel's avatar
Yoni Fogel committed
414

415 416
static int toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra);
static int toku_c_getf_set_range(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra);
417
static int toku_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra);
418

Yoni Fogel's avatar
Yoni Fogel committed
419 420
// Effect: Lightweight cursor get

Rich Prohaska's avatar
Rich Prohaska committed
421 422 423 424 425
/* cursor methods */
static int toku_c_get(DBC * c, DBT * key, DBT * data, u_int32_t flag);
static int toku_c_del(DBC *c, u_int32_t flags);
static int toku_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags);
static int toku_c_close(DBC * c);
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
426

427
static void
428 429 430
env_setup_real_dir(DB_ENV *env, char **real_dir, const char *nominal_dir) {
    toku_free(*real_dir);
    *real_dir = NULL;
431 432

    assert(env->i->dir);
433 434
    if (nominal_dir) 
	*real_dir = toku_construct_full_name(2, env->i->dir, nominal_dir);
435
    else
436 437 438 439 440 441
        *real_dir = toku_strdup(env->i->dir);
}

// Recompute env->i->real_data_dir from the configured data_dir.
static void
env_setup_real_data_dir(DB_ENV *env) {
    env_setup_real_dir(env, &env->i->real_data_dir, env->i->data_dir);
}

// Recompute env->i->real_log_dir from the configured lg_dir.
static void
env_setup_real_log_dir(DB_ENV *env) {
    env_setup_real_dir(env, &env->i->real_log_dir, env->i->lg_dir);
}

449 450 451 452 453
// Recompute env->i->real_tmp_dir from the configured tmp_dir.
static void
env_setup_real_tmp_dir(DB_ENV *env) {
    env_setup_real_dir(env, &env->i->real_tmp_dir, env->i->tmp_dir);
}

454 455
// Run recovery for the environment.
// Requires real_log_dir to be set.  toku_ydb_unlock() is called before
// tokudb_recover() and toku_ydb_lock() after it, so recovery runs outside
// the ydb lock.  Returns the result of tokudb_recover().
static int
ydb_do_recovery (DB_ENV *env) {
    assert(env->i->real_log_dir);
    toku_ydb_unlock();
    int r = tokudb_recover(env->i->dir, env->i->real_log_dir, env->i->bt_compare,
                           env->i->update_function,
                           env->i->generate_row_for_put, env->i->generate_row_for_del,
                           env->i->cachetable_size);
    toku_ydb_lock();
    return r;
}

466 467
// Returns DB_RUNRECOVERY if tokudb_needs_recovery() reports that the log in
// real_log_dir needs recovery, otherwise 0.  Requires real_log_dir to be set.
static int
needs_recovery (DB_ENV *env) {
    assert(env->i->real_log_dir);
    int recovery_needed = tokudb_needs_recovery(env->i->real_log_dir, TRUE);
    return recovery_needed ? DB_RUNRECOVERY : 0;
}

473 474 475 476
static int toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags);
static int toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode);
static int toku_env_txn_checkpoint(DB_ENV * env, u_int32_t kbyte, u_int32_t min, u_int32_t flags);
static int toku_db_close(DB * db, u_int32_t flags);
477
static int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags, int internal);
478
static int toku_txn_commit(DB_TXN * txn, u_int32_t flags, TXN_PROGRESS_POLL_FUNCTION, void*, bool release_multi_operation_client_lock);
479 480
static int db_open_iname(DB * db, DB_TXN * txn, const char *iname, u_int32_t flags, int mode);

481
static void finalize_file_removal(DICTIONARY_ID dict_id, void * extra);
482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505

// Instruct db to use the default (built-in) key comparison function
// by setting the flag bits in the db and brt structs.
// Returns EINVAL (via toku_ydb_do_error) if the db is already open or a key
// comparison function was already set; otherwise the result of reading and
// updating the brt flags.  key_compare_was_set is recorded only on success.
static int
db_use_builtin_key_cmp(DB *db) {
    HANDLE_PANICKED_DB(db);

    if (db_opened(db))
        return toku_ydb_do_error(db->dbenv, EINVAL, "Comparison functions cannot be set after DB open.\n");
    if (db->i->key_compare_was_set)
        return toku_ydb_do_error(db->dbenv, EINVAL, "Key comparison function already set.\n");

    u_int32_t brt_flags;
    int r = toku_brt_get_flags(db->i->brt, &brt_flags);
    if (r != 0)
        return r;

    r = toku_brt_set_flags(db->i->brt, brt_flags | TOKU_DB_KEYCMP_BUILTIN);
    if (r == 0)
        db->i->key_compare_was_set = TRUE;
    return r;
}

506 507
// Keys used in persistent environment dictionary:
// Following keys added in version 12
508
static const char * orig_env_ver_key = "original_version";
509
static const char * curr_env_ver_key = "current_version";  
510
// Following keys added in version 14, add more keys for future versions
511
static const char * creation_time_key         = "creation_time";
512 513 514
static const char * last_lsn_of_v13_key       = "last_lsn_of_v13";
static const char * upgrade_v14_time_key      = "upgrade_v14_time";      
static const char * upgrade_v14_footprint_key = "upgrade_v14_footprint";
515 516 517 518 519 520

// Values read from (or written into) persistent environment,
// kept here for read-only access from engine status.
static uint32_t persistent_original_env_version;
static uint32_t persistent_stored_env_version_at_startup;    // read from curr_env_ver_key, prev version as of this startup
static time_t   persistent_creation_time;
521 522 523
static uint64_t persistent_last_lsn_of_v13;
static time_t   persistent_upgrade_v14_time;
static uint64_t persistent_upgrade_v14_footprint;
524 525 526 527

// Requires: persistent environment dictionary is already open.
// Input arg is lsn of clean shutdown of previous version,
// or ZERO_LSN if no upgrade or if crash between log upgrade and here.
528
static int
529
maybe_upgrade_persistent_environment_dictionary(DB_ENV * env, DB_TXN * txn, LSN last_lsn_of_clean_shutdown_read_from_log) {
530 531
    int r;
    DBT key, val;
532
    DB *persistent_environment = env->i->persistent_environment;
533

534
    toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
535
    toku_init_dbt(&val);
536
    r = toku_db_get(persistent_environment, txn, &key, &val, 0);
537
    assert(r == 0);
538 539
    uint32_t stored_env_version = toku_dtoh32(*(uint32_t*)val.data);
    persistent_stored_env_version_at_startup = stored_env_version;
540
    if (stored_env_version > BRT_LAYOUT_VERSION)
541
	r = TOKUDB_DICTIONARY_TOO_NEW;
542 543 544
    else if (stored_env_version < BRT_LAYOUT_MIN_SUPPORTED_VERSION)
	r = TOKUDB_DICTIONARY_TOO_OLD;
    else if (stored_env_version < BRT_LAYOUT_VERSION) {
545
        const uint32_t curr_env_ver_d = toku_htod32(BRT_LAYOUT_VERSION);
546
        toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
547
        toku_fill_dbt(&val, &curr_env_ver_d, sizeof(curr_env_ver_d));
548
        r = toku_db_put(persistent_environment, txn, &key, &val, 0);
549 550
        assert(r==0);
	
551 552 553
	uint64_t last_lsn_of_v13_d = toku_htod64(last_lsn_of_clean_shutdown_read_from_log.lsn);
	toku_fill_dbt(&key, last_lsn_of_v13_key, strlen(last_lsn_of_v13_key));
	toku_fill_dbt(&val, &last_lsn_of_v13_d, sizeof(last_lsn_of_v13_d));
554
	r = toku_db_put(persistent_environment, txn, &key, &val, 0);
555 556
        assert(r==0);
	
557 558 559
	time_t upgrade_v14_time_d = toku_htod64(time(NULL));
	toku_fill_dbt(&key, upgrade_v14_time_key, strlen(upgrade_v14_time_key));
	toku_fill_dbt(&val, &upgrade_v14_time_d, sizeof(upgrade_v14_time_d));
560
	r = toku_db_put(persistent_environment, txn, &key, &val, DB_NOOVERWRITE);
561
        assert(r==0);
562

563 564 565
	uint64_t upgrade_v14_footprint_d = toku_htod64(toku_log_upgrade_get_footprint());
	toku_fill_dbt(&key, upgrade_v14_footprint_key, strlen(upgrade_v14_footprint_key));
	toku_fill_dbt(&val, &upgrade_v14_footprint_d, sizeof(upgrade_v14_footprint_d));
566 567
	r = toku_db_put(persistent_environment, txn, &key, &val, DB_NOOVERWRITE);
        assert(r==0);
568
    }
569
    return r;
570 571
}

572

Barry Perlman's avatar
Barry Perlman committed
573
// Capture contents of persistent_environment dictionary so that it can be read by engine status
574
static void
Barry Perlman's avatar
Barry Perlman committed
575
capture_persistent_env_contents (DB_ENV * env, DB_TXN * txn) {
576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594
    int r;
    DBT key, val;
    DB *persistent_environment = env->i->persistent_environment;

    toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
    toku_init_dbt(&val);
    r = toku_db_get(persistent_environment, txn, &key, &val, 0);
    assert(r == 0);
    uint32_t curr_env_version = toku_dtoh32(*(uint32_t*)val.data);
    assert(curr_env_version == BRT_LAYOUT_VERSION);

    toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
    toku_init_dbt(&val);
    r = toku_db_get(persistent_environment, txn, &key, &val, 0);
    assert(r == 0);
    persistent_original_env_version = toku_dtoh32(*(uint32_t*)val.data);
    assert(persistent_original_env_version <= curr_env_version);

    // make no assertions about timestamps, clock may have been reset
595
    if (persistent_original_env_version >= BRT_LAYOUT_VERSION_14) {
596 597 598 599 600 601 602 603 604 605
	toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key));
	toku_init_dbt(&val);
	r = toku_db_get(persistent_environment, txn, &key, &val, 0);
	assert(r == 0);
	persistent_creation_time = toku_dtoh64((*(time_t*)val.data));
    }

    if (persistent_original_env_version != curr_env_version) {
	// an upgrade was performed at some time, capture info about the upgrade
	
606
	toku_fill_dbt(&key, last_lsn_of_v13_key, strlen(last_lsn_of_v13_key));
607 608 609
	toku_init_dbt(&val);
	r = toku_db_get(persistent_environment, txn, &key, &val, 0);
	assert(r == 0);
610
	persistent_last_lsn_of_v13 = toku_dtoh64(*(uint32_t*)val.data);
611

612
	toku_fill_dbt(&key, upgrade_v14_time_key, strlen(upgrade_v14_time_key));
613 614 615
	toku_init_dbt(&val);
	r = toku_db_get(persistent_environment, txn, &key, &val, 0);
	assert(r == 0);
616
	persistent_upgrade_v14_time = toku_dtoh64((*(time_t*)val.data));
617

618
	toku_fill_dbt(&key, upgrade_v14_footprint_key, strlen(upgrade_v14_footprint_key));
619 620 621
	toku_init_dbt(&val);
	r = toku_db_get(persistent_environment, txn, &key, &val, 0);
	assert(r == 0);
622
	persistent_upgrade_v14_footprint = toku_dtoh64((*(uint64_t*)val.data));
623 624 625 626 627 628 629
    }

}




630 631 632
// return 0 if log exists or ENOENT if log does not exist
// (the result of tokudb_recover_log_exists() on the env's real_log_dir)
static int
ydb_recover_log_exists(DB_ENV *env) {
    return tokudb_recover_log_exists(env->i->real_log_dir);
}


// Validate that all required files are present, no side effects.
639 640
// Return 0 if all is well, ENOENT if some files are present but at least one is missing, 
// other non-zero value if some other error occurs.
641 642 643
// Set *valid_newenv if creating a new environment (all files missing).
// (Note, if special dictionaries exist, then they were created transactionally and log should exist.)
static int 
644
validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) {
645
    int r;
646
    BOOL expect_newenv = FALSE;        // set true if we expect to create a new env
647 648
    toku_struct_stat buf;
    char* path = NULL;
649

650
    // Test for persistent environment
Yoni Fogel's avatar
Yoni Fogel committed
651
    path = toku_construct_full_name(2, env->i->dir, environmentdictionary);
652 653
    assert(path);
    r = toku_stat(path, &buf);
654
    int stat_errno = errno;
655
    toku_free(path);
656 657 658
    if (r == 0) {
	expect_newenv = FALSE;  // persistent info exists
    }
659
    else if (stat_errno == ENOENT) {
660 661
	expect_newenv = TRUE;
	r = 0;
662 663
    }
    else {
664 665
	r = toku_ydb_do_error(env, errno, "Unable to access persistent environment\n");
	assert(r);
666
    }
667

668
    // Test for existence of rollback cachefile if it is expected to exist
669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685
    if (r == 0 && need_rollback_cachefile) {
	path = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME);
	assert(path);
	r = toku_stat(path, &buf);
	stat_errno = errno;
	toku_free(path);
	if (r == 0) {  
	    if (expect_newenv)  // rollback cachefile exists, but persistent env is missing
		r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n");
	}
	else if (stat_errno == ENOENT) {
	    if (!expect_newenv)  // rollback cachefile is missing but persistent env exists
		r = toku_ydb_do_error(env, ENOENT, "rollback cachefile directory is missing\n");
	    else 
		r = 0;           // both rollback cachefile and persistent env are missing
	}
	else {
686
	    r = toku_ydb_do_error(env, stat_errno, "Unable to access rollback cachefile\n");
687 688 689 690
	    assert(r);
	}
    }

691 692
    // Test for fileops directory
    if (r == 0) {
Yoni Fogel's avatar
Yoni Fogel committed
693
	path = toku_construct_full_name(2, env->i->dir, fileopsdirectory);
694 695
	assert(path);
	r = toku_stat(path, &buf);
696
	stat_errno = errno;
697 698 699 700 701
	toku_free(path);
	if (r == 0) {  
	    if (expect_newenv)  // fileops directory exists, but persistent env is missing
		r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n");
	}
702
	else if (stat_errno == ENOENT) {
703 704 705 706 707 708
	    if (!expect_newenv)  // fileops directory is missing but persistent env exists
		r = toku_ydb_do_error(env, ENOENT, "Fileops directory is missing\n");
	    else 
		r = 0;           // both fileops directory and persistent env are missing
	}
	else {
709
	    r = toku_ydb_do_error(env, stat_errno, "Unable to access fileops directory\n");
710 711 712 713 714
	    assert(r);
	}
    }

    // Test for recovery log
715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732
    if ((r == 0) && (env->i->open_flags & DB_INIT_LOG)) {
	// if using transactions, test for existence of log
	r = ydb_recover_log_exists(env);  // return 0 or ENOENT
	if (expect_newenv && (r != ENOENT))
	    r = toku_ydb_do_error(env, ENOENT, "Persistent environment information is missing (but log exists)\n");
	else if (!expect_newenv && r == ENOENT)
	    r = toku_ydb_do_error(env, ENOENT, "Recovery log is missing (persistent environment information is present)\n");
	else
	    r = 0;
    }

    if (r == 0)
	*valid_newenv = expect_newenv;
    else 
	*valid_newenv = FALSE;
    return r;
}

Barry Perlman's avatar
Barry Perlman committed
733 734 735 736 737

// The version of the environment (on disk) is the version of the recovery log.  
// If the recovery log is of the current version, then there is no upgrade to be done.  
// If the recovery log is of an old version, then replacing it with a new recovery log
// of the current version is how the upgrade is done.  
Barry Perlman's avatar
Barry Perlman committed
738
// Note, the upgrade procedure takes a checkpoint, so we must release the ydb lock.
739
static int
740
ydb_maybe_upgrade_env (DB_ENV *env, LSN * last_lsn_of_clean_shutdown_read_from_log, BOOL * upgrade_in_progress) {
741 742 743
    int r = 0;
    if (env->i->open_flags & DB_INIT_TXN && env->i->open_flags & DB_INIT_LOG) {
        toku_ydb_unlock();
744
        r = toku_maybe_upgrade_log(env->i->dir, env->i->real_log_dir, last_lsn_of_clean_shutdown_read_from_log, upgrade_in_progress);
745 746 747 748 749 750
        toku_ydb_lock();
    }
    return r;
}


Yoni Fogel's avatar
Yoni Fogel committed
751 752 753 754 755 756 757 758 759 760 761 762
// Release the single-process locks on the environment, data, log and tmp
// directories, in that order.  Every release is checked with lazy_assert.
static void
unlock_single_process(DB_ENV *env) {
    int r = single_process_unlock(&env->i->envdir_lockfd);
    lazy_assert(r==0);

    r = single_process_unlock(&env->i->datadir_lockfd);
    lazy_assert(r==0);

    r = single_process_unlock(&env->i->logdir_lockfd);
    lazy_assert(r==0);

    r = single_process_unlock(&env->i->tmpdir_lockfd);
    lazy_assert(r==0);
}
763 764 765 766 767 768 769

// Open the environment.
// If this is a new environment, then create the necessary files.
// Return 0 on success, ENOENT if any of the expected necessary files are missing.
// (The set of necessary files is defined in the function validate_env() above.)
//
// Parameters:
//   env    the environment handle; must not already be open (EINVAL otherwise).
//   home   environment directory; NULL means ".".
//   flags  DB_PRIVATE and DB_CREATE are required; DB_INIT_LOG requires DB_INIT_TXN.
//   mode   stored in env->i->open_mode and passed to db_open_iname() for the
//          internal dictionaries.
//
// Outline:
//   1. validate flags and check that home exists;
//   2. record home/flags/mode and take the four single-process directory locks;
//   3. possibly upgrade the log, then validate that the env is either new or complete;
//   4. run recovery (DB_RECOVER) or check that none is needed;
//   5. open (or discard) the logger, create the cachetable, open the rollback file;
//   6. open the persistent-environment and directory dictionaries in one txn;
//   7. checkpoint (with the ydb lock released) and start the file-system poller.
//
// On any failure that reaches the cleanup label the directory locks are released.
// The "already open" rejection returns directly instead, so that a redundant open
// call cannot release the locks held by the live environment.
static int 
toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
    HANDLE_PANICKED_ENV(env);
    int r;
    BOOL newenv;  // true iff creating a new environment
    u_int32_t unused_flags=flags;

    if (env_opened(env)) {
        // Do NOT go through cleanup: that would call unlock_single_process() and drop
        // the directory locks that the already-open environment legitimately holds.
        return toku_ydb_do_error(env, EINVAL, "The environment is already open\n");
    }

    most_recent_env = NULL;

    assert(sizeof(time_t) == sizeof(uint64_t));

    HANDLE_EXTRA_FLAGS(env, flags, 
                       DB_CREATE|DB_PRIVATE|DB_INIT_LOG|DB_INIT_TXN|DB_RECOVER|DB_INIT_MPOOL|DB_INIT_LOCK|DB_THREAD);


    // DB_CREATE means create if env does not exist, and TokuDB requires it because
    // TokuDB requires DB_PRIVATE.
    if ((flags & DB_PRIVATE) && !(flags & DB_CREATE)) {
        r = toku_ydb_do_error(env, ENOENT, "DB_PRIVATE requires DB_CREATE (seems gratuitous to us, but that's BDB's behavior\n");
        goto cleanup;
    }

    if (!(flags & DB_PRIVATE)) {
        r = toku_ydb_do_error(env, ENOENT, "TokuDB requires DB_PRIVATE\n");
        goto cleanup;
    }

    if ((flags & DB_INIT_LOG) && !(flags & DB_INIT_TXN)) {
        r = toku_ydb_do_error(env, EINVAL, "TokuDB requires transactions for logging\n");
        goto cleanup;
    }

    if (!home) home = ".";

    // Verify that the home exists.
    {
        char* new_home = NULL;
        toku_struct_stat buf;
        size_t home_len = strlen(home);
        if (home_len > 1 && home[home_len-1] == '\\') {
            // Stat a copy of home with the single trailing backslash removed.
            new_home = toku_malloc(home_len);
            if (new_home == NULL) {
                r = toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
                goto cleanup;
            }
            memcpy(new_home, home, home_len);
            new_home[home_len - 1] = 0;
        }
        r = toku_stat(new_home ? new_home : home, &buf);
        int stat_errno = errno;   // capture before toku_free() gets a chance to disturb errno
        if (new_home) {
            toku_free(new_home);
        }
        if (r!=0) {
            r = toku_ydb_do_error(env, stat_errno, "Error from toku_stat(\"%s\",...)\n", home);
            goto cleanup;
        }
    }
    unused_flags &= ~DB_PRIVATE;

    if (env->i->dir)
        toku_free(env->i->dir);
    env->i->dir = toku_strdup(home);
    if (env->i->dir == 0) {
        r = toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
        goto cleanup;
    }
    if (0) {
        // Error exit used by the logger/cachetable failures below: forget the home dir.
        died1:
        toku_free(env->i->dir);
        env->i->dir = NULL;
        goto cleanup;
    }
    env->i->open_flags = flags;
    env->i->open_mode = mode;

    env_setup_real_data_dir(env);
    env_setup_real_log_dir(env);
    env_setup_real_tmp_dir(env);

    r = single_process_lock(env->i->dir, "environment", &env->i->envdir_lockfd);
    if (r!=0) goto cleanup;
    r = single_process_lock(env->i->real_data_dir, "data", &env->i->datadir_lockfd);
    if (r!=0) goto cleanup;
    r = single_process_lock(env->i->real_log_dir, "logs", &env->i->logdir_lockfd);
    if (r!=0) goto cleanup;
    r = single_process_lock(env->i->real_tmp_dir, "temp", &env->i->tmpdir_lockfd);
    if (r!=0) goto cleanup;


    BOOL need_rollback_cachefile = FALSE;
    if (flags & (DB_INIT_TXN | DB_INIT_LOG)) {
        need_rollback_cachefile = TRUE;
    }

    init_status_info();  // do this before possibly upgrading, so upgrade work is counted in status counters

    LSN last_lsn_of_clean_shutdown_read_from_log = ZERO_LSN;
    BOOL upgrade_in_progress = FALSE;
    r = ydb_maybe_upgrade_env(env, &last_lsn_of_clean_shutdown_read_from_log, &upgrade_in_progress);
    if (r!=0) goto cleanup;

    if (upgrade_in_progress) {
        // Delete old rollback file.  There was a clean shutdown, so it has nothing useful,
        // and there is no value in upgrading it.  It is simpler to just create a new one.
        char* rollback_filename = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME);
        assert(rollback_filename);
        r = unlink(rollback_filename);
        int unlink_errno = errno;  // capture before toku_free() gets a chance to disturb errno
        toku_free(rollback_filename);
        assert(r==0 || unlink_errno==ENOENT);
        need_rollback_cachefile = FALSE;  // we're not expecting it to exist now
    }
    
    r = validate_env(env, &newenv, need_rollback_cachefile);  // make sure that environment is either new or complete
    if (r != 0) goto cleanup;

    unused_flags &= ~DB_INIT_TXN & ~DB_INIT_LOG;

    // do recovery only if there exists a log and recovery is requested
    // otherwise, a log is created when the logger is opened later
    if (!newenv) {
        if (flags & DB_INIT_LOG) {
            // the log does exist
            if (flags & DB_RECOVER) {
                r = ydb_do_recovery(env);
                if (r != 0) goto cleanup;
            } else {
                // the log is required to have clean shutdown if recovery is not requested
                r = needs_recovery(env);
                if (r != 0) goto cleanup;
            }
        }
    }
    
    toku_loader_cleanup_temp_files(env);

    if (flags & (DB_INIT_TXN | DB_INIT_LOG)) {
        assert(env->i->logger);
        toku_logger_write_log_files(env->i->logger, (BOOL)((flags & DB_INIT_LOG) != 0));
        r = toku_logger_open(env->i->real_log_dir, env->i->logger);
        if (r!=0) {
            toku_ydb_do_error(env, r, "Could not open logger\n");
        died2:
            // Also the error exit for a cachetable-creation failure below.
            toku_logger_close(&env->i->logger);
            goto died1;
        }
    } else {
        r = toku_logger_close(&env->i->logger); // if no logging system, then kill the logger
        assert(r==0);
    }

    unused_flags &= ~DB_INIT_MPOOL; // we always init an mpool.
    unused_flags &= ~DB_CREATE;     // we always do DB_CREATE
    unused_flags &= ~DB_INIT_LOCK;  // we check this later (e.g. in db->open)
    unused_flags &= ~DB_RECOVER;

// This is probably correct, but it will be pain...
//    if ((flags & DB_THREAD)==0) {
//	r = toku_ydb_do_error(env, EINVAL, "TokuDB requires DB_THREAD");
//	goto cleanup;
//    }
    unused_flags &= ~DB_THREAD;

    if (unused_flags!=0) {
        r = toku_ydb_do_error(env, EINVAL, "Extra flags not understood by tokudb: %u\n", unused_flags);
        goto cleanup;
    }

    r = toku_brt_create_cachetable(&env->i->cachetable, env->i->cachetable_size, ZERO_LSN, env->i->logger);
    if (r!=0) goto died2;
    toku_cachetable_set_lock_unlock_for_io(env->i->cachetable, toku_ydb_lock, toku_ydb_unlock);

    toku_cachetable_set_env_dir(env->i->cachetable, env->i->dir);

    int using_txns = env->i->open_flags & DB_INIT_TXN;
    if (env->i->logger) {
        // if this is a newborn env or if this is an upgrade, then create a brand new rollback file
        BOOL create_new_rollback_file = newenv | upgrade_in_progress;
        assert (using_txns);
        toku_logger_set_cachetable(env->i->logger, env->i->cachetable);
        toku_logger_set_remove_finalize_callback(env->i->logger, finalize_file_removal, env->i->ltm);
        r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, create_new_rollback_file);
        assert(r==0);
    }

    // One transaction (when transactions are in use) covers the creation/opening of
    // both internal dictionaries below.
    DB_TXN *txn=NULL;
    if (using_txns) {
        r = toku_txn_begin(env, 0, &txn, 0, 1);
        assert(r==0);
    }

    {
        r = toku_db_create(&env->i->persistent_environment, env, 0);
        assert(r==0);
        r = db_use_builtin_key_cmp(env->i->persistent_environment);
        assert(r==0);
        r = db_open_iname(env->i->persistent_environment, txn, environmentdictionary, DB_CREATE, mode);
        assert(r==0);
        if (newenv) {
            // create new persistent_environment
            DBT key, val;
            persistent_original_env_version = BRT_LAYOUT_VERSION;
            const uint32_t environment_version = toku_htod32(persistent_original_env_version);

            toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
            toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
            r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
            assert(r==0);

            toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
            toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
            r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
            assert(r==0);

            time_t creation_time_d = toku_htod64(time(NULL));
            toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key));
            toku_fill_dbt(&val, &creation_time_d, sizeof(creation_time_d));
            r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
            assert(r==0);
        }
        else {
            r = maybe_upgrade_persistent_environment_dictionary(env, txn, last_lsn_of_clean_shutdown_read_from_log);
            assert(r==0);
        }
        capture_persistent_env_contents(env, txn);
    }
    {
        r = toku_db_create(&env->i->directory, env, 0);
        assert(r==0);
        r = db_use_builtin_key_cmp(env->i->directory);
        assert(r==0);
        r = db_open_iname(env->i->directory, txn, fileopsdirectory, DB_CREATE, mode);
        assert(r==0);
    }
    if (using_txns) {
        r = toku_txn_commit(txn, 0, NULL, NULL, false);
        assert(r==0);
    }
    // The checkpoint is taken with the ydb lock released.
    toku_ydb_unlock();
    r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL);
    assert(r==0);
    toku_ydb_lock();
    env_fs_poller(env);          // get the file system state at startup
    env_fs_init_minicron(env); 
cleanup:
    if (r!=0) {
        if (env && env->i) {
            unlock_single_process(env);
        }
    }
    if (r == 0) {
        errno = 0; // tabula rasa.   If there's a crash after env was successfully opened, no misleading errno will have been left around by this code.
        most_recent_env = env;
        toku_assert_set_fpointers(toku_maybe_get_engine_status_text, toku_maybe_set_env_panic);
    }
    return r;
}
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
1026

1027

1028 1029
// Close the environment and free it.
//
// Refuses to proceed (and panics the env) if there are open transactions or fully
// open DBs.  Otherwise: closes the two internal dictionaries, shuts down the
// cachetable minicron, checkpoints, closes the rollback file, checkpoints again,
// shuts down the logger, closes the cachetable and the logger, releases the
// directory locks, and frees env->i and env.
//
// Returns 0 on success; EINVAL if flags is nonzero (the env has still been closed
// and freed in that case); otherwise the error that aborted the close.  On the
// error path the env is NOT freed: the directory locks are released and the env is
// panicked (or, if it was already panicked, the earlier panic is reported).
//
// Locking: the ydb lock is released while the minicron is shut down and the
// checkpoints run.  Every error exit taken inside that window goes through
// relock_and_panic so that the function always returns with the ydb lock in the
// same state as on the success path and on the other error paths (held).
static int 
toku_env_close(DB_ENV * env, u_int32_t flags) {
    int r = 0;
    char * err_msg = NULL;

    most_recent_env = NULL; // Set most_recent_env to NULL so that we don't have a dangling pointer (and if there's an error, the toku assert code would try to look at the env.)

    // if panicked, or if any open transactions, or any open dbs, then do nothing.

    if (toku_env_is_panicked(env)) goto panic_and_quit_early;
    if (!toku_list_empty(&env->i->open_txns)) {
        err_msg = "Cannot close environment due to open transactions\n";
        r = toku_ydb_do_error(env, EINVAL, "%s", err_msg);
        goto panic_and_quit_early;
    }
    { //Verify open dbs. Zombies are ok at this stage, fully open is not.
        uint32_t size = toku_omt_size(env->i->open_dbs);
        assert(size == env->i->num_open_dbs + env->i->num_zombie_dbs);
        if (env->i->num_open_dbs > 0) {
            err_msg = "Cannot close environment due to open DBs\n";
            r = toku_ydb_do_error(env, EINVAL, "%s", err_msg);
            goto panic_and_quit_early;
        }
    }
    {
        if (env->i->persistent_environment) {
            r = toku_db_close(env->i->persistent_environment, 0);
            if (r) {
                err_msg = "Cannot close persistent environment dictionary (DB->close error)\n";
                toku_ydb_do_error(env, r, "%s", err_msg);
                goto panic_and_quit_early;
            }
        }
        if (env->i->directory) {
            r = toku_db_close(env->i->directory, 0);
            if (r) {
                err_msg = "Cannot close Directory dictionary (DB->close error)\n";
                toku_ydb_do_error(env, r, "%s", err_msg);
                goto panic_and_quit_early;
            }
        }
    }
    if (env->i->cachetable) {
        toku_ydb_unlock();  // ydb lock must not be held when shutting down minicron
        toku_cachetable_minicron_shutdown(env->i->cachetable);
        if (env->i->logger) {
            r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL);
            if (r) {
                err_msg = "Cannot close environment (error during checkpoint)\n";
                toku_ydb_do_error(env, r, "%s", err_msg);
                goto relock_and_panic;
            }
            { //Verify open dbs. Neither Zombies nor fully open are ok at this stage.
                uint32_t size = toku_omt_size(env->i->open_dbs);
                assert(size == env->i->num_open_dbs + env->i->num_zombie_dbs);
                if (size > 0) {
                    err_msg = "Cannot close environment due to zombie DBs\n";
                    r = toku_ydb_do_error(env, EINVAL, "%s", err_msg);
                    goto relock_and_panic;
                }
            }
            r = toku_logger_close_rollback(env->i->logger, FALSE);
            if (r) {
                err_msg = "Cannot close environment (error during closing rollback cachefile)\n";
                toku_ydb_do_error(env, r, "%s", err_msg);
                goto relock_and_panic;
            }
            //Do a second checkpoint now that the rollback cachefile is closed.
            r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL);
            if (r) {
                err_msg = "Cannot close environment (error during checkpoint)\n";
                toku_ydb_do_error(env, r, "%s", err_msg);
                goto relock_and_panic;
            }
            r = toku_logger_shutdown(env->i->logger); 
            if (r) {
                err_msg = "Cannot close environment (error during logger shutdown)\n";
                toku_ydb_do_error(env, r, "%s", err_msg);
                goto relock_and_panic;
            }
        }
        toku_ydb_lock();
        r=toku_cachetable_close(&env->i->cachetable);
        if (r) {
            err_msg = "Cannot close environment (cachetable close error)\n";
            toku_ydb_do_error(env, r, "%s", err_msg);
            goto panic_and_quit_early;
        }
    }
    if (env->i->logger) {
        r=toku_logger_close(&env->i->logger);
        if (r) {
            err_msg = "Cannot close environment (logger close error)\n";
            env->i->logger = NULL;
            toku_ydb_do_error(env, r, "%s", err_msg);
            goto panic_and_quit_early;
        }
    }
    // Even if nothing else went wrong, but we were panicked, then raise an error.
    // But if something else went wrong then raise that error (above)
    if (toku_env_is_panicked(env))
        goto panic_and_quit_early;
    else
        assert(env->i->panic_string==0);

    env_fs_destroy(env);
    toku_ltm_close(env->i->ltm);
    if (env->i->data_dir)
        toku_free(env->i->data_dir);
    if (env->i->lg_dir)
        toku_free(env->i->lg_dir);
    if (env->i->tmp_dir)
        toku_free(env->i->tmp_dir);
    if (env->i->real_data_dir)
        toku_free(env->i->real_data_dir);
    if (env->i->real_log_dir)
        toku_free(env->i->real_log_dir);
    if (env->i->real_tmp_dir)
        toku_free(env->i->real_tmp_dir);
    if (env->i->open_dbs)
        toku_omt_destroy(&env->i->open_dbs);
    if (env->i->dir)
        toku_free(env->i->dir);
    //Immediately before freeing internal environment unlock the directories
    unlock_single_process(env);
    toku_free(env->i);
    env->i = NULL;
    toku_free(env);
    env = NULL;
    // Nonzero flags are rejected only after the close has been carried out.
    if (flags!=0)
        r = EINVAL;
    return r;

relock_and_panic:
    // Reached only from the window above in which the ydb lock was released:
    // take it back so that this exit leaves the lock held like every other exit.
    toku_ydb_lock();
panic_and_quit_early:
    //release lock files.
    unlock_single_process(env);
    //r is the panic error
    if (toku_env_is_panicked(env)) {
        char *panic_string = env->i->panic_string;
        r = toku_ydb_do_error(env, toku_env_is_panicked(env), "Cannot close environment due to previous error: %s\n", panic_string);
    }
    else {
        env_panic(env, r, err_msg);
    }
    return r;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1174

1175 1176
// Forward a log-archive request to the environment's logger and hand back
// whatever toku_logger_log_archive() returns.
static int 
toku_env_log_archive(DB_ENV * env, char **list[], u_int32_t flags) {
    int result = toku_logger_log_archive(env->i->logger, list, flags);
    return result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1179

1180 1181
// Flush the log.  The lsn argument is ignored: we just flush everything.
// MySQL uses lsn==0 which means flush everything.  For anyone else using the
// log, it is correct to flush too much, so we are OK.
// Returns the result of toku_logger_fsync().
static int 
toku_env_log_flush(DB_ENV * env, const DB_LSN * lsn __attribute__((__unused__))) {
    HANDLE_PANICKED_ENV(env);
    int result = toku_logger_fsync(env->i->logger);
    return result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1186

1187 1188
// Record the requested cachetable size (gbytes * 2^30 + bytes) in the environment.
// Returns EINVAL if ncache is not 1 or if the total does not fit in an
// unsigned long; returns 0 otherwise.
static int 
toku_env_set_cachesize(DB_ENV * env, u_int32_t gbytes, u_int32_t bytes, int ncache) {
    HANDLE_PANICKED_ENV(env);

    u_int64_t requested = ((u_int64_t) gbytes << 30) + bytes;
    unsigned long narrowed = requested;   // may truncate where unsigned long is narrower than 64 bits

    if (ncache != 1 || requested > narrowed) {
        return EINVAL;
    }
    env->i->cachetable_size = narrowed;
    return 0;
}

1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224
static int toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags);

// Run toku_env_dbremove() while holding the multi-operation client lock (so a
// checkpoint cannot begin) and, inside it, the ydb lock.  Both are released in
// reverse order before returning toku_env_dbremove()'s result.
static int
locked_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags) {
    int result;

    toku_multi_operation_client_lock();   // no checkpoint may begin from here ...
    toku_ydb_lock();
    result = toku_env_dbremove(env, txn, fname, dbname, flags);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); // ... until here

    return result;
}

static int toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags);

// Run toku_env_dbrename() while holding the multi-operation client lock (so a
// checkpoint cannot begin) and, inside it, the ydb lock.  Both are released in
// reverse order before returning toku_env_dbrename()'s result.
static int
locked_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) {
    int result;

    toku_multi_operation_client_lock();   // no checkpoint may begin from here ...
    toku_ydb_lock();
    result = toku_env_dbrename(env, txn, fname, dbname, newname, flags);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); // ... until here

    return result;
}


Rich Prohaska's avatar
Rich Prohaska committed
1225 1226
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR >= 3

1227 1228
// Report the configured cachetable size split into whole gigabytes (*gbytes)
// and the remaining low 30 bits (*bytes).  *ncache is always set to 1.
// Returns 0.
static int 
toku_env_get_cachesize(DB_ENV * env, u_int32_t *gbytes, u_int32_t *bytes, int *ncache) {
    HANDLE_PANICKED_ENV(env);

    *ncache = 1;
    *bytes  = env->i->cachetable_size & ((1<<30)-1);
    *gbytes = env->i->cachetable_size >> 30;

    return 0;
}

1236 1237
// toku_env_get_cachesize() wrapped in the ydb lock.
static int 
locked_env_get_cachesize(DB_ENV *env, u_int32_t *gbytes, u_int32_t *bytes, int *ncache) {
    toku_ydb_lock();
    int r = toku_env_get_cachesize(env, gbytes, bytes, ncache);
    toku_ydb_unlock();
    return r;
}
Rich Prohaska's avatar
Rich Prohaska committed
1240 1241
#endif

1242 1243
// Set the environment's data directory to a private copy of dir.
// May be called at most once, and only before the environment is opened.
//
// Returns 0 on success.  Reports through toku_ydb_do_error() and returns its result
// (called with EINVAL) when the env is already open, when dir is NULL, or when a
// data dir has already been set; likewise with ENOMEM when the copy cannot be
// allocated.
static int 
toku_env_set_data_dir(DB_ENV * env, const char *dir) {
    HANDLE_PANICKED_ENV(env);
    int r;
    
    if (env_opened(env)) {
        r = toku_ydb_do_error(env, EINVAL, "You cannot set the data dir after opening the env\n");
    }
    else if (!dir) {
        // A NULL dir is a different mistake from calling too late; say so.
        r = toku_ydb_do_error(env, EINVAL, "The data dir must be non-null\n");
    }
    else if (env->i->data_dir)
        r = toku_ydb_do_error(env, EINVAL, "You cannot set the data dir more than once.\n");
    else {
        env->i->data_dir = toku_strdup(dir);
        if (env->i->data_dir==NULL) {
            assert(errno == ENOMEM);
            r = toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
        }
        else r = 0;
    }
    return r;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1262

1263 1264
// Store errcall in the environment's errcall slot (env->i->errcall).
static void 
toku_env_set_errcall(DB_ENV * env, toku_env_errcall_t errcall) {
    env->i->errcall = errcall;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1267

1268 1269
// Store errfile in the environment's errfile slot (env->i->errfile).
static void 
toku_env_set_errfile(DB_ENV*env, FILE*errfile) {
    env->i->errfile = errfile;
}

1273 1274
// Store errpfx in the environment's errpfx slot (env->i->errpfx).
// The pointer itself is stored; the string is not copied here.
static void 
toku_env_set_errpfx(DB_ENV * env, const char *errpfx) {
    env->i->errpfx = errpfx;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1277

1278 1279
// Turn environment flags on (onoff != 0) or off (onoff == 0).
// DB_AUTO_COMMIT is the only flag acted upon.  Asking to turn ON any other flag
// is reported as EINVAL; asking to turn other flags OFF is accepted and ignored.
// Returns 0 on success.
static int 
toku_env_set_flags(DB_ENV * env, u_int32_t flags, int onoff) {
    HANDLE_PANICKED_ENV(env);

    const u_int32_t change = (flags & DB_AUTO_COMMIT) ? DB_AUTO_COMMIT : 0;
    const u_int32_t others = flags & ~DB_AUTO_COMMIT;

    if (onoff && others != 0) {
        return toku_ydb_do_error(env, EINVAL, "TokuDB does not (yet) support any nonzero ENV flags other than DB_AUTO_COMMIT\n");
    }

    if (onoff) {
        env->i->open_flags |= change;
    } else {
        env->i->open_flags &= ~change;
    }
    return 0;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1294

1295 1296
// Pass the requested log buffer size to the environment's logger and return
// the result of toku_logger_set_lg_bsize().
static int 
toku_env_set_lg_bsize(DB_ENV * env, u_int32_t bsize) {
    HANDLE_PANICKED_ENV(env);
    int result = toku_logger_set_lg_bsize(env->i->logger, bsize);
    return result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1300

1301 1302
// Set (or, with dir == NULL, clear) the environment's log directory.
// Any previously stored value is freed first; a non-NULL dir is copied.
// Only allowed before the environment is opened (EINVAL otherwise).
// Returns 0 on success, or the result of toku_ydb_do_error() on failure
// (called with ENOMEM if the copy cannot be allocated).
static int 
toku_env_set_lg_dir(DB_ENV * env, const char *dir) {
    HANDLE_PANICKED_ENV(env);

    if (env_opened(env)) {
        return toku_ydb_do_error(env, EINVAL, "Cannot set log dir after opening the env\n");
    }

    if (env->i->lg_dir) {
        toku_free(env->i->lg_dir);
    }

    if (!dir) {
        env->i->lg_dir = NULL;
        return 0;
    }

    env->i->lg_dir = toku_strdup(dir);
    if (env->i->lg_dir) {
        return 0;
    }
    return toku_ydb_do_error(env, ENOMEM, "Out of memory\n");
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1318

1319 1320
// Pass the requested maximum log size to the environment's logger and return
// the result of toku_logger_set_lg_max().
static int 
toku_env_set_lg_max(DB_ENV * env, u_int32_t lg_max) {
    HANDLE_PANICKED_ENV(env);
    int result = toku_logger_set_lg_max(env->i->logger, lg_max);
    return result;
}

1325 1326
// Ask the environment's logger for its maximum log size (stored through
// lg_maxp by toku_logger_get_lg_max()) and return that call's result.
static int 
toku_env_get_lg_max(DB_ENV * env, u_int32_t *lg_maxp) {
    HANDLE_PANICKED_ENV(env);
    int result = toku_logger_get_lg_max(env->i->logger, lg_maxp);
    return result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1330

1331 1332
// Not supported: the detect argument is ignored and an EINVAL error is
// reported through toku_ydb_do_error().
static int 
toku_env_set_lk_detect(DB_ENV * env, u_int32_t detect) {
    HANDLE_PANICKED_ENV(env);
    (void) detect;  // intentionally unused
    return toku_ydb_do_error(env, EINVAL, "TokuDB does not (yet) support set_lk_detect\n");
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1337

1338 1339
// Set the maximum number of locks on the environment's lock tree manager.
// Only allowed before the environment is opened: returns EINVAL afterwards.
// Otherwise returns the result of toku_ltm_set_max_locks().
static int 
toku_env_set_lk_max_locks(DB_ENV *dbenv, u_int32_t max) {
    HANDLE_PANICKED_ENV(dbenv);
    if (env_opened(dbenv)) {
        return EINVAL;
    }
    return toku_ltm_set_max_locks(dbenv->i->ltm, max);
}

1347
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR <= 4
1348 1349
// Older-API spelling of toku_env_set_lk_max_locks(): simply delegates to it.
static int 
toku_env_set_lk_max(DB_ENV * env, u_int32_t lk_max) {
    int result = toku_env_set_lk_max_locks(env, lk_max);
    return result;
}
Rich Prohaska's avatar
Rich Prohaska committed
1352

1353 1354
// toku_env_set_lk_max() wrapped in the ydb lock.
static int 
locked_env_set_lk_max(DB_ENV * env, u_int32_t lk_max) {
    toku_ydb_lock();
    int r = toku_env_set_lk_max(env, lk_max);
    toku_ydb_unlock();
    return r;
}
1357
#endif
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1358

1359 1360
// Ask the environment's lock tree manager for its maximum number of locks
// (stored through lk_maxp by toku_ltm_get_max_locks()) and return that call's result.
static int 
toku_env_get_lk_max_locks(DB_ENV *dbenv, u_int32_t *lk_maxp) {
    HANDLE_PANICKED_ENV(dbenv);
    int result = toku_ltm_get_max_locks(dbenv->i->ltm, lk_maxp);
    return result;
}

1365 1366
// toku_env_set_lk_max_locks() wrapped in the ydb lock.
static int 
locked_env_set_lk_max_locks(DB_ENV *dbenv, u_int32_t max) {
    toku_ydb_lock();
    int r = toku_env_set_lk_max_locks(dbenv, max);
    toku_ydb_unlock();
    return r;
}

1370 1371
// toku_env_get_lk_max_locks() wrapped in the ydb lock.
static int 
locked_env_get_lk_max_locks(DB_ENV *dbenv, u_int32_t *lk_maxp) {
    toku_ydb_lock();
    int r = toku_env_get_lk_max_locks(dbenv, lk_maxp);
    toku_ydb_unlock();
    return r;
}

1375 1376
// Set the maximum lock memory on the environment's lock tree manager.
// Only allowed before the environment is opened: returns EINVAL afterwards.
// Otherwise returns the result of toku_ltm_set_max_lock_memory().
static int 
toku_env_set_lk_max_memory(DB_ENV *dbenv, uint64_t max) {
    HANDLE_PANICKED_ENV(dbenv);
    if (env_opened(dbenv)) {
        return EINVAL;
    }
    return toku_ltm_set_max_lock_memory(dbenv->i->ltm, max);
}

1384 1385
// Ask the environment's lock tree manager for its maximum lock memory (stored
// through lk_maxp by toku_ltm_get_max_lock_memory()) and return that call's result.
static int 
toku_env_get_lk_max_memory(DB_ENV *dbenv, uint64_t *lk_maxp) {
    HANDLE_PANICKED_ENV(dbenv);
    int result = toku_ltm_get_max_lock_memory(dbenv->i->ltm, lk_maxp);
    return result;
}

1390 1391 1392 1393 1394 1395
// toku_env_set_lk_max_memory() wrapped in the ydb lock.
static int 
locked_env_set_lk_max_memory(DB_ENV *dbenv, uint64_t max) {
    int result;

    toku_ydb_lock();
    result = toku_env_set_lk_max_memory(dbenv, max);
    toku_ydb_unlock();

    return result;
}

// Runs toku_env_get_lk_max_memory() while holding the ydb lock and returns its result.
static int locked_env_get_lk_max_memory(DB_ENV *dbenv, uint64_t *lk_maxp) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_get_lk_max_memory(dbenv, lk_maxp);
    toku_ydb_unlock();
    return rval;
}

Yoni Fogel's avatar
Yoni Fogel committed
1402
//void toku__env_set_noticecall (DB_ENV *env, void (*noticecall)(DB_ENV *, db_notices)) {
Bradley C. Kuszmaul's avatar
Fixup  
Bradley C. Kuszmaul committed
1403 1404
//    env->i->noticecall = noticecall;
//}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1405

1406 1407
// Records a private copy of tmp_dir as the environment's temporary directory.
//
// Must be called before the env is opened, and tmp_dir must be non-NULL;
// otherwise the error is reported through toku_ydb_do_error() with EINVAL
// and that call's result is returned.  Any previously stored directory
// string is freed before the new copy is made.
//
// Returns 0 on success, or ENOMEM if toku_strdup() could not allocate the copy.
static int 
toku_env_set_tmp_dir(DB_ENV * env, const char *tmp_dir) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return toku_ydb_do_error(env, EINVAL, "Cannot set the tmp dir after opening an env\n");
    }
    if (!tmp_dir) {
        return toku_ydb_do_error(env, EINVAL, "Tmp dir must be non-null\n");
    }
    if (env->i->tmp_dir)
        toku_free(env->i->tmp_dir);
    env->i->tmp_dir = toku_strdup(tmp_dir);
    return env->i->tmp_dir ? 0 : ENOMEM;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1420

1421 1422
// Placeholder for DB_ENV->set_verbose: both arguments are ignored and,
// after the HANDLE_PANICKED_ENV check, the function always returns 1.
static int 
toku_env_set_verbose(DB_ENV * env, u_int32_t which, int onoff) {
    HANDLE_PANICKED_ENV(env);
    (void) which;
    (void) onoff;
    return 1;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
1427

1428
// For test purposes only.
1429 1430
// These callbacks are never used in production code, only as a way to test the system
// (for example, by causing crashes at predictable times).
1431 1432
// First test callback and its argument; passed to toku_checkpoint() by toku_env_txn_checkpoint().
// Both default to NULL.
static void (*checkpoint_callback_f)(void*) = NULL;
static void * checkpoint_callback_extra     = NULL;
1433 1434
// Second test callback and its argument; passed to toku_checkpoint() by toku_env_txn_checkpoint().
// Both default to NULL.
static void (*checkpoint_callback2_f)(void*) = NULL;
static void * checkpoint_callback2_extra     = NULL;
1435

1436 1437
// Takes a checkpoint of the env's cachetable and logger via toku_checkpoint(),
// handing it the two file-level test callbacks.  kbyte, min and flags are unused.
// On failure the env is panicked, the error is reported, and the
// toku_checkpoint() error code is returned; on success returns 0.
static int 
toku_env_txn_checkpoint(DB_ENV * env, u_int32_t kbyte __attribute__((__unused__)), u_int32_t min __attribute__((__unused__)), u_int32_t flags __attribute__((__unused__))) {
    int rval = toku_checkpoint(env->i->cachetable, env->i->logger,
                               checkpoint_callback_f,  checkpoint_callback_extra,
                               checkpoint_callback2_f, checkpoint_callback2_extra);
    if (rval == 0) {
        return 0;
    }
    // Panicking the whole environment may be overkill, but I'm not sure what else to do.
    env_panic(env, rval, "checkpoint error\n");
    toku_ydb_do_error(env, rval, "Checkpoint\n");
    return rval;
}

1449 1450
// Placeholder for DB_ENV->txn_stat: statp and flags are ignored and,
// after the HANDLE_PANICKED_ENV check, the function always returns 1.
static int 
toku_env_txn_stat(DB_ENV * env, DB_TXN_STAT ** statp, u_int32_t flags) {
    HANDLE_PANICKED_ENV(env);
    (void) statp;
    (void) flags;
    return 1;
}

1456 1457
// Runs toku_env_open() while holding the ydb lock and returns its result.
static int 
locked_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_open(env, home, flags, mode);
    toku_ydb_unlock();
    return rval;
}

1461 1462
// Runs toku_env_close() while holding the ydb lock and returns its result.
static int 
locked_env_close(DB_ENV * env, u_int32_t flags) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_close(env, flags);
    toku_ydb_unlock();
    return rval;
}

1466 1467
// Runs toku_env_log_archive() while holding the ydb lock and returns its result.
static int 
locked_env_log_archive(DB_ENV * env, char **list[], u_int32_t flags) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_log_archive(env, list, flags);
    toku_ydb_unlock();
    return rval;
}

1471 1472
// Runs toku_env_log_flush() while holding the ydb lock and returns its result.
static int 
locked_env_log_flush(DB_ENV * env, const DB_LSN * lsn) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_log_flush(env, lsn);
    toku_ydb_unlock();
    return rval;
}

1476 1477
// Runs toku_env_set_cachesize() while holding the ydb lock and returns its result.
static int 
locked_env_set_cachesize(DB_ENV *env, u_int32_t gbytes, u_int32_t bytes, int ncache) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_cachesize(env, gbytes, bytes, ncache);
    toku_ydb_unlock();
    return rval;
}

1481 1482
// Runs toku_env_set_data_dir() while holding the ydb lock and returns its result.
static int 
locked_env_set_data_dir(DB_ENV * env, const char *dir) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_data_dir(env, dir);
    toku_ydb_unlock();
    return rval;
}

1486 1487
// Runs toku_env_set_flags() while holding the ydb lock and returns its result.
static int 
locked_env_set_flags(DB_ENV * env, u_int32_t flags, int onoff) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_flags(env, flags, onoff);
    toku_ydb_unlock();
    return rval;
}

1491 1492
// Runs toku_env_set_lg_bsize() while holding the ydb lock and returns its result.
static int 
locked_env_set_lg_bsize(DB_ENV * env, u_int32_t bsize) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_lg_bsize(env, bsize);
    toku_ydb_unlock();
    return rval;
}

1496 1497
// Runs toku_env_set_lg_dir() while holding the ydb lock and returns its result.
static int 
locked_env_set_lg_dir(DB_ENV * env, const char *dir) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_lg_dir(env, dir);
    toku_ydb_unlock();
    return rval;
}

1501 1502
// Runs toku_env_set_lg_max() while holding the ydb lock and returns its result.
static int 
locked_env_set_lg_max(DB_ENV * env, u_int32_t lg_max) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_lg_max(env, lg_max);
    toku_ydb_unlock();
    return rval;
}

1506 1507
// Runs toku_env_get_lg_max() while holding the ydb lock and returns its result.
static int 
locked_env_get_lg_max(DB_ENV * env, u_int32_t *lg_maxp) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_get_lg_max(env, lg_maxp);
    toku_ydb_unlock();
    return rval;
}

1511 1512
// Runs toku_env_set_lk_detect() while holding the ydb lock and returns its result.
static int 
locked_env_set_lk_detect(DB_ENV * env, u_int32_t detect) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_lk_detect(env, detect);
    toku_ydb_unlock();
    return rval;
}

1516 1517
// Runs toku_env_set_tmp_dir() while holding the ydb lock and returns its result.
static int 
locked_env_set_tmp_dir(DB_ENV * env, const char *tmp_dir) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_tmp_dir(env, tmp_dir);
    toku_ydb_unlock();
    return rval;
}

1521 1522
// Runs toku_env_set_verbose() while holding the ydb lock and returns its result.
static int 
locked_env_set_verbose(DB_ENV * env, u_int32_t which, int onoff) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_set_verbose(env, which, onoff);
    toku_ydb_unlock();
    return rval;
}

1526 1527
// Runs toku_env_txn_stat() while holding the ydb lock and returns its result.
static int 
locked_env_txn_stat(DB_ENV * env, DB_TXN_STAT ** statp, u_int32_t flags) {
    int rval;
    toku_ydb_lock();
    rval = toku_env_txn_stat(env, statp, flags);
    toku_ydb_unlock();
    return rval;
}

1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545
// Sets the checkpoint period (in seconds) on the env's cachetable.
// Requires an opened env: returns EINVAL when env_opened() is false,
// otherwise the result of toku_set_checkpoint_period().
static int
env_checkpointing_set_period(DB_ENV * env, u_int32_t seconds) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    return toku_set_checkpoint_period(env->i->cachetable, seconds);
}

// Runs env_checkpointing_set_period() while holding the ydb lock and returns its result.
static int
locked_env_checkpointing_set_period(DB_ENV * env, u_int32_t seconds) {
    int rval;
    toku_ydb_lock();
    rval = env_checkpointing_set_period(env, seconds);
    toku_ydb_unlock();
    return rval;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575
// Sets the cleaner period (in seconds) on the env's cachetable.
// Requires an opened env: returns EINVAL when env_opened() is false,
// otherwise the result of toku_set_cleaner_period().
static int
env_cleaner_set_period(DB_ENV * env, u_int32_t seconds) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    return toku_set_cleaner_period(env->i->cachetable, seconds);
}

// Runs env_cleaner_set_period() while holding the ydb lock and returns its result.
static int
locked_env_cleaner_set_period(DB_ENV * env, u_int32_t seconds) {
    int rval;
    toku_ydb_lock();
    rval = env_cleaner_set_period(env, seconds);
    toku_ydb_unlock();
    return rval;
}

// Sets the cleaner iteration count on the env's cachetable.
// Requires an opened env: returns EINVAL when env_opened() is false,
// otherwise the result of toku_set_cleaner_iterations().
static int
env_cleaner_set_iterations(DB_ENV * env, u_int32_t iterations) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    return toku_set_cleaner_iterations(env->i->cachetable, iterations);
}

// Runs env_cleaner_set_iterations() while holding the ydb lock and returns its result.
static int
locked_env_cleaner_set_iterations(DB_ENV * env, u_int32_t iterations) {
    int rval;
    toku_ydb_lock();
    rval = env_cleaner_set_iterations(env, iterations);
    toku_ydb_unlock();
    return rval;
}

Dave Wells's avatar
Dave Wells committed
1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590
// Runs toku_indexer_create_indexer() while holding the ydb lock, forwarding
// every argument unchanged, and returns its result.
static int
locked_env_create_indexer(DB_ENV *env,
                          DB_TXN *txn,
                          DB_INDEXER **indexerp,
                          DB *src_db,
                          int N,
                          DB *dest_dbs[N],
                          uint32_t db_flags[N],
                          uint32_t indexer_flags) {
    int rval;
    toku_ydb_lock();
    rval = toku_indexer_create_indexer(env, txn, indexerp, src_db,
                                       N, dest_dbs, db_flags, indexer_flags);
    toku_ydb_unlock();
    return rval;
}

1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607
// Runs toku_loader_create_loader() while holding the ydb lock, forwarding
// every argument unchanged, and returns its result.
static int
locked_env_create_loader(DB_ENV *env,
                         DB_TXN *txn, 
                         DB_LOADER **blp, 
                         DB *src_db, 
                         int N, 
                         DB *dbs[], 
                         uint32_t db_flags[N], 
                         uint32_t dbt_flags[N], 
                         uint32_t loader_flags) {
    int rval;
    toku_ydb_lock();
    rval = toku_loader_create_loader(env, txn, blp, src_db,
                                     N, dbs, db_flags, dbt_flags, loader_flags);
    toku_ydb_unlock();
    return rval;
}


Dave Wells's avatar
Dave Wells committed
1608

1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623
// Reads the cachetable's checkpoint period into *seconds.
// Requires an opened env: returns EINVAL (leaving *seconds untouched) when
// env_opened() is false, otherwise 0.
static int
env_checkpointing_get_period(DB_ENV * env, u_int32_t *seconds) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    *seconds = toku_get_checkpoint_period(env->i->cachetable);
    return 0;
}

// Runs env_checkpointing_get_period() while holding the ydb lock and returns its result.
static int
locked_env_checkpointing_get_period(DB_ENV * env, u_int32_t *seconds) {
    int rval;
    toku_ydb_lock();
    rval = env_checkpointing_get_period(env, seconds);
    toku_ydb_unlock();
    return rval;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653
// Reads the cachetable's cleaner period into *seconds.
// Requires an opened env: returns EINVAL (leaving *seconds untouched) when
// env_opened() is false, otherwise 0.
static int
env_cleaner_get_period(DB_ENV * env, u_int32_t *seconds) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    *seconds = toku_get_cleaner_period(env->i->cachetable);
    return 0;
}

// Runs env_cleaner_get_period() while holding the ydb lock and returns its result.
static int
locked_env_cleaner_get_period(DB_ENV * env, u_int32_t *seconds) {
    int rval;
    toku_ydb_lock();
    rval = env_cleaner_get_period(env, seconds);
    toku_ydb_unlock();
    return rval;
}

// Reads the cachetable's cleaner iteration count into *iterations.
// Requires an opened env: returns EINVAL (leaving *iterations untouched)
// when env_opened() is false, otherwise 0.
static int
env_cleaner_get_iterations(DB_ENV * env, u_int32_t *iterations) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    *iterations = toku_get_cleaner_iterations(env->i->cachetable);
    return 0;
}

// Runs env_cleaner_get_iterations() while holding the ydb lock and returns its result.
static int
locked_env_cleaner_get_iterations(DB_ENV * env, u_int32_t *iterations) {
    int rval;
    toku_ydb_lock();
    rval = env_cleaner_get_iterations(env, iterations);
    toku_ydb_unlock();
    return rval;
}

1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708
// Calls toku_checkpoint_safe_client_lock() on behalf of the client.
// Requires an opened env: returns EINVAL when env_opened() is false, else 0.
static int
env_checkpointing_postpone(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_checkpoint_safe_client_lock();
    return 0;
}

// Calls toku_checkpoint_safe_client_unlock() on behalf of the client.
// Requires an opened env: returns EINVAL when env_opened() is false, else 0.
static int
env_checkpointing_resume(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_checkpoint_safe_client_unlock();
    return 0;
}

// Calls toku_multi_operation_client_lock() on behalf of the client.
// Requires an opened env: returns EINVAL when env_opened() is false, else 0.
static int
env_checkpointing_begin_atomic_operation(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_multi_operation_client_lock();
    return 0;
}

// Calls toku_multi_operation_client_unlock() on behalf of the client.
// Requires an opened env: returns EINVAL when env_opened() is false, else 0.
static int
env_checkpointing_end_atomic_operation(DB_ENV * env) {
    HANDLE_PANICKED_ENV(env);
    if (!env_opened(env)) {
        return EINVAL;
    }
    toku_multi_operation_client_unlock();
    return 0;
}

// Stores bt_compare in the env as its comparison function.
// Only allowed before the env is opened: returns EINVAL once env_opened()
// is true, otherwise stores the pointer and returns 0.
static int
env_set_default_bt_compare(DB_ENV * env, int (*bt_compare) (DB *, const DBT *, const DBT *)) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return EINVAL;
    }
    env->i->bt_compare = bt_compare;
    return 0;
}

// Runs env_set_default_bt_compare() while holding the ydb lock and returns its result.
static int
locked_env_set_default_bt_compare(DB_ENV * env, int (*bt_compare) (DB *, const DBT *, const DBT *)) {
    int rval;

    toku_ydb_lock();
    rval = env_set_default_bt_compare(env, bt_compare);
    toku_ydb_unlock();

    return rval;
}

1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722
// Stores update_function in the env (env->i->update_function).
// Unlike the neighbouring setters, this performs no panicked/opened checks
// and reports no status.
static void
env_set_update (DB_ENV *env, int (*update_function)(DB *, const DBT *key, const DBT *old_val, const DBT *extra, void (*set_val)(const DBT *new_val, void *set_extra), void *set_extra)) {
    env->i->update_function = update_function;
}

// Runs env_set_update() while holding the ydb lock.
static void
locked_env_set_update (DB_ENV *env, int (*update_function)(DB *, const DBT *key, const DBT *old_val, const DBT *extra, void (*set_val)(const DBT *new_val, void *set_extra), void *set_extra)) {
    toku_ydb_lock();
    {
        env_set_update(env, update_function);
    }
    toku_ydb_unlock();
}



1723
static int
1724
env_set_generate_row_callback_for_put(DB_ENV *env, generate_row_for_put_func generate_row_for_put) {
1725 1726 1727 1728
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (env_opened(env)) r = EINVAL;
    else {
1729
        env->i->generate_row_for_put = generate_row_for_put;
1730 1731 1732 1733 1734
    }
    return r;
}

static int
1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746
env_set_generate_row_callback_for_del(DB_ENV *env, generate_row_for_del_func generate_row_for_del) {
    HANDLE_PANICKED_ENV(env);
    int r = 0;
    if (env_opened(env)) r = EINVAL;
    else {
        env->i->generate_row_for_del = generate_row_for_del;
    }
    return r;
}

// Runs env_set_generate_row_callback_for_put() while holding the ydb lock and returns its result.
static int
locked_env_set_generate_row_callback_for_put(DB_ENV *env, generate_row_for_put_func generate_row_for_put) {
    int rval;

    toku_ydb_lock();
    rval = env_set_generate_row_callback_for_put(env, generate_row_for_put);
    toku_ydb_unlock();

    return rval;
}

1753 1754 1755 1756 1757 1758 1759 1760
// Runs env_set_generate_row_callback_for_del() while holding the ydb lock and returns its result.
static int
locked_env_set_generate_row_callback_for_del(DB_ENV *env, generate_row_for_del_func generate_row_for_del) {
    int rval;

    toku_ydb_lock();
    rval = env_set_generate_row_callback_for_del(env, generate_row_for_del);
    toku_ydb_unlock();

    return rval;
}

1761
static int env_put_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, 
1762
                            const DBT *src_key, const DBT *src_val, 
1763
                            uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array);
1764 1765

static int env_del_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, 
1766
                            const DBT *src_key, const DBT *src_val, 
1767
                            uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array);
1768 1769

static int env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, 
1770 1771 1772
                               DBT *old_src_key, DBT *old_src_data,
                               DBT *new_src_key, DBT *new_src_data,
                               uint32_t num_dbs, DB **db_array, uint32_t* flags_array, 
1773
                               uint32_t num_keys, DBT *keys, 
1774
                               uint32_t num_vals, DBT *vals);
1775 1776

static int
1777
locked_env_put_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *src_key, const DBT *src_val, uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array) {
1778 1779 1780
    int r = env_check_avail_fs_space(env);
    if (r == 0) {
	toku_ydb_lock();
1781
	r = env_put_multiple(env, src_db, txn, src_key, src_val, num_dbs, db_array, keys, vals, flags_array);
1782 1783
	toku_ydb_unlock();
    }
1784 1785 1786 1787
    return r;
}

static int
1788
locked_env_del_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *src_key, const DBT *src_val, uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array) {
1789
    toku_ydb_lock();
1790
    int r = env_del_multiple(env, src_db, txn, src_key, src_val, num_dbs, db_array, keys, flags_array);
1791 1792 1793 1794
    toku_ydb_unlock();
    return r;
}

1795 1796
// Runs env_update_multiple() while holding the ydb lock, forwarding every
// argument unchanged, and returns its result.
static int
locked_env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn,                                
                           DBT *old_src_key, DBT *old_src_data,
                           DBT *new_src_key, DBT *new_src_data,
                           uint32_t num_dbs, DB **db_array, uint32_t* flags_array, 
                           uint32_t num_keys, DBT *keys, 
                           uint32_t num_vals, DBT *vals) {
    int rval;

    toku_ydb_lock();
    rval = env_update_multiple(env, src_db, txn,
                               old_src_key, old_src_data,
                               new_src_key, new_src_data,
                               num_dbs, db_array, flags_array,
                               num_keys, keys,
                               num_vals, vals);
    toku_ydb_unlock();

    return rval;
}
1807

1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828
// Stores redzone in the env (env->i->redzone).
// Only allowed before the env is opened: returns EINVAL once env_opened()
// is true, otherwise stores the value and returns 0.
static int
env_set_redzone(DB_ENV *env, int redzone) {
    HANDLE_PANICKED_ENV(env);
    if (env_opened(env)) {
        return EINVAL;
    }
    env->i->redzone = redzone;
    return 0;
}

// Runs env_set_redzone() while holding the ydb lock and returns its result.
static int 
locked_env_set_redzone(DB_ENV *env, int redzone) {
    int rval;

    toku_ydb_lock();
    rval = env_set_redzone(env, redzone);
    toku_ydb_unlock();

    return rval;
}

1829
static int
1830 1831
env_get_lock_timeout(DB_ENV *env, uint64_t *lock_timeout_msec) {
    toku_ltm_get_lock_wait_time(env->i->ltm, lock_timeout_msec);
1832 1833 1834 1835
    return 0;
}

static int
1836
locked_env_get_lock_timeout(DB_ENV *env, uint64_t *lock_timeout_msec) {
1837
    toku_ydb_lock();
1838
    int r = env_get_lock_timeout(env, lock_timeout_msec);
1839 1840 1841 1842 1843
    toku_ydb_unlock();
    return r;
}

static int
1844 1845
env_set_lock_timeout(DB_ENV *env, uint64_t lock_timeout_msec) {
    toku_ltm_set_lock_wait_time(env->i->ltm, lock_timeout_msec);
1846 1847 1848 1849
    return 0;
}

static int
1850
locked_env_set_lock_timeout(DB_ENV *env, uint64_t lock_timeout_msec) {
1851
    toku_ydb_lock();
1852
    int r = env_set_lock_timeout(env, lock_timeout_msec);
1853 1854 1855 1856
    toku_ydb_unlock();
    return r;
}

1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872
// Formats *timer with ctime_r() into buf and strips the trailing line break.
//
// buf must provide at least 26 bytes, the size ctime_r() requires.
// If ctime_r() fails it returns NULL and the contents of buf cannot be
// relied on; buf is then set to the empty string so that callers always
// receive a valid NUL-terminated string.
static void
format_time(const time_t *timer, char *buf) {
    if (ctime_r(timer, buf) == NULL) {
        buf[0] = '\0';
        return;
    }
    size_t len = strlen(buf);
    assert(len < 26);
    // Drop trailing '\n' / '\r'.  The len > 0 bound keeps the index valid
    // even when assertions are compiled out.
    while (len > 0 && (buf[len-1] == '\n' || buf[len-1] == '\r')) {
        buf[--len] = '\0';
    }
}
1873

1874 1875
// Do not take ydb lock or any other lock around or in this function.  
// If the engine is blocked because some thread is holding a lock, this function
1876 1877 1878
// can help diagnose the problem.
// This function only collects information, and it does not matter if something gets garbled
// because of a race condition.  
1879
// Note, engine status is still collected even if the environment or logger is panicked
1880
static int
1881 1882 1883
env_get_engine_status(DB_ENV * env, ENGINE_STATUS * engstat, char * env_panic_string_buf, int env_panic_string_length) {
    int r;
    if (env_panic_string_buf) {
1884
	if (env && env->i && env->i->is_panicked && env->i->panic_string) {
1885
	    strncpy(env_panic_string_buf, env->i->panic_string, env_panic_string_length);
1886 1887
	    env_panic_string_buf[env_panic_string_length - 1] = '\0';  // just in case
	}
1888 1889 1890 1891
	else 
	    *env_panic_string_buf = '\0';
    }

1892 1893 1894 1895
    if ( !(env)     || 
	 !(env->i)  || 
	 !(env_opened(env)) )
	r = EINVAL;
1896
    else {
1897 1898
	r = 0;
	engstat->env_panic = env->i->is_panicked;
1899
	format_time(&persistent_creation_time, engstat->creationtime);
1900 1901
	time_t now = time(NULL);
        format_time(&now, engstat->now);
1902
        format_time(&startuptime, engstat->startuptime);
1903 1904 1905
	{
	    SCHEDULE_STATUS_S schedstat;
	    toku_ydb_lock_get_status(&schedstat);
1906 1907 1908 1909 1910 1911 1912 1913
	    engstat->ydb_lock_ctr             = schedstat.ydb_lock_ctr;             /* How many times has ydb lock been taken/released?                                                                      */ 
	    engstat->num_waiters_now          = schedstat.num_waiters_now;          /* How many are waiting on on the ydb lock right now (including the current lock holder, if any)?                        */
	    engstat->max_waiters              = schedstat.max_waiters;              /* The maxium of num_waiters_now (since the system booted).                                                              */ 
	    engstat->total_sleep_time         = schedstat.total_sleep_time;         /* The total time spent (since the system booted) sleeping (by the indexer) to give foreground threads a chance to work .*/ 
	    engstat->max_time_ydb_lock_held   = schedstat.max_time_ydb_lock_held;   /* Maximum time that the ydb lock was held.                                                                              */ 
	    engstat->total_time_ydb_lock_held = schedstat.total_time_ydb_lock_held; /* Total time client threads held the ydb lock                                                                           */ 
	    engstat->total_time_since_start   = schedstat.total_time_since_start;   /* Total time since the lock was created.  Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio.   */

1914
	}
1915 1916 1917 1918 1919 1920 1921 1922
        {
	    LE_STATUS_S lestat;                    // Rice's vampire
	    toku_le_get_status(&lestat);
            engstat->le_max_committed_xr    = lestat.max_committed_xr;
            engstat->le_max_provisional_xr  = lestat.max_provisional_xr;
            engstat->le_expanded            = lestat.expanded;
            engstat->le_max_memsize         = lestat.max_memsize;
        }
1923
	engstat->checkpoint_period = toku_get_checkpoint_period_unlocked(env->i->cachetable);  // do not take any locks (not even minicron lock)
1924 1925 1926 1927 1928 1929 1930
	{
            CHECKPOINT_STATUS_S cpstat;
            toku_checkpoint_get_status(&cpstat);
            engstat->checkpoint_footprint = cpstat.footprint;
	    format_time(&cpstat.time_last_checkpoint_begin_complete, engstat->checkpoint_time_begin_complete);
	    format_time(&cpstat.time_last_checkpoint_begin,          engstat->checkpoint_time_begin);
	    format_time(&cpstat.time_last_checkpoint_end,            engstat->checkpoint_time_end);
1931 1932 1933 1934
	    engstat->checkpoint_last_lsn   = cpstat.last_lsn;
	    engstat->checkpoint_count      = cpstat.checkpoint_count;
	    engstat->checkpoint_count_fail = cpstat.checkpoint_count_fail;
	}
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1935 1936
        engstat->cleaner_period = toku_get_cleaner_period_unlocked(env->i->cachetable);
        engstat->cleaner_iterations = toku_get_cleaner_iterations_unlocked(env->i->cachetable);
1937 1938 1939 1940 1941 1942 1943
	{
	    TXN_STATUS_S txnstat;
	    toku_txn_get_status(&txnstat);
	    engstat->txn_begin   = txnstat.begin;
	    engstat->txn_commit  = txnstat.commit;
	    engstat->txn_abort   = txnstat.abort;
	    engstat->txn_close   = txnstat.close;
1944 1945
	    engstat->txn_num_open = txnstat.num_open;
	    engstat->txn_max_open = txnstat.max_open;
1946 1947
	    {
		uint64_t oldest_xid = 0;
1948
                time_t   oldest_starttime = 0;
1949 1950 1951
		uint64_t next_lsn   = 0;
		TOKULOGGER logger = env->i->logger;
		if (logger) {
1952
		    oldest_xid = toku_logger_get_oldest_living_xid(env->i->logger, &oldest_starttime);
1953 1954 1955 1956
		    next_lsn   = (toku_logger_get_next_lsn(env->i->logger)).lsn;
		}
		engstat->txn_oldest_live = oldest_xid;
		engstat->next_lsn = next_lsn;
1957
                format_time(&oldest_starttime, engstat->txn_oldest_live_starttime);
1958
	    }
1959
	}
1960 1961 1962
	{
	    CACHETABLE_STATUS_S ctstat;
	    toku_cachetable_get_status(env->i->cachetable, &ctstat);
1963 1964 1965 1966 1967 1968 1969 1970
	    engstat->cachetable_lock_taken    = ctstat.lock_taken;
	    engstat->cachetable_lock_released = ctstat.lock_released;
	    engstat->cachetable_hit           = ctstat.hit;
	    engstat->cachetable_miss          = ctstat.miss;
	    engstat->cachetable_misstime      = ctstat.misstime;
	    engstat->cachetable_waittime      = ctstat.waittime;
	    engstat->cachetable_wait_reading  = ctstat.wait_reading;
	    engstat->cachetable_wait_writing  = ctstat.wait_writing;
1971
	    engstat->cachetable_wait_checkpoint = ctstat.wait_checkpoint;
1972 1973 1974 1975 1976 1977
	    engstat->puts                     = ctstat.puts;
	    engstat->prefetches               = ctstat.prefetches;
	    engstat->maybe_get_and_pins       = ctstat.maybe_get_and_pins;
	    engstat->maybe_get_and_pin_hits   = ctstat.maybe_get_and_pin_hits;
	    engstat->cachetable_size_current  = ctstat.size_current;
	    engstat->cachetable_size_limit    = ctstat.size_limit;
1978
	    engstat->cachetable_size_max      = ctstat.size_max;
1979 1980
	    engstat->cachetable_size_writing  = ctstat.size_writing;
	    engstat->get_and_pin_footprint    = ctstat.get_and_pin_footprint;
1981 1982 1983
	    engstat->local_checkpoint         = ctstat.local_checkpoint;
	    engstat->local_checkpoint_files   = ctstat.local_checkpoint_files;
	    engstat->local_checkpoint_during_checkpoint = ctstat.local_checkpoint_during_checkpoint;
1984
            engstat->cachetable_evictions     = ctstat.evictions;
Leif Walsh's avatar
Leif Walsh committed
1985
            engstat->cleaner_executions       = ctstat.cleaner_executions;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
1986 1987 1988
            engstat->cachetable_size_leaf     = ctstat.size_leaf;
            engstat->cachetable_size_nonleaf  = ctstat.size_nonleaf;
            engstat->cachetable_size_rollback = ctstat.size_rollback;
1989
	}
1990 1991
	{
	    toku_ltm* ltm = env->i->ltm;
1992
	    LTM_STATUS_S ltmstat;
1993 1994 1995 1996 1997
	    uint32_t max_locks, curr_locks;
	    uint64_t max_lock_memory, curr_lock_memory;
	    toku_ltm_get_status(ltm, &max_locks, &curr_locks, 
				&max_lock_memory, &curr_lock_memory,
				&ltmstat);
1998 1999
	    engstat->range_locks_max                 = max_locks;
	    engstat->range_locks_curr                = curr_locks;
2000 2001
	    engstat->range_locks_max_memory          = max_lock_memory;
	    engstat->range_locks_curr_memory         = curr_lock_memory;
2002 2003
	    engstat->range_lock_escalation_successes = ltmstat.lock_escalation_successes;
	    engstat->range_lock_escalation_failures  = ltmstat.lock_escalation_failures;
2004 2005 2006 2007 2008 2009
	    engstat->range_read_locks                = ltmstat.read_lock;
	    engstat->range_read_locks_fail           = ltmstat.read_lock_fail;
	    engstat->range_out_of_read_locks         = ltmstat.out_of_read_locks;
	    engstat->range_write_locks               = ltmstat.write_lock;
	    engstat->range_write_locks_fail          = ltmstat.write_lock_fail;
	    engstat->range_out_of_write_locks        = ltmstat.out_of_write_locks;
2010 2011 2012 2013 2014
	    engstat->range_lt_create                 = ltmstat.lt_create;
	    engstat->range_lt_create_fail            = ltmstat.lt_create_fail;
	    engstat->range_lt_destroy                = ltmstat.lt_destroy;
	    engstat->range_lt_num                    = ltmstat.lt_num;
	    engstat->range_lt_num_max                = ltmstat.lt_num_max;
2015
	}
2016
	{
2017
     	    engstat->inserts            = num_inserts;
2018
	    engstat->inserts_fail       = num_inserts_fail;
2019
	    engstat->deletes            = num_deletes;
2020
	    engstat->deletes_fail       = num_deletes_fail;
2021 2022
	    engstat->updates            = num_updates;
	    engstat->updates_fail       = num_updates_fail;
2023 2024
	    engstat->updates_broadcast  = num_updates_broadcast;
	    engstat->updates_broadcast_fail  = num_updates_broadcast_fail;
2025 2026 2027 2028 2029 2030
     	    engstat->multi_inserts      = num_multi_inserts;
	    engstat->multi_inserts_fail = num_multi_inserts_fail;
	    engstat->multi_deletes      = num_multi_deletes;
	    engstat->multi_deletes_fail = num_multi_deletes_fail;
	    engstat->multi_updates      = num_multi_updates;
	    engstat->multi_updates_fail = num_multi_updates_fail;
2031 2032
	    engstat->point_queries      = num_point_queries;
	    engstat->sequential_queries = num_sequential_queries;
2033 2034 2035 2036
	    engstat->num_db_open        = num_db_open;
	    engstat->num_db_close       = num_db_close;
	    engstat->num_open_dbs       = num_open_dbs;
	    engstat->max_open_dbs       = max_open_dbs;
2037 2038 2039 2040
            engstat->directory_read_locks = directory_read_locks;
            engstat->directory_read_locks_fail = directory_read_locks_fail;
            engstat->directory_write_locks = directory_write_locks;
            engstat->directory_write_locks_fail = directory_write_locks_fail;
2041
	}
2042
	{
2043 2044 2045 2046 2047
	    BRT_STATUS_S brt_stat;
	    toku_brt_get_status(&brt_stat);
	    engstat->le_updates = brt_stat.updates;
	    engstat->le_updates_broadcast = brt_stat.updates_broadcast;
	    engstat->descriptor_set = brt_stat.descriptor_set;
2048 2049 2050
	    engstat->partial_fetch_hit = brt_stat.partial_fetch_hit;
	    engstat->partial_fetch_miss = brt_stat.partial_fetch_miss;
	    engstat->partial_fetch_compressed = brt_stat.partial_fetch_compressed;
2051 2052
	    engstat->partial_evictions_nonleaf = brt_stat.partial_evictions_nonleaf;
	    engstat->partial_evictions_leaf = brt_stat.partial_evictions_leaf;
2053 2054
	    engstat->msn_discards = brt_stat.msn_discards;
	    engstat->max_workdone = brt_stat.max_workdone;
2055 2056 2057 2058 2059 2060 2061
	    engstat->total_searches = brt_stat.total_searches;              
	    engstat->total_retries = brt_stat.total_retries;
	    engstat->max_search_excess_retries = brt_stat.max_search_excess_retries;
	    engstat->max_search_root_tries = brt_stat.max_search_root_tries;
	    engstat->search_root_retries = brt_stat.search_root_retries;
	    engstat->search_tries_gt_height = brt_stat.search_tries_gt_height;
	    engstat->search_tries_gt_heightplus3 = brt_stat.search_tries_gt_heightplus3;	    
2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072
	    engstat->cleaner_total_nodes = brt_stat.cleaner_total_nodes;
	    engstat->cleaner_h1_nodes = brt_stat.cleaner_h1_nodes;
	    engstat->cleaner_hgt1_nodes = brt_stat.cleaner_hgt1_nodes;
	    engstat->cleaner_empty_nodes = brt_stat.cleaner_empty_nodes;
	    engstat->cleaner_nodes_dirtied = brt_stat.cleaner_nodes_dirtied;
	    engstat->cleaner_max_buffer_size = brt_stat.cleaner_max_buffer_size;
	    engstat->cleaner_min_buffer_size = brt_stat.cleaner_min_buffer_size;
	    engstat->cleaner_total_buffer_size = brt_stat.cleaner_total_buffer_size;
	    engstat->cleaner_max_buffer_workdone = brt_stat.cleaner_max_buffer_workdone;
	    engstat->cleaner_min_buffer_workdone = brt_stat.cleaner_min_buffer_workdone;
	    engstat->cleaner_total_buffer_workdone = brt_stat.cleaner_total_buffer_workdone;
2073
            engstat->cleaner_num_leaves_unmerged = brt_stat.cleaner_num_leaves_unmerged;
Leif Walsh's avatar
Leif Walsh committed
2074 2075 2076 2077 2078 2079 2080 2081 2082 2083
            engstat->flush_total = brt_stat.flush_total;
            engstat->flush_in_memory = brt_stat.flush_in_memory;
            engstat->flush_needed_io = brt_stat.flush_needed_io;
            engstat->flush_cascades = brt_stat.flush_cascades;
            engstat->flush_cascades_1 = brt_stat.flush_cascades_1;
            engstat->flush_cascades_2 = brt_stat.flush_cascades_2;
            engstat->flush_cascades_3 = brt_stat.flush_cascades_3;
            engstat->flush_cascades_4 = brt_stat.flush_cascades_4;
            engstat->flush_cascades_5 = brt_stat.flush_cascades_5;
            engstat->flush_cascades_gt_5 = brt_stat.flush_cascades_gt_5;
2084 2085 2086 2087
            engstat->disk_flush_leaf = brt_stat.disk_flush_leaf; 
            engstat->disk_flush_nonleaf = brt_stat.disk_flush_nonleaf; 
            engstat->disk_flush_leaf_for_checkpoint = brt_stat.disk_flush_leaf_for_checkpoint; 
            engstat->disk_flush_nonleaf_for_checkpoint = brt_stat.disk_flush_nonleaf_for_checkpoint; 
2088 2089
            engstat->create_leaf = brt_stat.create_leaf;
            engstat->create_nonleaf = brt_stat.create_nonleaf;
2090 2091
            engstat->destroy_leaf = brt_stat.destroy_leaf;
            engstat->destroy_nonleaf = brt_stat.destroy_nonleaf;
2092 2093 2094 2095 2096 2097 2098
            engstat->split_leaf = brt_stat.split_leaf;
            engstat->split_nonleaf = brt_stat.split_nonleaf;
            engstat->merge_leaf = brt_stat.merge_leaf;
            engstat->merge_nonleaf = brt_stat.merge_nonleaf;
            engstat->dirty_leaf = brt_stat.dirty_leaf;
            engstat->dirty_nonleaf = brt_stat.dirty_nonleaf;
            engstat->balance_leaf = brt_stat.balance_leaf;
2099 2100 2101 2102 2103 2104
            engstat->msg_bytes_in = brt_stat.msg_bytes_in;
            engstat->msg_bytes_out = brt_stat.msg_bytes_out;
            engstat->msg_bytes_curr = brt_stat.msg_bytes_curr;
            engstat->msg_bytes_max = brt_stat.msg_bytes_max;
            engstat->msg_num = brt_stat.msg_num;
            engstat->msg_num_broadcast = brt_stat.msg_num_broadcast;
2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123
            engstat->num_basements_decompressed_normal = brt_stat.num_basements_decompressed_normal;
            engstat->num_basements_decompressed_aggressive = brt_stat.num_basements_decompressed_aggressive;
            engstat->num_basements_decompressed_prefetch = brt_stat.num_basements_decompressed_prefetch;
            engstat->num_basements_decompressed_write = brt_stat.num_basements_decompressed_write;
            engstat->num_msg_buffer_decompressed_normal = brt_stat.num_msg_buffer_decompressed_normal;
            engstat->num_msg_buffer_decompressed_aggressive = brt_stat.num_msg_buffer_decompressed_aggressive;
            engstat->num_msg_buffer_decompressed_prefetch = brt_stat.num_msg_buffer_decompressed_prefetch;
            engstat->num_msg_buffer_decompressed_write = brt_stat.num_msg_buffer_decompressed_write;
            engstat->num_pivots_fetched_query = brt_stat.num_pivots_fetched_query;
            engstat->num_pivots_fetched_prefetch = brt_stat.num_pivots_fetched_prefetch;
            engstat->num_pivots_fetched_write = brt_stat.num_pivots_fetched_write;
            engstat->num_basements_fetched_normal = brt_stat.num_basements_fetched_normal;
            engstat->num_basements_fetched_aggressive = brt_stat.num_basements_fetched_aggressive;
            engstat->num_basements_fetched_prefetch = brt_stat.num_basements_fetched_prefetch;
            engstat->num_basements_fetched_write = brt_stat.num_basements_fetched_write;
            engstat->num_msg_buffer_fetched_normal = brt_stat.num_msg_buffer_fetched_normal;
            engstat->num_msg_buffer_fetched_aggressive = brt_stat.num_msg_buffer_fetched_aggressive;
            engstat->num_msg_buffer_fetched_prefetch = brt_stat.num_msg_buffer_fetched_prefetch;
            engstat->num_msg_buffer_fetched_write = brt_stat.num_msg_buffer_fetched_write;
2124
	}
2125 2126 2127 2128 2129 2130
	{
	    u_int64_t fsync_count, fsync_time;
	    toku_get_fsync_times(&fsync_count, &fsync_time);
	    engstat->fsync_count = fsync_count;
	    engstat->fsync_time  = fsync_time;
	}
2131 2132 2133 2134 2135 2136 2137
	{
	    LOGGER_STATUS_S log_stat;
	    TOKULOGGER logger = env->i->logger;
	    toku_logger_get_status(logger, &log_stat);
	    engstat->logger_ilock_ctr = log_stat.ilock_ctr;
	    engstat->logger_olock_ctr = log_stat.olock_ctr;
	    engstat->logger_swap_ctr  = log_stat.swap_ctr;
2138 2139
	    engstat->logger_panic     = log_stat.panicked;
	    engstat->logger_panic_errno = log_stat.panic_errno;
2140
	}
2141 2142
	{
	    time_t    enospc_most_recent_timestamp;
2143 2144
	    u_int64_t enospc_threads_blocked, enospc_ctr;
	    toku_fs_get_write_info(&enospc_most_recent_timestamp, &enospc_threads_blocked, &enospc_ctr);
2145 2146
	    format_time(&enospc_most_recent_timestamp, engstat->enospc_most_recent);	    
	    engstat->enospc_threads_blocked = enospc_threads_blocked;
2147
	    engstat->enospc_ctr = enospc_ctr;
2148
	}
2149
	{
2150 2151 2152 2153 2154 2155 2156 2157 2158
	    engstat->enospc_redzone_ctr   = env->i->enospc_redzone_ctr;   // number of operations rejected by enospc prevention (red zone)
	    engstat->enospc_state         = env->i->fs_state;
	}
	{
	    LOADER_STATUS_S loader_stat;
	    toku_loader_get_status(&loader_stat);
	    engstat->loader_create         = loader_stat.create;
	    engstat->loader_create_fail    = loader_stat.create_fail;
	    engstat->loader_put            = loader_stat.put;
2159
	    engstat->loader_put_fail       = loader_stat.put_fail;
2160 2161 2162 2163 2164 2165 2166 2167
	    engstat->loader_close          = loader_stat.close;
	    engstat->loader_close_fail     = loader_stat.close_fail;
	    engstat->loader_abort          = loader_stat.abort;
	    engstat->loader_current        = loader_stat.current;
	    engstat->loader_max            = loader_stat.max;
	    
	    engstat->logsuppress     = logsuppress;
	    engstat->logsuppressfail = logsuppressfail;
2168
	}
2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181
	{
	    INDEXER_STATUS_S indexer_stat;
	    toku_indexer_get_status(&indexer_stat);
	    engstat->indexer_create         = indexer_stat.create;
	    engstat->indexer_create_fail    = indexer_stat.create_fail;
	    engstat->indexer_build          = indexer_stat.build;
	    engstat->indexer_build_fail     = indexer_stat.build_fail;
	    engstat->indexer_close          = indexer_stat.close;
	    engstat->indexer_close_fail     = indexer_stat.close_fail;
	    engstat->indexer_abort          = indexer_stat.abort;
	    engstat->indexer_current        = indexer_stat.current;
	    engstat->indexer_max            = indexer_stat.max;
	}
2182
	{
2183 2184
	    BRT_UPGRADE_STATUS_S brt_upgrade_stat;
	    toku_brt_get_upgrade_status(&brt_upgrade_stat);
2185 2186 2187 2188 2189
	    uint64_t upgrade_footprint  = toku_log_upgrade_get_footprint();
	    // The footprint of the upgrade (if any) performed during this open of the
	    // environment is reported in the six least significant decimal digits; the
	    // footprint of the upgrade performed when the environment was actually
	    // upgraded is reported in the more significant decimal digits.
2190
	    // If ver_at_startup == 13, then the footprint will have the same value in 
2191
	    // upper and lower digits.
2192 2193 2194 2195 2196
	    engstat->upgrade_env_status = (persistent_upgrade_v14_footprint * 1000000) + upgrade_footprint;
	    engstat->upgrade_header     = brt_upgrade_stat.header_13;
	    engstat->upgrade_nonleaf    = brt_upgrade_stat.nonleaf_13;
	    engstat->upgrade_leaf       = brt_upgrade_stat.leaf_13;
	    engstat->optimized_for_upgrade = brt_upgrade_stat.optimized_for_upgrade;
2197 2198
	    engstat->original_ver       = persistent_original_env_version;
	    engstat->ver_at_startup     = persistent_stored_env_version_at_startup;
2199 2200
	    engstat->last_lsn_v13       = persistent_last_lsn_of_v13;
	    format_time(&persistent_upgrade_v14_time, engstat->upgrade_v14_time);
2201
	}
2202 2203 2204
	{
	    MEMORY_STATUS_S memory_status;
	    toku_memory_get_status(&memory_status);
2205 2206 2207 2208 2209 2210 2211 2212 2213
	    engstat->malloc_count   = memory_status.malloc_count;
	    engstat->free_count     = memory_status.free_count;
	    engstat->realloc_count  = memory_status.realloc_count;
	    engstat->malloc_fail    = memory_status.malloc_fail;
	    engstat->realloc_fail   = memory_status.realloc_fail;
	    engstat->mem_requested  = memory_status.requested;
	    engstat->mem_used       = memory_status.used;
	    engstat->mem_freed      = memory_status.freed;
	    engstat->max_mem_in_use = memory_status.max_in_use;
2214
	    engstat->mallocator_version = memory_status.mallocator_version;
2215
	}
2216 2217 2218 2219
    }
    return r;
}

2220

2221
// Fill buff with text description of engine status up to bufsiz bytes.
2222 2223
// Intended for use by test programs that do not have the handlerton available,
// and for use by toku_assert logic to print diagnostic info on crash.
2224 2225 2226
static int
env_get_engine_status_text(DB_ENV * env, char * buff, int bufsiz) {
    ENGINE_STATUS engstat;
2227
    uint32_t stringsize = 1024;
2228
    char panicstring[stringsize];
2229 2230
    int n = 0;  // number of characters printed so far

2231 2232
    n = snprintf(buff, bufsiz - n, "BUILD_ID = %d\n", BUILD_ID);

2233 2234 2235
    int r = env_get_engine_status(env, &engstat, panicstring, stringsize);    

    if (strlen(panicstring)) {
2236 2237
        invariant(strlen(panicstring) <= stringsize);
        n += snprintf(buff + n, bufsiz - n, "Env panic: %s\n", panicstring);
2238 2239
    }

2240
    if (r) {
2241
        n += snprintf(buff + n, bufsiz - n, "Engine status not available: ");
2242
	if (!env) {
2243
        n += snprintf(buff + n, bufsiz - n, "no environment\n");
2244 2245
	}
	else if (!(env->i)) {
2246
        n += snprintf(buff + n, bufsiz - n, "environment internal struct is null\n");
2247 2248 2249 2250 2251 2252
	}
	else if (!env_opened(env)) {
	    n += snprintf(buff + n, bufsiz - n, "environment is not open\n");
	}
    }
    else {
2253
	n += snprintf(buff + n, bufsiz - n, "env panic                        %"PRIu64"\n", engstat.env_panic);
2254 2255 2256
	n += snprintf(buff + n, bufsiz - n, "creationtime                     %s \n",       engstat.creationtime);
	n += snprintf(buff + n, bufsiz - n, "startuptime                      %s \n",       engstat.startuptime);
	n += snprintf(buff + n, bufsiz - n, "now                              %s \n",       engstat.now);
2257
	n += snprintf(buff + n, bufsiz - n, "ydb_lock_ctr                     %"PRIu64"\n", engstat.ydb_lock_ctr);
2258 2259
	n += snprintf(buff + n, bufsiz - n, "num_waiters_now                  %"PRIu64"\n", engstat.num_waiters_now);
	n += snprintf(buff + n, bufsiz - n, "max_waiters                      %"PRIu64"\n", engstat.max_waiters);
2260
	n += snprintf(buff + n, bufsiz - n, "total_sleep_time                 %"PRIu64"\n", engstat.total_sleep_time);
2261 2262
	n += snprintf(buff + n, bufsiz - n, "max_time_ydb_lock_held           %.6f\n",      tokutime_to_seconds(engstat.max_time_ydb_lock_held));
	n += snprintf(buff + n, bufsiz - n, "total_time_ydb_lock_held         %.6f\n",      tokutime_to_seconds(engstat.total_time_ydb_lock_held));
2263
	n += snprintf(buff + n, bufsiz - n, "total_time_since_start           %.6f\n",      tokutime_to_seconds(engstat.total_time_since_start));
2264 2265 2266 2267
	n += snprintf(buff + n, bufsiz - n, "le_max_committed_xr              %"PRIu64"\n", engstat.le_max_committed_xr);
	n += snprintf(buff + n, bufsiz - n, "le_max_provisional_xr            %"PRIu64"\n", engstat.le_max_provisional_xr);
	n += snprintf(buff + n, bufsiz - n, "le_expanded                      %"PRIu64"\n", engstat.le_expanded);
	n += snprintf(buff + n, bufsiz - n, "le_max_memsize                   %"PRIu64"\n", engstat.le_max_memsize);
2268 2269 2270 2271 2272
	n += snprintf(buff + n, bufsiz - n, "checkpoint_period                %"PRIu64"\n", engstat.checkpoint_period);
	n += snprintf(buff + n, bufsiz - n, "checkpoint_footprint             %"PRIu64"\n", engstat.checkpoint_footprint);
	n += snprintf(buff + n, bufsiz - n, "checkpoint_time_begin            %s \n",       engstat.checkpoint_time_begin);
	n += snprintf(buff + n, bufsiz - n, "checkpoint_time_begin_complete   %s \n",       engstat.checkpoint_time_begin_complete);
	n += snprintf(buff + n, bufsiz - n, "checkpoint_time_end              %s \n",       engstat.checkpoint_time_end);
2273
	n += snprintf(buff + n, bufsiz - n, "checkpoint_last_lsn              %"PRIu64"\n", engstat.checkpoint_last_lsn);
2274 2275 2276 2277
	n += snprintf(buff + n, bufsiz - n, "checkpoint_count                 %"PRIu64"\n", engstat.checkpoint_count);
	n += snprintf(buff + n, bufsiz - n, "checkpoint_count_fail            %"PRIu64"\n", engstat.checkpoint_count_fail);
	n += snprintf(buff + n, bufsiz - n, "cleaner_period                   %"PRIu64"\n", engstat.cleaner_period);
	n += snprintf(buff + n, bufsiz - n, "cleaner_iterations               %"PRIu64"\n", engstat.cleaner_iterations);
2278 2279 2280 2281
	n += snprintf(buff + n, bufsiz - n, "txn_begin                        %"PRIu64"\n", engstat.txn_begin);
	n += snprintf(buff + n, bufsiz - n, "txn_commit                       %"PRIu64"\n", engstat.txn_commit);
	n += snprintf(buff + n, bufsiz - n, "txn_abort                        %"PRIu64"\n", engstat.txn_abort);
	n += snprintf(buff + n, bufsiz - n, "txn_close                        %"PRIu64"\n", engstat.txn_close);
2282 2283
	n += snprintf(buff + n, bufsiz - n, "txn_num_open                     %"PRIu64"\n", engstat.txn_num_open);
	n += snprintf(buff + n, bufsiz - n, "txn_max_open                     %"PRIu64"\n", engstat.txn_max_open);
2284 2285 2286 2287 2288 2289 2290 2291 2292 2293
	n += snprintf(buff + n, bufsiz - n, "txn_oldest_live                  %"PRIu64"\n", engstat.txn_oldest_live);
	n += snprintf(buff + n, bufsiz - n, "next_lsn                         %"PRIu64"\n", engstat.next_lsn);
	n += snprintf(buff + n, bufsiz - n, "cachetable_lock_taken            %"PRIu64"\n", engstat.cachetable_lock_taken);
	n += snprintf(buff + n, bufsiz - n, "cachetable_lock_released         %"PRIu64"\n", engstat.cachetable_lock_released);
	n += snprintf(buff + n, bufsiz - n, "cachetable_hit                   %"PRIu64"\n", engstat.cachetable_hit);
	n += snprintf(buff + n, bufsiz - n, "cachetable_miss                  %"PRIu64"\n", engstat.cachetable_miss);
	n += snprintf(buff + n, bufsiz - n, "cachetable_misstime              %"PRIu64"\n", engstat.cachetable_misstime);
	n += snprintf(buff + n, bufsiz - n, "cachetable_waittime              %"PRIu64"\n", engstat.cachetable_waittime);
	n += snprintf(buff + n, bufsiz - n, "cachetable_wait_reading          %"PRIu64"\n", engstat.cachetable_wait_reading);
	n += snprintf(buff + n, bufsiz - n, "cachetable_wait_writing          %"PRIu64"\n", engstat.cachetable_wait_writing);
2294
	n += snprintf(buff + n, bufsiz - n, "cachetable_evictions             %"PRIu64"\n", engstat.cachetable_evictions);
Leif Walsh's avatar
Leif Walsh committed
2295
        n += snprintf(buff + n, bufsiz - n, "cleaner_executions               %"PRIu64"\n", engstat.cleaner_executions);
2296 2297 2298 2299 2300 2301
	n += snprintf(buff + n, bufsiz - n, "puts                             %"PRIu64"\n", engstat.puts);
	n += snprintf(buff + n, bufsiz - n, "prefetches                       %"PRIu64"\n", engstat.prefetches);
	n += snprintf(buff + n, bufsiz - n, "maybe_get_and_pins               %"PRIu64"\n", engstat.maybe_get_and_pins);
	n += snprintf(buff + n, bufsiz - n, "maybe_get_and_pin_hits           %"PRIu64"\n", engstat.maybe_get_and_pin_hits);
	n += snprintf(buff + n, bufsiz - n, "cachetable_size_current          %"PRId64"\n", engstat.cachetable_size_current);
	n += snprintf(buff + n, bufsiz - n, "cachetable_size_limit            %"PRId64"\n", engstat.cachetable_size_limit);
2302
	n += snprintf(buff + n, bufsiz - n, "cachetable_size_max              %"PRId64"\n", engstat.cachetable_size_max);
2303 2304
	n += snprintf(buff + n, bufsiz - n, "cachetable_size_leaf             %"PRIu64"\n", engstat.cachetable_size_leaf);
	n += snprintf(buff + n, bufsiz - n, "cachetable_size_nonleaf          %"PRIu64"\n", engstat.cachetable_size_nonleaf);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
2305
	n += snprintf(buff + n, bufsiz - n, "cachetable_size_rollback         %"PRIu64"\n", engstat.cachetable_size_rollback);
2306 2307 2308 2309 2310
	n += snprintf(buff + n, bufsiz - n, "cachetable_size_writing          %"PRId64"\n", engstat.cachetable_size_writing);
	n += snprintf(buff + n, bufsiz - n, "get_and_pin_footprint            %"PRId64"\n", engstat.get_and_pin_footprint);
	n += snprintf(buff + n, bufsiz - n, "local_checkpoint                 %"PRId64"\n", engstat.local_checkpoint);
	n += snprintf(buff + n, bufsiz - n, "local_checkpoint_files           %"PRId64"\n", engstat.local_checkpoint_files);
	n += snprintf(buff + n, bufsiz - n, "local_checkpoint_during_checkpoint  %"PRId64"\n", engstat.local_checkpoint_during_checkpoint);
2311 2312
	n += snprintf(buff + n, bufsiz - n, "range_locks_max                  %"PRIu64"\n", engstat.range_locks_max);
	n += snprintf(buff + n, bufsiz - n, "range_locks_curr                 %"PRIu64"\n", engstat.range_locks_curr);
2313 2314
	n += snprintf(buff + n, bufsiz - n, "range_locks_max_memory           %"PRIu64"\n", engstat.range_locks_max_memory);
	n += snprintf(buff + n, bufsiz - n, "range_locks_curr_memory          %"PRIu64"\n", engstat.range_locks_curr_memory);
2315 2316
	n += snprintf(buff + n, bufsiz - n, "range_locks_escalation_successes %"PRIu64"\n", engstat.range_lock_escalation_successes);
	n += snprintf(buff + n, bufsiz - n, "range_locks_escalation_failures  %"PRIu64"\n", engstat.range_lock_escalation_failures);
2317 2318 2319 2320 2321 2322
	n += snprintf(buff + n, bufsiz - n, "range_read_locks                 %"PRIu64"\n", engstat.range_read_locks);
	n += snprintf(buff + n, bufsiz - n, "range_read_locks_fail            %"PRIu64"\n", engstat.range_read_locks_fail);
	n += snprintf(buff + n, bufsiz - n, "range_out_of_read_locks          %"PRIu64"\n", engstat.range_out_of_read_locks);
	n += snprintf(buff + n, bufsiz - n, "range_write_locks                %"PRIu64"\n", engstat.range_write_locks);
	n += snprintf(buff + n, bufsiz - n, "range_write_locks_fail           %"PRIu64"\n", engstat.range_write_locks_fail);
	n += snprintf(buff + n, bufsiz - n, "range_out_of_write_locks         %"PRIu64"\n", engstat.range_out_of_write_locks);
2323 2324 2325 2326 2327
	n += snprintf(buff + n, bufsiz - n, "range_lt_create                  %"PRIu64"\n", engstat.range_lt_create);
	n += snprintf(buff + n, bufsiz - n, "range_lt_create_fail             %"PRIu64"\n", engstat.range_lt_create_fail);
	n += snprintf(buff + n, bufsiz - n, "range_lt_destroy                 %"PRIu64"\n", engstat.range_lt_destroy);
	n += snprintf(buff + n, bufsiz - n, "range_lt_num                     %"PRIu64"\n", engstat.range_lt_num);
	n += snprintf(buff + n, bufsiz - n, "range_lt_num_max                 %"PRIu64"\n", engstat.range_lt_num_max);
2328 2329 2330 2331
	n += snprintf(buff + n, bufsiz - n, "inserts                          %"PRIu64"\n", engstat.inserts);
	n += snprintf(buff + n, bufsiz - n, "inserts_fail                     %"PRIu64"\n", engstat.inserts_fail);
	n += snprintf(buff + n, bufsiz - n, "deletes                          %"PRIu64"\n", engstat.deletes);
	n += snprintf(buff + n, bufsiz - n, "deletes_fail                     %"PRIu64"\n", engstat.deletes_fail);
2332 2333
	n += snprintf(buff + n, bufsiz - n, "updates                          %"PRIu64"\n", engstat.updates);
	n += snprintf(buff + n, bufsiz - n, "updates_fail                     %"PRIu64"\n", engstat.updates_fail);
2334 2335 2336 2337
	n += snprintf(buff + n, bufsiz - n, "updates_broadcast                %"PRIu64"\n", engstat.updates_broadcast);
	n += snprintf(buff + n, bufsiz - n, "updates_broadcast_fail           %"PRIu64"\n", engstat.updates_broadcast_fail);
	n += snprintf(buff + n, bufsiz - n, "le_updates                       %"PRIu64"\n", engstat.le_updates);
	n += snprintf(buff + n, bufsiz - n, "le_updates_broadcast             %"PRIu64"\n", engstat.le_updates_broadcast);
2338
	n += snprintf(buff + n, bufsiz - n, "descriptor_set                   %"PRIu64"\n", engstat.descriptor_set);
2339 2340 2341
	n += snprintf(buff + n, bufsiz - n, "partial_fetch_hit                %"PRIu64"\n", engstat.partial_fetch_hit);
	n += snprintf(buff + n, bufsiz - n, "partial_fetch_miss               %"PRIu64"\n", engstat.partial_fetch_miss);
	n += snprintf(buff + n, bufsiz - n, "partial_fetch_compressed         %"PRIu64"\n", engstat.partial_fetch_compressed);
2342
	n += snprintf(buff + n, bufsiz - n, "partial_evictions_nonleaf        %"PRIu64"\n", engstat.partial_evictions_nonleaf);
2343
	n += snprintf(buff + n, bufsiz - n, "partial_evictions_leaf           %"PRIu64"\n", engstat.partial_evictions_leaf);
2344 2345
	n += snprintf(buff + n, bufsiz - n, "msn_discards                     %"PRIu64"\n", engstat.msn_discards);
	n += snprintf(buff + n, bufsiz - n, "max_workdone                     %"PRIu64"\n", engstat.max_workdone);
2346 2347 2348 2349 2350 2351 2352
	n += snprintf(buff + n, bufsiz - n, "total_searches                   %"PRIu64"\n", engstat.total_searches);
	n += snprintf(buff + n, bufsiz - n, "total_retries                    %"PRIu64"\n", engstat.total_retries);
	n += snprintf(buff + n, bufsiz - n, "max_search_excess_retries        %"PRIu64"\n", engstat.max_search_excess_retries);
	n += snprintf(buff + n, bufsiz - n, "max_search_root_tries            %"PRIu64"\n", engstat.max_search_root_tries);
	n += snprintf(buff + n, bufsiz - n, "search_root_retries              %"PRIu64"\n", engstat.search_root_retries);
	n += snprintf(buff + n, bufsiz - n, "search_tries_gt_height           %"PRIu64"\n", engstat.search_tries_gt_height);
	n += snprintf(buff + n, bufsiz - n, "search_tries_gt_heightplus3      %"PRIu64"\n", engstat.search_tries_gt_heightplus3);
2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363
	n += snprintf(buff + n, bufsiz - n, "cleaner_total_nodes              %"PRIu64"\n", engstat.cleaner_total_nodes);
	n += snprintf(buff + n, bufsiz - n, "cleaner_h1_nodes                 %"PRIu64"\n", engstat.cleaner_h1_nodes);
	n += snprintf(buff + n, bufsiz - n, "cleaner_hgt1_nodes               %"PRIu64"\n", engstat.cleaner_hgt1_nodes);
	n += snprintf(buff + n, bufsiz - n, "cleaner_empty_nodes              %"PRIu64"\n", engstat.cleaner_empty_nodes);
	n += snprintf(buff + n, bufsiz - n, "cleaner_nodes_dirtied            %"PRIu64"\n", engstat.cleaner_nodes_dirtied);
	n += snprintf(buff + n, bufsiz - n, "cleaner_max_buffer_size          %"PRIu64"\n", engstat.cleaner_max_buffer_size);
	n += snprintf(buff + n, bufsiz - n, "cleaner_min_buffer_size          %"PRIu64"\n", engstat.cleaner_min_buffer_size);
	n += snprintf(buff + n, bufsiz - n, "cleaner_total_buffer_size        %"PRIu64"\n", engstat.cleaner_total_buffer_size);
	n += snprintf(buff + n, bufsiz - n, "cleaner_max_buffer_workdone      %"PRIu64"\n", engstat.cleaner_max_buffer_workdone);
	n += snprintf(buff + n, bufsiz - n, "cleaner_min_buffer_workdone      %"PRIu64"\n", engstat.cleaner_min_buffer_workdone);
	n += snprintf(buff + n, bufsiz - n, "cleaner_total_buffer_workdone    %"PRIu64"\n", engstat.cleaner_total_buffer_workdone);
2364
        n += snprintf(buff + n, bufsiz - n, "cleaner_num_leaves_unmerged      %"PRIu64"\n", engstat.cleaner_num_leaves_unmerged);
Leif Walsh's avatar
Leif Walsh committed
2365
        n += snprintf(buff + n, bufsiz - n, "flush_total                      %"PRIu64"\n", engstat.flush_total);
2366
        n += snprintf(buff + n, bufsiz - n, "flush_in_memory                  %"PRIu64"\n", engstat.flush_in_memory);
Leif Walsh's avatar
Leif Walsh committed
2367 2368 2369 2370 2371 2372 2373 2374
        n += snprintf(buff + n, bufsiz - n, "flush_needed_io                  %"PRIu64"\n", engstat.flush_needed_io);
        n += snprintf(buff + n, bufsiz - n, "flush_cascades                   %"PRIu64"\n", engstat.flush_cascades);
        n += snprintf(buff + n, bufsiz - n, "flush_cascades_1                 %"PRIu64"\n", engstat.flush_cascades_1);
        n += snprintf(buff + n, bufsiz - n, "flush_cascades_2                 %"PRIu64"\n", engstat.flush_cascades_2);
        n += snprintf(buff + n, bufsiz - n, "flush_cascades_3                 %"PRIu64"\n", engstat.flush_cascades_3);
        n += snprintf(buff + n, bufsiz - n, "flush_cascades_4                 %"PRIu64"\n", engstat.flush_cascades_4);
        n += snprintf(buff + n, bufsiz - n, "flush_cascades_5                 %"PRIu64"\n", engstat.flush_cascades_5);
        n += snprintf(buff + n, bufsiz - n, "flush_cascades_gt_5              %"PRIu64"\n", engstat.flush_cascades_gt_5);
2375 2376 2377 2378
        n += snprintf(buff + n, bufsiz - n, "disk_flush_leaf                  %"PRIu64"\n", engstat.disk_flush_leaf); 
        n += snprintf(buff + n, bufsiz - n, "disk_flush_nonleaf               %"PRIu64"\n", engstat.disk_flush_nonleaf); 
        n += snprintf(buff + n, bufsiz - n, "disk_flush_leaf_for_checkpoint   %"PRIu64"\n", engstat.disk_flush_leaf_for_checkpoint); 
        n += snprintf(buff + n, bufsiz - n, "disk_flush_nonleaf_for_checkpoint %"PRIu64"\n", engstat.disk_flush_nonleaf_for_checkpoint); 
2379 2380
        n += snprintf(buff + n, bufsiz - n, "create_leaf                      %"PRIu64"\n", engstat.create_leaf); 
        n += snprintf(buff + n, bufsiz - n, "create_nonleaf                   %"PRIu64"\n", engstat.create_nonleaf); 
2381 2382
        n += snprintf(buff + n, bufsiz - n, "destroy_leaf                     %"PRIu64"\n", engstat.destroy_leaf); 
        n += snprintf(buff + n, bufsiz - n, "destroy_nonleaf                  %"PRIu64"\n", engstat.destroy_nonleaf); 
2383 2384 2385 2386 2387 2388 2389
        n += snprintf(buff + n, bufsiz - n, "split_leaf                       %"PRIu64"\n", engstat.split_leaf); 
        n += snprintf(buff + n, bufsiz - n, "split_nonleaf                    %"PRIu64"\n", engstat.split_nonleaf); 
        n += snprintf(buff + n, bufsiz - n, "merge_leaf                       %"PRIu64"\n", engstat.merge_leaf); 
        n += snprintf(buff + n, bufsiz - n, "merge_nonleaf                    %"PRIu64"\n", engstat.merge_nonleaf); 
        n += snprintf(buff + n, bufsiz - n, "dirty_leaf                       %"PRIu64"\n", engstat.dirty_leaf); 
        n += snprintf(buff + n, bufsiz - n, "dirty_nonleaf                    %"PRIu64"\n", engstat.dirty_nonleaf); 
        n += snprintf(buff + n, bufsiz - n, "balance_leaf                     %"PRIu64"\n", engstat.balance_leaf); 
2390 2391 2392 2393 2394 2395
        n += snprintf(buff + n, bufsiz - n, "msg_bytes_in                     %"PRIu64"\n", engstat.msg_bytes_in); 
        n += snprintf(buff + n, bufsiz - n, "msg_bytes_out                    %"PRIu64"\n", engstat.msg_bytes_out); 
        n += snprintf(buff + n, bufsiz - n, "msg_bytes_curr                   %"PRIu64"\n", engstat.msg_bytes_curr); 
        n += snprintf(buff + n, bufsiz - n, "msg_bytes_max                    %"PRIu64"\n", engstat.msg_bytes_max); 
        n += snprintf(buff + n, bufsiz - n, "msg_num                          %"PRIu64"\n", engstat.msg_num); 
        n += snprintf(buff + n, bufsiz - n, "msg_num_broadcast                %"PRIu64"\n", engstat.msg_num_broadcast); 
2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414
        n += snprintf(buff + n, bufsiz - n, "num_basements_decompressed_normal      %"PRIu64"\n", engstat.num_basements_decompressed_normal);
        n += snprintf(buff + n, bufsiz - n, "num_basements_decompressed_aggressive  %"PRIu64"\n", engstat.num_basements_decompressed_aggressive);
        n += snprintf(buff + n, bufsiz - n, "num_basements_decompressed_prefetch    %"PRIu64"\n", engstat.num_basements_decompressed_prefetch);
        n += snprintf(buff + n, bufsiz - n, "num_basements_decompressed_write       %"PRIu64"\n", engstat.num_basements_decompressed_write);
        n += snprintf(buff + n, bufsiz - n, "num_msg_buffer_decompressed_normal      %"PRIu64"\n", engstat.num_msg_buffer_decompressed_normal);
        n += snprintf(buff + n, bufsiz - n, "num_msg_buffer_decompressed_aggressive  %"PRIu64"\n", engstat.num_msg_buffer_decompressed_aggressive);
        n += snprintf(buff + n, bufsiz - n, "num_msg_buffer_decompressed_prefetch    %"PRIu64"\n", engstat.num_msg_buffer_decompressed_prefetch);
        n += snprintf(buff + n, bufsiz - n, "num_msg_buffer_decompressed_write       %"PRIu64"\n", engstat.num_msg_buffer_decompressed_write);
        n += snprintf(buff + n, bufsiz - n, "num_pivots_fetched_query               %"PRIu64"\n", engstat.num_pivots_fetched_query);
        n += snprintf(buff + n, bufsiz - n, "num_pivots_fetched_prefetch            %"PRIu64"\n", engstat.num_pivots_fetched_prefetch);
        n += snprintf(buff + n, bufsiz - n, "num_pivots_fetched_write               %"PRIu64"\n", engstat.num_pivots_fetched_write);
        n += snprintf(buff + n, bufsiz - n, "num_basements_fetched_normal           %"PRIu64"\n", engstat.num_basements_fetched_normal);
        n += snprintf(buff + n, bufsiz - n, "num_basements_fetched_aggressive       %"PRIu64"\n", engstat.num_basements_fetched_aggressive);
        n += snprintf(buff + n, bufsiz - n, "num_basements_fetched_prefetch         %"PRIu64"\n", engstat.num_basements_fetched_prefetch);
        n += snprintf(buff + n, bufsiz - n, "num_basements_fetched_write            %"PRIu64"\n", engstat.num_basements_fetched_write);
        n += snprintf(buff + n, bufsiz - n, "num_msg_buffer_fetched_normal           %"PRIu64"\n", engstat.num_msg_buffer_fetched_normal);
        n += snprintf(buff + n, bufsiz - n, "num_msg_buffer_fetched_aggressive       %"PRIu64"\n", engstat.num_msg_buffer_fetched_aggressive);
        n += snprintf(buff + n, bufsiz - n, "num_msg_buffer_fetched_prefetch         %"PRIu64"\n", engstat.num_msg_buffer_fetched_prefetch);
        n += snprintf(buff + n, bufsiz - n, "num_msg_buffer_fetched_write            %"PRIu64"\n", engstat.num_msg_buffer_fetched_write);
2415 2416 2417 2418 2419 2420
	n += snprintf(buff + n, bufsiz - n, "multi_inserts                    %"PRIu64"\n", engstat.multi_inserts);
	n += snprintf(buff + n, bufsiz - n, "multi_inserts_fail               %"PRIu64"\n", engstat.multi_inserts_fail);
	n += snprintf(buff + n, bufsiz - n, "multi_deletes                    %"PRIu64"\n", engstat.multi_deletes);
	n += snprintf(buff + n, bufsiz - n, "multi_deletes_fail               %"PRIu64"\n", engstat.multi_deletes_fail);
	n += snprintf(buff + n, bufsiz - n, "multi_updates                    %"PRIu64"\n", engstat.multi_updates);
	n += snprintf(buff + n, bufsiz - n, "multi_updates_fail               %"PRIu64"\n", engstat.multi_updates_fail);
2421 2422
	n += snprintf(buff + n, bufsiz - n, "point_queries                    %"PRIu64"\n", engstat.point_queries);
	n += snprintf(buff + n, bufsiz - n, "sequential_queries               %"PRIu64"\n", engstat.sequential_queries);
2423 2424 2425 2426
	n += snprintf(buff + n, bufsiz - n, "num_db_open                      %"PRIu64"\n", engstat.num_db_open);
	n += snprintf(buff + n, bufsiz - n, "num_db_close                     %"PRIu64"\n", engstat.num_db_close);
	n += snprintf(buff + n, bufsiz - n, "num_open_dbs                     %"PRIu64"\n", engstat.num_open_dbs);
	n += snprintf(buff + n, bufsiz - n, "max_open_dbs                     %"PRIu64"\n", engstat.max_open_dbs);
2427 2428 2429 2430
	n += snprintf(buff + n, bufsiz - n, "directory_read_locks             %"PRIu64"\n", engstat.directory_read_locks);
	n += snprintf(buff + n, bufsiz - n, "directory_read_locks_fail        %"PRIu64"\n", engstat.directory_read_locks_fail);
	n += snprintf(buff + n, bufsiz - n, "directory_write_locks            %"PRIu64"\n", engstat.directory_write_locks);
	n += snprintf(buff + n, bufsiz - n, "directory_write_locks_fail       %"PRIu64"\n", engstat.directory_write_locks_fail);
2431 2432 2433 2434 2435
	n += snprintf(buff + n, bufsiz - n, "fsync_count                      %"PRIu64"\n", engstat.fsync_count);
	n += snprintf(buff + n, bufsiz - n, "fsync_time                       %"PRIu64"\n", engstat.fsync_time);
	n += snprintf(buff + n, bufsiz - n, "logger ilock count               %"PRIu64"\n", engstat.logger_ilock_ctr);
	n += snprintf(buff + n, bufsiz - n, "logger olock count               %"PRIu64"\n", engstat.logger_olock_ctr);
	n += snprintf(buff + n, bufsiz - n, "logger swap count                %"PRIu64"\n", engstat.logger_swap_ctr);
2436 2437
	n += snprintf(buff + n, bufsiz - n, "logger panic                     %"PRIu64"\n", engstat.logger_panic);
	n += snprintf(buff + n, bufsiz - n, "logger panic_errno               %"PRIu64"\n", engstat.logger_panic_errno);
2438
	n += snprintf(buff + n, bufsiz - n, "enospc_most_recent               %s \n",       engstat.enospc_most_recent);
2439 2440 2441 2442 2443
	n += snprintf(buff + n, bufsiz - n, "enospc threads blocked           %"PRIu64"\n", engstat.enospc_threads_blocked);
	n += snprintf(buff + n, bufsiz - n, "enospc count                     %"PRIu64"\n", engstat.enospc_ctr);
	n += snprintf(buff + n, bufsiz - n, "enospc redzone ctr               %"PRIu64"\n", engstat.enospc_redzone_ctr);
	n += snprintf(buff + n, bufsiz - n, "enospc state                     %"PRIu64"\n", engstat.enospc_state);
	n += snprintf(buff + n, bufsiz - n, "loader_create                    %"PRIu64"\n", engstat.loader_create);
2444
	n += snprintf(buff + n, bufsiz - n, "loader_create_fail               %"PRIu64"\n", engstat.loader_create_fail);
2445
	n += snprintf(buff + n, bufsiz - n, "loader_put                       %"PRIu64"\n", engstat.loader_put);
2446
	n += snprintf(buff + n, bufsiz - n, "loader_put_fail                  %"PRIu64"\n", engstat.loader_put_fail);
2447 2448 2449
	n += snprintf(buff + n, bufsiz - n, "loader_close                     %"PRIu64"\n", engstat.loader_close);
	n += snprintf(buff + n, bufsiz - n, "loader_close_fail                %"PRIu64"\n", engstat.loader_close_fail);
	n += snprintf(buff + n, bufsiz - n, "loader_abort                     %"PRIu64"\n", engstat.loader_abort);
2450 2451
	n += snprintf(buff + n, bufsiz - n, "loader_current                   %"PRIu64"\n", engstat.loader_current);
	n += snprintf(buff + n, bufsiz - n, "loader_max                       %"PRIu64"\n", engstat.loader_max);
2452 2453
	n += snprintf(buff + n, bufsiz - n, "logsuppress                      %"PRIu64"\n", engstat.logsuppress);
	n += snprintf(buff + n, bufsiz - n, "logsuppressfail                  %"PRIu64"\n", engstat.logsuppressfail);
2454 2455 2456 2457 2458 2459 2460
	n += snprintf(buff + n, bufsiz - n, "indexer_create                   %"PRIu64"\n", engstat.indexer_create);
	n += snprintf(buff + n, bufsiz - n, "indexer_create_fail              %"PRIu64"\n", engstat.indexer_create_fail);
	n += snprintf(buff + n, bufsiz - n, "indexer_build                    %"PRIu64"\n", engstat.indexer_build);
	n += snprintf(buff + n, bufsiz - n, "indexer_build_fail               %"PRIu64"\n", engstat.indexer_build_fail);
	n += snprintf(buff + n, bufsiz - n, "indexer_close                    %"PRIu64"\n", engstat.indexer_close);
	n += snprintf(buff + n, bufsiz - n, "indexer_close_fail               %"PRIu64"\n", engstat.indexer_close_fail);
	n += snprintf(buff + n, bufsiz - n, "indexer_abort                    %"PRIu64"\n", engstat.indexer_abort);
2461 2462
	n += snprintf(buff + n, bufsiz - n, "indexer_current                  %"PRIu64"\n", engstat.indexer_current);
	n += snprintf(buff + n, bufsiz - n, "indexer_max                      %"PRIu64"\n", engstat.indexer_max);
2463 2464 2465 2466
	n += snprintf(buff + n, bufsiz - n, "upgrade_env_status               %"PRIu64"\n", engstat.upgrade_env_status);
	n += snprintf(buff + n, bufsiz - n, "upgrade_header                   %"PRIu64"\n", engstat.upgrade_header);
	n += snprintf(buff + n, bufsiz - n, "upgrade_nonleaf                  %"PRIu64"\n", engstat.upgrade_nonleaf);
	n += snprintf(buff + n, bufsiz - n, "upgrade_leaf                     %"PRIu64"\n", engstat.upgrade_leaf);
2467
	n += snprintf(buff + n, bufsiz - n, "optimized_for_upgrade            %"PRIu64"\n", engstat.optimized_for_upgrade);
2468 2469
	n += snprintf(buff + n, bufsiz - n, "original_ver                     %"PRIu64"\n", engstat.original_ver);
	n += snprintf(buff + n, bufsiz - n, "ver_at_startup                   %"PRIu64"\n", engstat.ver_at_startup);
2470
	n += snprintf(buff + n, bufsiz - n, "last_lsn_v13                     %"PRIu64"\n", engstat.last_lsn_v13);
2471
	n += snprintf(buff + n, bufsiz - n, "upgrade_v14_time                 %s \n",       engstat.upgrade_v14_time);
2472 2473 2474
	n += snprintf(buff + n, bufsiz - n, "malloc_count                     %"PRIu64"\n", engstat.malloc_count);
	n += snprintf(buff + n, bufsiz - n, "free_count                       %"PRIu64"\n", engstat.free_count);
	n += snprintf(buff + n, bufsiz - n, "realloc_count                    %"PRIu64"\n", engstat.realloc_count);
2475 2476
	n += snprintf(buff + n, bufsiz - n, "malloc_fail                      %"PRIu64"\n", engstat.malloc_fail);
	n += snprintf(buff + n, bufsiz - n, "realloc_fail                     %"PRIu64"\n", engstat.realloc_fail);
2477 2478 2479
	n += snprintf(buff + n, bufsiz - n, "mem_requested                    %"PRIu64"\n", engstat.mem_requested);
	n += snprintf(buff + n, bufsiz - n, "mem_used                         %"PRIu64"\n", engstat.mem_used);
	n += snprintf(buff + n, bufsiz - n, "mem_freed                        %"PRIu64"\n", engstat.mem_freed);
2480
	n += snprintf(buff + n, bufsiz - n, "max_mem_in_use                   %"PRIu64"\n", engstat.max_mem_in_use);
2481
	n += snprintf(buff + n, bufsiz - n, "mallocator_version               %s\n",        engstat.mallocator_version);
2482
    }
2483 2484 2485
    if (n > bufsiz) {
	char * errmsg = "BUFFER TOO SMALL\n";
	int len = strlen(errmsg) + 1;
2486
	(void) snprintf(buff + (bufsiz - 1) - len, len, "%s", errmsg);
2487 2488 2489 2490 2491
    }

    return r;
}

2492 2493 2494 2495
// Write engine status text into buff (at most buffsize bytes) for the
// environment recorded in most_recent_env.  Intended for the toku_assert
// logic, which has no env handle of its own.
//
// Returns the result of env_get_engine_status_text() when engine status is
// enabled.  Otherwise writes an explanatory message into buff and returns
// ENODATA.
static int
toku_maybe_get_engine_status_text (char * buff, int buffsize) {
    if (!engine_status_enable) {
        snprintf(buff, buffsize, "Engine status not available: disabled by user.  This should only happen in test programs.\n");
        return ENODATA;
    }
    return env_get_engine_status_text(most_recent_env, buff, buffsize);
}

2507 2508 2509 2510
// Set panic code and panic string if not already panicked,
// intended for use by toku_assert when about to abort().
static void 
toku_maybe_set_env_panic(int code, char * msg) {
2511 2512 2513 2514 2515
    if (code == 0) 
	code = -1;
    if (msg == NULL)
	msg = "Unknown cause from abort (failed assert)\n";
    env_is_panicked = code;  // disable library destructor no matter what
2516
    DB_ENV * env = most_recent_env;
2517 2518 2519 2520
    if (env && 
	env->i &&
	(env->i->is_panicked == 0)) {
	env_panic(env, code, msg);
2521 2522
    }
}
2523

2524 2525 2526 2527 2528 2529 2530
// DB_ENV->crash method: lets the handlerton report one of its own failed
// asserts to the fractal tree layer.  The env argument is unused; all other
// arguments are forwarded unchanged to toku_do_assert_fail().
static int
env_crash(DB_ENV * UU(db_env),
          const char* msg, const char * fun, const char* file, int line,
          int caller_errno) {
    toku_do_assert_fail(msg, fun, file, line, caller_errno);
    // Only here to give the function a return value for the compiler.
    return -1;
}

2531

Rich Prohaska's avatar
Rich Prohaska committed
2532 2533
static int locked_txn_begin(DB_ENV * env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags);

Yoni Fogel's avatar
Yoni Fogel committed
2534 2535 2536 2537
static int toku_db_lt_panic(DB* db, int r);

static toku_dbt_cmp toku_db_get_compare_fun(DB* db);

2538 2539
// Allocate a DB_ENV, install its method table, and create its internals
// (lock tree manager, logger, and the open_dbs OMT).
//
//   envp  - receives the new environment on success; untouched on failure.
//   flags - must be 0.
//
// Returns 0 on success, EINVAL for nonzero flags, ENOMEM when an allocation
// fails, or the error returned by toku_ltm_create / toku_logger_create /
// toku_omt_create.  As a side effect, engine_status_enable is set to 1 on
// every call, even a failing one.
static int
toku_env_create(DB_ENV ** envp, u_int32_t flags) {
    int r = ENOSYS;
    DB_ENV *newenv = NULL;

    engine_status_enable = 1;

    if (flags != 0) {
        r = EINVAL;
        goto cleanup;
    }

    MALLOC(newenv);
    if (newenv == 0) {
        r = ENOMEM;
        goto cleanup;
    }
    memset(newenv, 0, sizeof *newenv);

    // Methods bound to a locked_env_<name> wrapper of the same name.
    // (set_noticecall is left unset; its line was commented out.)
#define SENV(name) newenv->name = locked_env_ ## name
    SENV(dbremove);
    SENV(dbrename);
    SENV(set_default_bt_compare);
    SENV(set_update);
    SENV(set_generate_row_callback_for_put);
    SENV(set_generate_row_callback_for_del);
    SENV(put_multiple);
    SENV(del_multiple);
    SENV(update_multiple);
    SENV(checkpointing_set_period);
    SENV(checkpointing_get_period);
    SENV(cleaner_set_period);
    SENV(cleaner_get_period);
    SENV(cleaner_set_iterations);
    SENV(cleaner_get_iterations);
    SENV(open);
    SENV(close);
    SENV(log_flush);
    SENV(set_flags);
    SENV(set_data_dir);
    SENV(set_tmp_dir);
    SENV(set_verbose);
    SENV(set_lg_bsize);
    SENV(set_lg_dir);
    SENV(set_lg_max);
    SENV(get_lg_max);
    SENV(set_lk_max_locks);
    SENV(get_lk_max_locks);
    SENV(set_lk_max_memory);
    SENV(get_lk_max_memory);
    SENV(set_cachesize);
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR >= 3
    SENV(get_cachesize);
#endif
    SENV(set_lk_detect);
#if DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR <= 4
    SENV(set_lk_max);
#endif
    SENV(log_archive);
    SENV(txn_stat);
    SENV(set_redzone);
    SENV(create_indexer);
    SENV(create_loader);
    SENV(get_lock_timeout);
    SENV(set_lock_timeout);
#undef SENV

    // Methods bound directly to their implementation.
    newenv->err = (void (*)(const DB_ENV * env, int error, const char *fmt, ...)) toku_locked_env_err;
    newenv->checkpointing_postpone = env_checkpointing_postpone;
    newenv->checkpointing_resume = env_checkpointing_resume;
    newenv->checkpointing_begin_atomic_operation = env_checkpointing_begin_atomic_operation;
    newenv->checkpointing_end_atomic_operation = env_checkpointing_end_atomic_operation;
    newenv->get_engine_status = env_get_engine_status;
    newenv->get_engine_status_text = env_get_engine_status_text;
    newenv->crash = env_crash;  // handlerton's call to fractal tree layer on failed assert
    newenv->get_iname = env_get_iname;
    newenv->txn_checkpoint = toku_env_txn_checkpoint;
    newenv->set_errcall = toku_env_set_errcall;
    newenv->set_errfile = toku_env_set_errfile;
    newenv->set_errpfx = toku_env_set_errpfx;
    newenv->txn_begin = locked_txn_begin;

    // Internal state: zeroed, with every directory lock fd marked "not held".
    MALLOC(newenv->i);
    if (newenv->i == 0) {
        r = ENOMEM;
        goto cleanup;
    }
    memset(newenv->i, 0, sizeof *newenv->i);
    newenv->i->envdir_lockfd  = -1;
    newenv->i->datadir_lockfd = -1;
    newenv->i->logdir_lockfd  = -1;
    newenv->i->tmpdir_lockfd  = -1;
    env_init_open_txn(newenv);
    env_fs_init(newenv);

    r = toku_ltm_create(&newenv->i->ltm,
                        __toku_env_default_max_locks, __toku_env_default_max_lock_memory,
                        toku_db_lt_panic,
                        toku_db_get_compare_fun);
    if (r != 0) {
        goto cleanup;
    }
    toku_ltm_set_mutex(newenv->i->ltm, toku_ydb_mutex());

    r = toku_logger_create(&newenv->i->logger);
    if (r != 0) {
        goto cleanup;
    }
    assert(newenv->i->logger);

    r = toku_omt_create(&newenv->i->open_dbs);
    if (r != 0) {
        goto cleanup;
    }
    assert(newenv->i->open_dbs);

    *envp = newenv;
    r = 0;

cleanup:
    // On failure, undo whatever was built so far.
    // NOTE(review): the logger created above is not released on this path
    // (reachable if toku_omt_create fails) -- confirm whether that leaks.
    if (r != 0 && newenv != NULL) {
        if (newenv->i != NULL) {
            if (newenv->i->ltm) {
                toku_ltm_close(newenv->i->ltm);
            }
            if (newenv->i->open_dbs) {
                toku_omt_destroy(&newenv->i->open_dbs);
            }
            toku_free(newenv->i);
        }
        toku_free(newenv);
    }
    return r;
}

2659 2660
// Public entry point for creating an environment: runs toku_env_create()
// while holding the ydb lock and returns its result.
int
DB_ENV_CREATE_FUN (DB_ENV ** envp, u_int32_t flags) {
    int rval;

    toku_ydb_lock();
    rval = toku_env_create(envp, flags);
    toku_ydb_unlock();

    return rval;
}

2667 2668
// Release every lock tree recorded in the transaction's lth: unlock the
// transaction's txnid in each tree and, when that succeeds, drop the
// reference on the tree.  The lth is then closed and the transaction's lth
// pointer is cleared.
//
// Every tree is visited even after a failure.  Returns the first nonzero
// error seen, or 0 if there were none (including when the txn has no lth).
static int
toku_txn_release_locks(DB_TXN* txn) {
    assert(txn);

    toku_lth *held = db_txn_struct_i(txn)->lth;
    if (!held) {
        return 0;
    }

    int first_error = 0;
    toku_lock_tree *tree;

    toku_lth_start_scan(held);
    tree = toku_lth_next(held);
    while (tree) {
        int rc = toku_lt_unlock(tree, toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn));
        if (rc == 0) {
            // The reference is only dropped when the unlock succeeded.
            rc = toku_lt_remove_ref(tree);
        }
        if (rc != 0 && first_error == 0) {
            first_error = rc;
        }
        tree = toku_lth_next(held);
    }

    toku_lth_close(held);
    db_txn_struct_i(txn)->lth = NULL;

    return first_error;
}

2694 2695
// Yield the lock so someone else can work, and then reacquire the lock.
// Useful while processing commit or rollback logs, to allow others to access the system.
2696 2697
static void 
ydb_yield (voidfp f, void *fv, void *UU(v)) {
2698
    toku_ydb_unlock(); 
2699 2700
    if (f) 
        f(fv);
2701 2702 2703
    toku_ydb_lock();
}

2704 2705
static int 
toku_txn_commit(DB_TXN * txn, u_int32_t flags,
2706 2707
                TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra,
		bool release_multi_operation_client_lock) {
2708
    if (!txn) return EINVAL;
2709
    HANDLE_PANICKED_ENV(txn->mgrp);
Yoni Fogel's avatar
Yoni Fogel committed
2710
    //Recursively kill off children
2711 2712
    if (db_txn_struct_i(txn)->child) {
        //commit of child sets the child pointer to NULL
2713
        int r_child = toku_txn_commit(db_txn_struct_i(txn)->child, flags, NULL, NULL, false);
2714
        if (r_child !=0 && !toku_env_is_panicked(txn->mgrp)) {
2715
	    env_panic(txn->mgrp, r_child, "Recursive child commit failed during parent commit.\n");
2716
        }
Yoni Fogel's avatar
Yoni Fogel committed
2717 2718
        //In a panicked env, the child may not be removed from the list.
        HANDLE_PANICKED_ENV(txn->mgrp);
Yoni Fogel's avatar
Yoni Fogel committed
2719
    }
2720
    assert(!db_txn_struct_i(txn)->child);
Yoni Fogel's avatar
Yoni Fogel committed
2721 2722
    //Remove from parent
    if (txn->parent) {
2723 2724
        assert(db_txn_struct_i(txn->parent)->child == txn);
        db_txn_struct_i(txn->parent)->child=NULL;
Yoni Fogel's avatar
Yoni Fogel committed
2725
    }
2726
    env_remove_open_txn(txn->mgrp, txn);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
2727
    //toku_ydb_notef("flags=%d\n", flags);
2728 2729 2730 2731
    if (flags & DB_TXN_SYNC) {
        toku_txn_force_fsync_on_commit(db_txn_struct_i(txn)->tokutxn);
        flags &= ~DB_TXN_SYNC;
    }
2732
    int nosync = (flags & DB_TXN_NOSYNC)!=0 || (db_txn_struct_i(txn)->flags&DB_TXN_NOSYNC);
2733
    flags &= ~DB_TXN_NOSYNC;
Yoni Fogel's avatar
Yoni Fogel committed
2734 2735

    int r;
2736
    if (flags!=0)
2737 2738
	// frees the tokutxn
	// Calls ydb_yield(NULL) occasionally
2739
        //r = toku_logger_abort(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL);
2740 2741
        r = toku_txn_abort_txn(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL, poll, poll_extra,
			       release_multi_operation_client_lock);
Yoni Fogel's avatar
Yoni Fogel committed
2742
    else
2743 2744
	// frees the tokutxn
	// Calls ydb_yield(NULL) occasionally
2745
        //r = toku_logger_commit(db_txn_struct_i(txn)->tokutxn, nosync, ydb_yield, NULL);
2746
        r = toku_txn_commit_txn(db_txn_struct_i(txn)->tokutxn, nosync, ydb_yield, NULL,
2747
				poll, poll_extra, release_multi_operation_client_lock);
2748

2749
    if (r!=0 && !toku_env_is_panicked(txn->mgrp)) {
2750
	env_panic(txn->mgrp, r, "Error during commit.\n");
2751 2752 2753 2754 2755
    }
    //If panicked, we're done.
    HANDLE_PANICKED_ENV(txn->mgrp);
    assert(r==0);

2756
    // Close the logger after releasing the locks
2757
    r = toku_txn_release_locks(txn);
2758 2759 2760 2761 2762 2763 2764 2765 2766 2767
    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
    TOKULOGGER logger = txn->mgrp->i->logger;
    LSN do_fsync_lsn;
    BOOL do_fsync;
    //
    // quickie fix for 5.2.0, need to extract these variables so that
    // we can do the fsync after the close of txn. We need to do it 
    // after the close because if we do it before, there are race
    // conditions exposed by test_stress1.c (#4145, #4153)
    //
2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786
    // Here is what was going on. In Maxwell (5.1.X), we used to 
    // call toku_txn_maybe_fsync_log in between toku_txn_release_locks
    // and toku_txn_close_txn. As a result, the ydb lock was released
    // and retaken in between these two calls. This was wrong, as the 
    // two commands need to be atomic. The problem was that 
    // when the ydb lock was released, the locks that this txn took
    // were released, but the txn was not removed from the list of 
    // live transactions. This allowed the following sequence of events: 
    //  - another txn B comes and writes to some key this transaction wrote to
    //  - txn B successfully commits
    //  - read txn C comes along, sees this transaction in its live list,
    //     but NOT txn B, which came after this transaction.
    //     This is incorrect. When txn C comes across a leafentry that has been
    //     modified by both this transaction and B, it'll read B's value, even
    //     though it cannot read this transaction's value, which comes below
    //     B's value on the leafentry's stack. This behavior is incorrect.
    //  All of this happens while the ydb lock is yielded. This causes a failure
    //  in the test_stress tests.
    //
2787 2788 2789 2790
    toku_txn_get_fsync_info(ttxn, &do_fsync, &do_fsync_lsn);
    toku_txn_close_txn(ttxn);
    toku_txn_maybe_fsync_log(logger, do_fsync_lsn, do_fsync, ydb_yield, NULL);
    
2791
    // the toxutxn is freed, and we must free the rest. */
Yoni Fogel's avatar
Yoni Fogel committed
2792

2793 2794 2795
    //Promote list to parent (dbs that must close before abort)
    if (txn->parent) {
        //Combine lists.
2796 2797 2798
        while (!toku_list_empty(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort)) {
            struct toku_list *list = toku_list_pop(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort);
            toku_list_push(&db_txn_struct_i(txn->parent)->dbs_that_must_close_before_abort, list);
2799 2800 2801 2802
        }
    }
    else {
        //Empty the list
2803 2804
        while (!toku_list_empty(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort)) {
            toku_list_pop(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort);
2805 2806 2807
        }
    }

2808
    // The txn is no good after the commit even if the commit fails, so free it up.
2809 2810 2811
#if !TOKUDB_NATIVE_H
    toku_free(db_txn_struct_i(txn));
#endif
2812
    toku_free(txn);    txn = NULL;
Yoni Fogel's avatar
Yoni Fogel committed
2813
    if (flags!=0) return EINVAL;
2814
    return r;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
2815 2816
}

2817 2818
// DB_TXN->id is not implemented: after the panicked-env check, this reports
// through toku_ydb_barf() and aborts the process.  The trailing return only
// exists to give the function a value.
static u_int32_t
toku_txn_id(DB_TXN * txn) {
    HANDLE_PANICKED_ENV(txn->mgrp);

    toku_ydb_barf();
    abort();

    return (u_int32_t) -1;
}

2825 2826
static int 
toku_txn_abort(DB_TXN * txn,
2827 2828
               TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra,
	       bool release_multi_operation_client_lock) {
2829
    HANDLE_PANICKED_ENV(txn->mgrp);
2830
    //Recursively kill off children (abort or commit are both correct, commit is cheaper)
2831 2832
    if (db_txn_struct_i(txn)->child) {
        //commit of child sets the child pointer to NULL
2833
        int r_child = toku_txn_commit(db_txn_struct_i(txn)->child, DB_TXN_NOSYNC, NULL, NULL, false);
2834
        if (r_child !=0 && !toku_env_is_panicked(txn->mgrp)) {
2835
	    env_panic(txn->mgrp, r_child, "Recursive child commit failed during parent abort.\n");
2836
        }
Yoni Fogel's avatar
Yoni Fogel committed
2837 2838
        //In a panicked env, the child may not be removed from the list.
        HANDLE_PANICKED_ENV(txn->mgrp);
Yoni Fogel's avatar
Yoni Fogel committed
2839
    }
2840
    assert(!db_txn_struct_i(txn)->child);
Yoni Fogel's avatar
Yoni Fogel committed
2841 2842
    //Remove from parent
    if (txn->parent) {
2843 2844
        assert(db_txn_struct_i(txn->parent)->child == txn);
        db_txn_struct_i(txn->parent)->child=NULL;
Yoni Fogel's avatar
Yoni Fogel committed
2845
    }
2846
    env_remove_open_txn(txn->mgrp, txn);
2847 2848

    //All dbs that must close before abort, must now be closed
2849
    assert(toku_list_empty(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort));
2850

2851
    //int r = toku_logger_abort(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL);
2852
    int r = toku_txn_abort_txn(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL, poll, poll_extra, release_multi_operation_client_lock);
2853
    if (r!=0 && !toku_env_is_panicked(txn->mgrp)) {
2854
	env_panic(txn->mgrp, r, "Error during abort.\n");
2855 2856 2857 2858
    }
    HANDLE_PANICKED_ENV(txn->mgrp);
    assert(r==0);
    r = toku_txn_release_locks(txn);
2859 2860
    //toku_logger_txn_close(db_txn_struct_i(txn)->tokutxn);
    toku_txn_close_txn(db_txn_struct_i(txn)->tokutxn);
Yoni Fogel's avatar
Yoni Fogel committed
2861

2862 2863 2864
#if !TOKUDB_NATIVE_H
    toku_free(db_txn_struct_i(txn));
#endif
2865
    toku_free(txn);
2866
    return r;
2867 2868
}

2869 2870
// DB_ENV->txn_begin method: calls toku_txn_begin() with internal == 0 while
// holding the ydb lock and returns its result.
static int
locked_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags) {
    int rval;

    toku_ydb_lock();
    rval = toku_txn_begin(env, stxn, txn, flags, 0);
    toku_ydb_unlock();

    return rval;
}

2874 2875
// DB_TXN->id method: calls toku_txn_id() while holding the ydb lock and
// returns its result.
static u_int32_t
locked_txn_id(DB_TXN *txn) {
    u_int32_t id;

    toku_ydb_lock();
    id = toku_txn_id(txn);
    toku_ydb_unlock();

    return id;
}

2879 2880
// Allocate a struct txn_stat into *txn_stat (via XMALLOC) and fill in its
// rollback_raw_count from the transaction's tokutxn.
// Returns the result of toku_logger_txn_rollback_raw_count().
static int
toku_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) {
    XMALLOC(*txn_stat);

    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
    return toku_logger_txn_rollback_raw_count(ttxn, &(*txn_stat)->rollback_raw_count);
}

2885 2886
// DB_TXN->txn_stat method: calls toku_txn_stat() while holding the ydb lock
// and returns its result.
//
// The result is held in an int, matching both toku_txn_stat()'s return type
// and this function's own, so it is never converted through an unsigned type.
static int
locked_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) {
    toku_ydb_lock();
    int r = toku_txn_stat(txn, txn_stat);
    toku_ydb_unlock();
    return r;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
2890
static int
2891 2892
locked_txn_commit_with_progress(DB_TXN *txn, u_int32_t flags,
                                TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
2893 2894 2895 2896 2897 2898
    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
    //
    // We must unpin rollback log, otherwise, another thread that tries to checkpoint during commit
    // will grab the multi operation lock, and then not be able to complete the checkpoint because
    // this thread has its rollback log pinned and is trying to grab the multi operation lock.
    //
2899 2900 2901 2902 2903
    // We grab the ydb lock because the checkpoint thread also unpins inprogress rollback logs,
    // so the ydb lock protects a race of both this thread and the checkpoint thread unpinning the
    // inprogress rollback log. If we want, we can probably have the checkpoint thread to not
    // unpin inprogress rollback logs, making this ydb lock grab unnecessary.
    //
2904
    toku_ydb_lock();
Zardosht Kasheff's avatar
Zardosht Kasheff committed
2905
    int r = toku_unpin_inprogress_rollback_log(ttxn);
2906
    toku_ydb_unlock();
Zardosht Kasheff's avatar
Zardosht Kasheff committed
2907 2908 2909 2910
    assert(r==0);
    if (toku_txn_requires_checkpoint(ttxn)) {
        toku_checkpoint(txn->mgrp->i->cachetable, txn->mgrp->i->logger, NULL, NULL, NULL, NULL);
    }
2911
    toku_multi_operation_client_lock(); //Cannot checkpoint during a commit.
2912 2913 2914
    toku_ydb_lock();
    r = toku_txn_commit(txn, flags, poll, poll_extra, true); // the final 'true' says to release the multi_operation_client_lock
    toku_ydb_unlock();
2915
    return r;
Rich Prohaska's avatar
Rich Prohaska committed
2916 2917
}

2918 2919 2920
// DB_TXN->abort_with_progress method: takes the multi-operation client lock
// and then the ydb lock, and aborts the transaction.  The abort is asked to
// release the multi-operation client lock.
// Returns the result of toku_txn_abort().
static int
locked_txn_abort_with_progress(DB_TXN *txn,
                               TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
    int abort_rc;

    // Cannot checkpoint during an abort.
    toku_multi_operation_client_lock();

    toku_ydb_lock();
    // The final 'true' says to release the multi_operation_client_lock.
    abort_rc = toku_txn_abort(txn, poll, poll_extra, true);
    toku_ydb_unlock();

    return abort_rc;
}

2928 2929
// DB_TXN->commit method: a commit with no progress callback.
// Returns the result of locked_txn_commit_with_progress().
static int
locked_txn_commit(DB_TXN *txn, u_int32_t flags) {
    return locked_txn_commit_with_progress(txn, flags, NULL, NULL);
}

2935 2936
// DB_TXN->abort method: an abort with no progress callback.
// Returns the result of locked_txn_abort_with_progress().
static int
locked_txn_abort(DB_TXN *txn) {
    return locked_txn_abort_with_progress(txn, NULL, NULL);
}

2942 2943
static int 
toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags, int internal) {
2944
    HANDLE_PANICKED_ENV(env);
2945
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, stxn); //Cannot create child while child already exists.
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
2946 2947
    if (!toku_logger_is_open(env->i->logger)) return toku_ydb_do_error(env, EINVAL, "Environment does not have logging enabled\n");
    if (!(env->i->open_flags & DB_INIT_TXN))  return toku_ydb_do_error(env, EINVAL, "Environment does not have transactions enabled\n");
2948 2949
    u_int32_t txn_flags = 0;
    txn_flags |= DB_TXN_NOWAIT; //We do not support blocking locks.
2950
    TOKU_ISOLATION child_isolation = TOKU_ISO_SERIALIZABLE;
2951 2952 2953 2954 2955 2956 2957 2958 2959
    u_int32_t iso_flags = flags & DB_ISOLATION_FLAGS;
    if (!(iso_flags == 0 || 
          iso_flags == DB_TXN_SNAPSHOT || 
          iso_flags == DB_READ_COMMITTED || 
          iso_flags == DB_READ_UNCOMMITTED || 
          iso_flags == DB_SERIALIZABLE || 
          iso_flags == DB_INHERIT_ISOLATION)
       ) 
    {
2960 2961 2962
        return toku_ydb_do_error(
            env, 
            EINVAL, 
2963
            "Invalid isolation flags set\n"
2964 2965
            );
    }
2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000
    flags &= ~iso_flags;

    if (internal && stxn) {
        child_isolation = db_txn_struct_i(stxn)->iso;
    }
    else {
        switch (iso_flags) {
            case (DB_INHERIT_ISOLATION):
                if (stxn) {
                    child_isolation = db_txn_struct_i(stxn)->iso;
                }
                else {
                    return toku_ydb_do_error(
                        env, 
                        EINVAL, 
                        "Cannot set DB_INHERIT_ISOLATION when no parent exists\n"
                        );                    
                }
                break;
            case (DB_READ_COMMITTED):
                child_isolation = TOKU_ISO_READ_COMMITTED;
                break;
            case (DB_READ_UNCOMMITTED):
                child_isolation = TOKU_ISO_READ_UNCOMMITTED;
                break;
            case (DB_TXN_SNAPSHOT):
                child_isolation = TOKU_ISO_SNAPSHOT;
                break;
            case (DB_SERIALIZABLE):
            case (0):
                child_isolation = TOKU_ISO_SERIALIZABLE;
                break;
            default:
                assert(FALSE); // error path is above, so this should not happen
                break;
3001
        }
3002
    }
3003 3004 3005 3006 3007 3008 3009
    if (stxn && child_isolation != db_txn_struct_i(stxn)->iso) {
        return toku_ydb_do_error(
            env, 
            EINVAL, 
            "Cannot set isolation level of transaction to something different \
                isolation level\n"
            );   
3010
    }
3011

3012 3013 3014 3015 3016 3017 3018 3019 3020 3021
    if (flags&DB_TXN_NOWAIT) {
        txn_flags |=  DB_TXN_NOWAIT;
        flags     &= ~DB_TXN_NOWAIT;
    }
    if (flags&DB_TXN_NOSYNC) {
        txn_flags |=  DB_TXN_NOSYNC;
        flags     &= ~DB_TXN_NOSYNC;
    }
    if (flags!=0) return toku_ydb_do_error(env, EINVAL, "Invalid flags passed to DB_ENV->txn_begin\n");

3022 3023 3024 3025
    struct __toku_db_txn_external *XMALLOC(eresult); // so the internal stuff is stuck on the end.
    memset(eresult, 0, sizeof(*eresult));
    DB_TXN *result = &eresult->external_part;

Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
3026
    //toku_ydb_notef("parent=%p flags=0x%x\n", stxn, flags);
3027
    result->mgrp = env;
3028 3029 3030 3031 3032 3033 3034
#define STXN(name) result->name = locked_txn_ ## name
    STXN(abort);
    STXN(commit);
    STXN(abort_with_progress);
    STXN(commit_with_progress);
    STXN(id);
#undef STXN
3035
    result->txn_stat = locked_txn_stat;
3036 3037 3038


    result->parent = stxn;
3039 3040 3041
#if !TOKUDB_NATIVE_H
    MALLOC(db_txn_struct_i(result));
    if (!db_txn_struct_i(result)) {
Yoni Fogel's avatar
Yoni Fogel committed
3042 3043 3044
        toku_free(result);
        return ENOMEM;
    }
3045 3046 3047
#endif
    memset(db_txn_struct_i(result), 0, sizeof *db_txn_struct_i(result));
    db_txn_struct_i(result)->flags = txn_flags;
3048
    db_txn_struct_i(result)->iso = child_isolation;
3049
    toku_list_init(&db_txn_struct_i(result)->dbs_that_must_close_before_abort);
Yoni Fogel's avatar
Yoni Fogel committed
3050 3051

    int r;
Yoni Fogel's avatar
Yoni Fogel committed
3052
    if (env->i->open_flags & DB_INIT_LOCK && !stxn) {
3053
        r = toku_lth_create(&db_txn_struct_i(result)->lth);
Yoni Fogel's avatar
Yoni Fogel committed
3054
        if (r!=0) {
3055 3056 3057
#if !TOKUDB_NATIVE_H
            toku_free(db_txn_struct_i(result));
#endif
Yoni Fogel's avatar
Yoni Fogel committed
3058 3059 3060
            toku_free(result);
            return r;
        }
Yoni Fogel's avatar
Yoni Fogel committed
3061 3062
    }
    
3063
    //r = toku_logger_txn_begin(stxn ? db_txn_struct_i(stxn)->tokutxn : 0, &db_txn_struct_i(result)->tokutxn, env->i->logger);
3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081
    TXN_SNAPSHOT_TYPE snapshot_type;
    switch(db_txn_struct_i(result)->iso){
        case(TOKU_ISO_SNAPSHOT):
        {
            snapshot_type = TXN_SNAPSHOT_ROOT;
            break;
        }
        case(TOKU_ISO_READ_COMMITTED):
        {
            snapshot_type = TXN_SNAPSHOT_CHILD;
            break;
        }
        default:
        {
            snapshot_type = TXN_SNAPSHOT_NONE;
            break;
        }
    }
3082 3083 3084 3085 3086 3087
    r = toku_txn_begin_txn(result,		   
			   stxn ? db_txn_struct_i(stxn)->tokutxn : 0, 
			   &db_txn_struct_i(result)->tokutxn, 
			   env->i->logger,
			   snapshot_type
			   );
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
3088 3089
    if (r != 0)
        return r;
3090

Yoni Fogel's avatar
Yoni Fogel committed
3091 3092
    //Add to the list of children for the parent.
    if (result->parent) {
3093 3094
        assert(!db_txn_struct_i(result->parent)->child);
        db_txn_struct_i(result->parent)->child = result;
Yoni Fogel's avatar
Yoni Fogel committed
3095
    }
3096
    env_add_open_txn(env, result);
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
3097 3098 3099 3100
    *txn = result;
    return 0;
}

3101 3102
// Stub: comparing two DB_LSN values is not implemented.
// The call is reported on stderr and then assert(0) fires, so the unlock and
// the `return 0` below are reached only if the assertion is compiled out.
int
log_compare(const DB_LSN * a, const DB_LSN * b) {
    toku_ydb_lock();
    // %p expects a pointer to void, so cast the DB_LSN pointers explicitly.
    fprintf(stderr, "%s:%d log_compare(%p,%p)\n", __FILE__, __LINE__, (const void *) a, (const void *) b);
    assert(0);
    toku_ydb_unlock();
    return 0;
}

3110 3111
static void env_note_zombie_db_closed(DB_ENV *env, DB *db);

3112 3113
// Final teardown of a DB handle; toku_db_close() hands this function to
// toku_brt_db_delay_closed() as the close callback.
//
// If the db was opened as a named (user) dictionary it is first removed from
// the environment's zombie bookkeeping, then the underlying brt is closed.
//   - On success the lock-tree reference (if any) is dropped and the handle
//     and everything it owns are freed.
//   - On failure the environment is panicked, the error is reported, and the
//     handle is NOT freed.
// Returns the result of toku_close_brt().  `flags` is unused.
static int
db_close_before_brt(DB *db, u_int32_t UU(flags)) {
    int r;
    char *error_string = NULL;

    if (db_opened(db) && db->i->dname) {
        // internal (non-user) dictionary has no dname
        env_note_zombie_db_closed(db->dbenv, db);  // tell env that this db is no longer a zombie (it is completely closed)
    }
    r = toku_close_brt(db->i->brt, &error_string);
    if (r) {
        if (!error_string)
            error_string = "Closing file\n";
        // Panicking the whole environment may be overkill, but I'm not sure what else to do.
        env_panic(db->dbenv, r, error_string);
        toku_ydb_do_error(db->dbenv, r, "%s", error_string);
    }
    else {
        if (db->i->lt) {
            toku_lt_remove_db_ref(db->i->lt, db);
        }
        toku_sdbt_cleanup(&db->i->skey);
        toku_sdbt_cleanup(&db->i->sval);
        if (db->i->dname) toku_free(db->i->dname);
        toku_free(db->i);
        toku_free(db);
    }
    return r;
}

3143 3144 3145 3146 3147 3148 3149 3150
// OMT heaviside function used to locate an exact DB handle in env->i->open_dbs.
// Entries are ordered by (dname, is_zombie, handle address):
//   return 0  if v and dbv refer to the same db (including same dname)
//   return <0 if v is earlier in the omt than dbv
//   return >0 if v is later in the omt than dbv
static int
find_db_by_db (OMTVALUE v, void *dbv) {
    DB *db = v;            // DB* that is stored in the omt
    DB *dbfind = dbv;      // extra, to be compared to v
    int cmp;
    const char *dname     = db->i->dname;
    const char *dnamefind = dbfind->i->dname;
    // Primary key: dictionary name.
    cmp = strcmp(dname, dnamefind);
    if (cmp != 0) return cmp;
    // Secondary key: open handles (0) sort before zombie handles (1).
    int is_zombie     = db->i->is_zombie != 0;
    int is_zombiefind = dbfind->i->is_zombie != 0;
    cmp = is_zombie - is_zombiefind;
    if (cmp != 0) return cmp;
    // Tie-break on the handle address so several handles with the same dname can coexist.
    if (db < dbfind) return -1;
    if (db > dbfind) return  1;
    return 0;
}

// Tell env that there is a new db handle (with non-unique dname in db->i->dname).
// Updates the open-db counters (per-env and the file-level statistics) and
// inserts the handle into env->i->open_dbs.  The handle must have a dname,
// must not be a zombie, and must not already be in the omt.
static void
env_note_db_opened(DB_ENV *env, DB *db) {
    assert(db->i->dname);  // internal (non-user) dictionary has no dname
    assert(!db->i->is_zombie);
    int r;
    OMTVALUE dbv;
    uint32_t idx;
    env->i->num_open_dbs++;
    num_open_dbs = env->i->num_open_dbs;
    num_db_open++;
    if (num_open_dbs > max_open_dbs)
        max_open_dbs = num_open_dbs;
    // The lookup is expected to fail; idx is then used as the insertion position.
    r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx);
    assert(r==DB_NOTFOUND); //Must not already be there.
    r = toku_omt_insert_at(env->i->open_dbs, db, idx);
    assert(r==0);
}

// Tell env that an open (non-zombie) db handle is being closed by the user.
// Updates the open-db counters and removes the handle from env->i->open_dbs.
// The handle must have a dname, must not be a zombie, and must be in the omt.
static void
env_note_db_closed(DB_ENV *env, DB *db) {
    assert(db->i->dname);
    assert(!db->i->is_zombie);
    assert(env->i->num_open_dbs);
    int r;
    OMTVALUE dbv;
    uint32_t idx;
    env->i->num_open_dbs--;
    num_open_dbs = env->i->num_open_dbs;
    num_db_close++;
    r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx);
    assert(r==0); //Must already be there.
    assert((DB*)dbv == db);
    r = toku_omt_delete_at(env->i->open_dbs, idx);
    assert(r==0);
}

// Tell env that a db handle has become a zombie.
// Increments env->i->num_zombie_dbs and inserts the handle into
// env->i->open_dbs, which holds zombie handles as well as open ones.
// The handle must have a dname, must already be flagged is_zombie, and must
// not already be in the omt.
static void
env_note_zombie_db(DB_ENV *env, DB *db) {
    assert(db->i->dname);  // internal (non-user) dictionary has no dname
    assert(db->i->is_zombie);
    int r;
    OMTVALUE dbv;
    uint32_t idx;
    env->i->num_zombie_dbs++;
    // The lookup is expected to fail; idx is then used as the insertion position.
    r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx);
    assert(r==DB_NOTFOUND); //Must not already be there.
    r = toku_omt_insert_at(env->i->open_dbs, db, idx);
    assert(r==0);
}

// Tell env that a zombie db handle is now completely closed.
// Decrements env->i->num_zombie_dbs and removes the handle from
// env->i->open_dbs.  The handle must have a dname, must be flagged is_zombie,
// and must be in the omt.
static void
env_note_zombie_db_closed(DB_ENV *env, DB *db) {
    assert(db->i->dname);
    assert(db->i->is_zombie);
    assert(env->i->num_zombie_dbs);
    int r;
    OMTVALUE dbv;
    uint32_t idx;
    env->i->num_zombie_dbs--;
    r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx);
    assert(r==0); //Must already be there.
    assert((DB*)dbv == db);
    r = toku_omt_delete_at(env->i->open_dbs, idx);
    assert(r==0);
}

static int
3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249
find_zombie_db_by_dname (OMTVALUE v, void *dnamev) {
    DB *db = v;            // DB* that is stored in the omt
    int cmp;
    const char *dname     = db->i->dname;
    const char *dnamefind = dnamev;
    cmp = strcmp(dname, dnamefind);
    if (cmp != 0) return cmp;
    int is_zombie     = db->i->is_zombie != 0;
    int is_zombiefind = 1;
    cmp = is_zombie - is_zombiefind;
    return cmp;
}

// OMT heaviside function: matches any *open* (non-zombie) db whose dname
// equals dnamev.  Uses the same (dname, is_zombie) ordering as find_db_by_db
// but ignores the handle address.
static int
find_open_db_by_dname (OMTVALUE v, void *dnamev) {
    DB *db = v;            // DB* that is stored in the omt
    int cmp;
    const char *dname     = db->i->dname;
    const char *dnamefind = dnamev;
    cmp = strcmp(dname, dnamefind);
    if (cmp != 0) return cmp;
    int is_zombie     = db->i->is_zombie != 0;
    int is_zombiefind = 0;   // looking for open (non-zombie) handles only
    cmp = is_zombie - is_zombiefind;
    return cmp;
}

// Return TRUE if env currently tracks at least one open (non-zombie) db
// handle with the given dname, FALSE otherwise.
static BOOL
env_is_db_with_dname_open(DB_ENV *env, const char *dname) {
    int r;
    BOOL rval;
    OMTVALUE dbv;
    uint32_t idx;
    r = toku_omt_find_zero(env->i->open_dbs, find_open_db_by_dname, (void*)dname, &dbv, &idx);
    if (r==0) {
        DB *db = dbv;
        assert(strcmp(dname, db->i->dname) == 0);
        assert(!db->i->is_zombie);
        rval = TRUE;
    }
    else {
        assert(r==DB_NOTFOUND);
        rval = FALSE;
    }
    return rval;
}

3281 3282 3283 3284 3285 3286 3287
// Return a zombie db handle tracked by env whose dname equals the given
// dname, or NULL if there is none.
static DB*
env_get_zombie_db_with_dname(DB_ENV *env, const char *dname) {
    int r;
    DB* rval;
    OMTVALUE dbv;
    uint32_t idx;
    r = toku_omt_find_zero(env->i->open_dbs, find_zombie_db_by_dname, (void*)dname, &dbv, &idx);
    if (r==0) {
        DB *db = dbv;
        assert(db);
        assert(strcmp(dname, db->i->dname) == 0);
        assert(db->i->is_zombie);
        rval = db;
    }
    else {
        assert(r==DB_NOTFOUND);
        rval = NULL;
    }
    return rval;
}

3303 3304 3305 3306 3307 3308 3309 3310 3311 3312
// Take one more reference on db by bumping its internal reference count.
void
toku_db_add_ref(DB *db) {
    db->i->refs += 1;
}

// Drop one reference on db by decrementing its internal reference count.
// Nothing is freed here; toku_db_close() checks the count.
void
toku_db_release_ref(DB *db) {
    db->i->refs -= 1;
}

3313
//DB->close()
3314 3315
static int 
toku_db_close(DB * db, u_int32_t flags) {
3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329
    int r = 0;
    if (db->i->refs != 1) {
        r = EBUSY;
    } else {
        db->i->refs = 0;
        if (db_opened(db) && db->i->dname) {
            // internal (non-user) dictionary has no dname
            env_note_db_closed(db->dbenv, db);  // tell env that this db is no longer in use by the user of this api (user-closed, may still be in use by fractal tree internals)
            db->i->is_zombie = TRUE;
            env_note_zombie_db(db->dbenv, db);  // tell env that this db is a zombie
        }
        //Remove from transaction's list of 'must close' if necessary.
        if (!toku_list_empty(&db->i->dbs_that_must_close_before_abort))
            toku_list_remove(&db->i->dbs_that_must_close_before_abort);
3330

3331 3332
        r = toku_brt_db_delay_closed(db->i->brt, db, db_close_before_brt, flags);
    }
3333 3334 3335 3336
    return r;
}


3337
//Get the main portion of a cursor flag (excluding the bitwise or'd components).
3338 3339
static int 
get_main_cursor_flag(u_int32_t flags) {
3340
    return flags & DB_OPFLAGS_MASK;
Yoni Fogel's avatar
Yoni Fogel committed
3341 3342
}

3343 3344
static int 
get_nonmain_cursor_flags(u_int32_t flags) {
3345
    return flags & ~(DB_OPFLAGS_MASK);
3346 3347
}

3348 3349
// Report whether the brt cursor underlying c is uninitialized, as answered by
// toku_brt_cursor_uninitialized().
static inline BOOL
toku_c_uninitialized(DBC* c) {
    return toku_brt_cursor_uninitialized(dbc_struct_i(c)->c);
}
Yoni Fogel's avatar
Yoni Fogel committed
3352

3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363
// Context passed (as `extra`) to c_get_wrapper_callback: where to copy the
// key/value found by a cursor operation.
struct query_context_wrapped_t {
    DBT               *key;   // caller's key DBT
    DBT               *val;   // caller's val DBT
    struct simple_dbt *skey;  // cursor's key buffer, handed to toku_dbt_set with key
    struct simple_dbt *sval;  // cursor's val buffer, handed to toku_dbt_set with val
};
typedef struct query_context_wrapped_t *QUERY_CONTEXT_WRAPPED;
typedef struct query_context_wrapped_t  QUERY_CONTEXT_WRAPPED_S;

// Fill in a wrapped query context: remember the caller's key/val DBTs and
// the cursor's own skey/sval buffers.
static inline void
query_context_wrapped_init(QUERY_CONTEXT_WRAPPED context, DBC *c, DBT *key, DBT *val) {
    context->key  = key;
    context->val  = val;
    context->skey = dbc_struct_i(c)->skey;
    context->sval = dbc_struct_i(c)->sval;
}

// Callback used by the DBT-based cursor API: copies the found key and then
// the found value into the DBTs recorded in the QUERY_CONTEXT_WRAPPED passed
// as `extra`.  The value is copied only if copying the key succeeded.
// Returns the first non-zero result of toku_dbt_set, or 0.
static int
c_get_wrapper_callback(DBT const *key, DBT const *val, void *extra) {
    QUERY_CONTEXT_WRAPPED wrapped = extra;
    int rc = toku_dbt_set(key->size, key->data, wrapped->key, wrapped->skey);
    if (rc != 0)
        return rc;
    return toku_dbt_set(val->size, val->data, wrapped->val, wrapped->sval);
}

3377 3378
// Fetch the pair the cursor is currently bound to into key/val by running
// toku_c_getf_current_binding() with the DBT-copying wrapper callback.
// Returns whatever toku_c_getf_current_binding() returns.
static int
toku_c_get_current_unconditional(DBC* c, u_int32_t flags, DBT* key, DBT* val) {
    int r;
    QUERY_CONTEXT_WRAPPED_S context;
    query_context_wrapped_init(&context, c, key, val);
    r = toku_c_getf_current_binding(c, flags, c_get_wrapper_callback, &context);
    return r;
}

3386 3387
// Replace the component *get_flag inside the combined value *flag with
// new_flag: the old component is subtracted, *get_flag is updated, and the
// new component is added back.
// NOTE(review): this is arithmetic, not bitwise, so it presumably relies on
// *get_flag being a component already contained in *flag -- confirm at callers.
static inline void
toku_swap_flag(u_int32_t* flag, u_int32_t* get_flag, u_int32_t new_flag) {
    *flag    -= *get_flag;
    *get_flag =  new_flag;
    *flag    += *get_flag;
}

Yoni Fogel's avatar
Yoni Fogel committed
3393 3394 3395 3396 3397 3398 3399
/*
    Used for partial implementation of nested transactions.
    Work is done by children as normal, but all locking is done by the
    root of the nested txn tree.
    This may hold extra locks, and will not work as expected when
    a node has two non-completed txns at any time.
*/
3400 3401
static inline DB_TXN* 
toku_txn_ancestor(DB_TXN* txn) {
Yoni Fogel's avatar
Yoni Fogel committed
3402
    while (txn && txn->parent) txn = txn->parent;
Yoni Fogel's avatar
Yoni Fogel committed
3403

Yoni Fogel's avatar
Yoni Fogel committed
3404 3405 3406
    return txn;
}

Yoni Fogel's avatar
Yoni Fogel committed
3407 3408
static int toku_txn_add_lt(DB_TXN* txn, toku_lock_tree* lt);

Yoni Fogel's avatar
Yoni Fogel committed
3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419
/* c_get has many subfunctions with lots of parameters
 * this structure exists to simplify it. */
typedef struct {
    DBC*        c;                  // The cursor
    DB*         db;                 // db the cursor is iterating over
    DB_TXN*     txn_anc;            // The (root) ancestor of the transaction
    TXNID       id_anc;
    DBT         tmp_key;            // Temporary key to protect out param
    DBT         tmp_val;            // Temporary val to protect out param
    u_int32_t   flag;               // The c_get flag
    u_int32_t   op;                 // The operation portion of the c_get flag
Yoni Fogel's avatar
Yoni Fogel committed
3420
    u_int32_t   lock_flags;         // The prelock flags.
Yoni Fogel's avatar
Yoni Fogel committed
3421 3422 3423 3424 3425 3426 3427 3428 3429 3430
    BOOL        cursor_is_write;    // Whether op can change position of cursor
    BOOL        key_is_read;        
    BOOL        key_is_write;
    BOOL        val_is_read;
    BOOL        val_is_write;
    BOOL        duplicates;
    BOOL        tmp_key_malloced;
    BOOL        tmp_val_malloced;
} C_GET_VARS;

3431

3432 3433
static inline u_int32_t 
get_prelocked_flags(u_int32_t flags) {
3434
    u_int32_t lock_flags = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE | DB_PRELOCKED_FILE_READ);
3435 3436
    return lock_flags;
}
3437

3438 3439
// Like get_prelocked_flags(), but also takes the cursor's isolation level
// into account: for any isolation level other than TOKU_ISO_SERIALIZABLE the
// result always has DB_PRELOCKED set, so such cursors are treated as already
// holding their read locks.
static inline u_int32_t
get_cursor_prelocked_flags(u_int32_t flags, DBC* dbc) {
    u_int32_t lock_flags = get_prelocked_flags(flags);

    //Non-serializable transactions (e.g. DB_READ_UNCOMMITTED and DB_READ_COMMITTED) 'own' all read locks for user-data dictionaries.
    if (dbc_struct_i(dbc)->iso != TOKU_ISO_SERIALIZABLE) {
        lock_flags |= DB_PRELOCKED;
    }
    return lock_flags;
}

3449 3450
// DBT-based cursor get.
// This function exists for legacy (test compatibility) purposes/parity with bdb.
// It splits `flag` into the main operation (DB_FIRST, DB_NEXT, DB_SET, ...)
// and the remaining modifier flags, then dispatches to the matching
// callback-based toku_c_getf_* function with c_get_wrapper_callback, which
// copies the result into key/val.
// Returns EINVAL for an unrecognized operation, otherwise the result of the
// toku_c_getf_* call.
static int
toku_c_get(DBC* c, DBT* key, DBT* val, u_int32_t flag) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    u_int32_t main_flag       = get_main_cursor_flag(flag);
    u_int32_t remaining_flags = get_nonmain_cursor_flags(flag);
    int r;
    QUERY_CONTEXT_WRAPPED_S context;
    //Passing in NULL for a key or val means that it is NOT an output.
    //    Both key and val are output:
    //        query_context_wrapped_init(&context, c, key,  val);
    //    Val is output, key is not:
    //        query_context_wrapped_init(&context, c, NULL, val);
    //    Neither key nor val are output:
    //        query_context_wrapped_init(&context, c, NULL, NULL); // Used for DB_GET_BOTH
    switch (main_flag) {
        case (DB_FIRST):
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_first(c, remaining_flags, c_get_wrapper_callback, &context);
            break;
        case (DB_LAST):
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_last(c, remaining_flags, c_get_wrapper_callback, &context);
            break;
        case (DB_NEXT):
        case (DB_NEXT_NODUP):
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_next(c, remaining_flags, c_get_wrapper_callback, &context);
            break;
        case (DB_PREV):
        case (DB_PREV_NODUP):
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_prev(c, remaining_flags, c_get_wrapper_callback, &context);
            break;
#ifdef DB_PREV_DUP
        case (DB_PREV_DUP):
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_prev_dup(c, remaining_flags, c_get_wrapper_callback, &context);
            break;
#endif
        case (DB_CURRENT):
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_current(c, remaining_flags, c_get_wrapper_callback, &context);
            break;
        case (DB_CURRENT_BINDING):
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_current_binding(c, remaining_flags, c_get_wrapper_callback, &context);
            break;

        case (DB_SET):
            // key is input only: it is passed as the search key and not as an output.
            query_context_wrapped_init(&context, c, NULL, val);
            r = toku_c_getf_set(c, remaining_flags, key, c_get_wrapper_callback, &context);
            break;
        case (DB_SET_RANGE):
            // key is both the search key and the output for the key that was found.
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_set_range(c, remaining_flags, key, c_get_wrapper_callback, &context);
            break;
        case (DB_SET_RANGE_REVERSE):
            query_context_wrapped_init(&context, c, key,  val);
            r = toku_c_getf_set_range_reverse(c, remaining_flags, key, c_get_wrapper_callback, &context);
            break;
        default:
            r = EINVAL;
            break;
    }
    return r;
}

3519 3520
// toku_c_getf_first() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_first(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}

3527 3528
// toku_c_getf_last() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_last(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
3531

3532 3533
// toku_c_getf_next() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_next(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}

3537 3538
// toku_c_getf_prev() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_prev(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}

3542 3543
// toku_c_getf_current() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_current(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}

3547 3548
// toku_c_getf_current_binding() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_current_binding(c, flag, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
3551

3552 3553
// toku_c_getf_set() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_set(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_set(c, flag, key, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
3556

3557 3558
// toku_c_getf_set_range() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_set_range(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_set_range(c, flag, key, f, extra);
    toku_ydb_unlock();
    return r;
}
Yoni Fogel's avatar
Yoni Fogel committed
3561

3562 3563
// toku_c_getf_set_range_reverse() run with the ydb lock held for the duration of the call.
static int
locked_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) {
    toku_ydb_lock();
    int r = toku_c_getf_set_range_reverse(c, flag, key, f, extra);
    toku_ydb_unlock();
    return r;
}

3567 3568 3569
// Get a range lock.
// Return when the range lock is acquired or the default lock tree timeout has expired.  
// The ydb mutex must be held when called and may be released when waiting in the lock tree.
3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582
static int
get_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key, toku_lock_type lock_type) {
    int r;
    DB_TXN *txn_anc = toku_txn_ancestor(txn);
    r = toku_txn_add_lt(txn_anc, db->i->lt);
    if (r == 0) {
        TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn);
        toku_lock_request lock_request;
        toku_lock_request_init(&lock_request, db, txn_anc_id, left_key, right_key, lock_type);
        r = toku_lt_acquire_lock_request_with_default_timeout_locked(db->i->lt, &lock_request);
        toku_lock_request_destroy(&lock_request);
    }
    return r;
3583 3584
}

3585
// Setup and start an asynchronous lock request.
3586
static int
3587
start_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key, toku_lock_type lock_type, toku_lock_request *lock_request) {
3588
    int r;
3589 3590 3591
    DB_TXN *txn_anc = toku_txn_ancestor(txn);
    r = toku_txn_add_lt(txn_anc, db->i->lt);
    if (r == 0) {
3592
        TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn);
3593 3594
        toku_lock_request_set(lock_request, db, txn_anc_id, left_key, right_key, lock_type);
        r = toku_lock_request_start_locked(lock_request, db->i->lt, true);
3595 3596
    }
    return r;
Yoni Fogel's avatar
Yoni Fogel committed
3597 3598
}

3599 3600 3601 3602 3603 3604 3605
// Acquire a write lock on the single point `key` in db: a range lock whose
// left and right ends are both `key`.  Returns the result of get_range_lock().
static int
get_point_write_lock(DB *db, DB_TXN *txn, const DBT *key) {
    return get_range_lock(db, txn, key, key, LOCK_REQUEST_WRITE);
}

// assume ydb is locked
3606
int
Zardosht Kasheff's avatar
Zardosht Kasheff committed
3607 3608
toku_grab_read_lock_on_directory (DB* db, DB_TXN * txn) {
    // bad hack because some environment dictionaries do not have a dname
3609
    char *dname = db->i->dname;
3610
    if (!dname || (db->dbenv->i->directory->i->lt == NULL))
Zardosht Kasheff's avatar
Zardosht Kasheff committed
3611
        return 0;
3612

Zardosht Kasheff's avatar
Zardosht Kasheff committed
3613
    //Left end of range == right end of range (point lock)
3614 3615
    DBT key_in_directory = { .data = dname, .size = strlen(dname)+1 };
    int r = get_range_lock(db->dbenv->i->directory, txn, &key_in_directory, &key_in_directory, LOCK_REQUEST_READ);
3616 3617 3618 3619
    if (r == 0)
	directory_read_locks++;
    else
	directory_read_locks_fail++;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
3620 3621 3622
    return r;
}

3623 3624
//This is the user level callback function given to ydb layer functions like
//toku_c_getf_first
Yoni Fogel's avatar
Yoni Fogel committed
3625

3626 3627 3628 3629
typedef struct query_context_base_t {
    BRT_CURSOR  c;
    DB_TXN     *txn;
    DB         *db;
3630
    YDB_CALLBACK_FUNCTION f;
3631 3632
    void       *f_extra;
    int         r_user_callback;
3633 3634
    BOOL        do_locking;
    BOOL        is_write_op;
3635
    toku_lock_request lock_request;
3636
} *QUERY_CONTEXT_BASE, QUERY_CONTEXT_BASE_S;
3637

3638 3639 3640
// Query context for operations that take no input key/val: just the base.
struct query_context_t {
    QUERY_CONTEXT_BASE_S  base;
};
typedef struct query_context_t *QUERY_CONTEXT;
typedef struct query_context_t  QUERY_CONTEXT_S;
Yoni Fogel's avatar
Yoni Fogel committed
3641

3642 3643 3644 3645 3646
// Query context for operations that are given an input key and/or val.
// `base` is the first member, as in query_context_t.
struct query_context_with_input_t {
    QUERY_CONTEXT_BASE_S  base;
    DBT                  *input_key;
    DBT                  *input_val;
};
typedef struct query_context_with_input_t *QUERY_CONTEXT_WITH_INPUT;
typedef struct query_context_with_input_t  QUERY_CONTEXT_WITH_INPUT_S;
Yoni Fogel's avatar
Yoni Fogel committed
3647

3648
static void
3649
query_context_base_init(QUERY_CONTEXT_BASE context, DBC *c, u_int32_t flag, BOOL is_write_op, YDB_CALLBACK_FUNCTION f, void *extra) {
3650 3651
    context->c       = dbc_struct_i(c)->c;
    context->txn     = dbc_struct_i(c)->txn;
3652
    context->db      = c->dbp;
3653
    context->f       = f;
3654
    context->f_extra = extra;
3655
    context->is_write_op = is_write_op;
3656
    u_int32_t lock_flags = get_cursor_prelocked_flags(flag, c);
3657 3658
    if (context->is_write_op) 
        lock_flags &= DB_PRELOCKED_WRITE; // Only care about whether already locked for write
Zardosht Kasheff's avatar
Zardosht Kasheff committed
3659
    context->do_locking = (BOOL)(context->db->i->lt!=NULL && !(lock_flags & (DB_PRELOCKED|DB_PRELOCKED_WRITE)));
3660
    context->r_user_callback = 0;
3661 3662 3663 3664 3665 3666
    toku_lock_request_default_init(&context->lock_request);
}

// Release the resources held by a query context initialized with
// query_context_base_init(): destroys its lock request.
static void
query_context_base_destroy(QUERY_CONTEXT_BASE context) {
    toku_lock_request_destroy(&context->lock_request);
}

3669
static void
3670 3671
query_context_init_read(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    BOOL is_write = FALSE;
3672
    query_context_base_init(&context->base, c, flag, is_write, f, extra);
3673 3674 3675
}

static void
3676 3677
query_context_init_write(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    BOOL is_write = TRUE;
3678
    query_context_base_init(&context->base, c, flag, is_write, f, extra);
3679
}
Yoni Fogel's avatar
Yoni Fogel committed
3680

3681 3682
// Initialize a query context that also carries the caller's input key/val.
// The operation is treated as a write (write locks are taken) when DB_RMW is
// set in `flag` or the cursor itself was created with DB_RMW.
static void
query_context_with_input_init(QUERY_CONTEXT_WITH_INPUT context, DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    // grab write locks if the DB_RMW flag is set or the cursor was created with the DB_RMW flag
    BOOL is_write = ((flag & DB_RMW) != 0) || dbc_struct_i(c)->rmw;
    query_context_base_init(&context->base, c, flag, is_write, f, extra);
    context->input_key = key;
    context->input_val = val;
}

3690 3691 3692 3693 3694
static int c_del_callback(DBT const *key, DBT const *val, void *extra);

//Delete whatever the cursor is pointing at.
// `flags` may contain DB_DELETE_ANY (forwarded to the brt layer) and
// prelocking flags; any other bit yields EINVAL.
// Unless the db has no lock tree or the caller passed DB_PRELOCKED_WRITE, a
// write lock on the current key is acquired first (via c_del_callback),
// waiting with the default timeout whenever the lock is not granted at once.
// Returns 0 on success or the first error encountered.
static int
toku_c_del(DBC * c, u_int32_t flags) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    u_int32_t unchecked_flags = flags;
    //DB_DELETE_ANY means delete regardless of whether it exists in the db.
    u_int32_t flag_for_brt = flags&DB_DELETE_ANY;
    unchecked_flags &= ~flag_for_brt;
    u_int32_t lock_flags = get_cursor_prelocked_flags(flags, c);
    unchecked_flags &= ~lock_flags;
    BOOL do_locking = (BOOL)(c->dbp->i->lt && !(lock_flags&DB_PRELOCKED_WRITE));

    int r = 0;
    if (unchecked_flags!=0)
        r = EINVAL;
    else {
        if (do_locking) {
            QUERY_CONTEXT_S context;
            query_context_init_write(&context, c, lock_flags, NULL, NULL);
            // Retry the lookup each time a wait on the lock request succeeds.
            while (r == 0) {
                //We do not need a read lock, we must already have it.
                r = toku_c_getf_current_binding(c, DB_PRELOCKED, c_del_callback, &context);
                if (r == DB_LOCK_NOTGRANTED)
                    r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt);
                else
                    break;
            }
            query_context_base_destroy(&context.base);
        }
        if (r==0) {
            //Do the actual delete.
            TOKUTXN txn = dbc_struct_i(c)->txn ? db_txn_struct_i(dbc_struct_i(c)->txn)->tokutxn : 0;
            r = toku_brt_cursor_delete(dbc_struct_i(c)->c, flag_for_brt, txn);
        }
    }
    return r;
}
3731

3732 3733 3734 3735 3736
// Callback run by toku_c_getf_current_binding() on behalf of toku_c_del():
// starts a write-lock request on the point `key` for the pair the cursor is
// bound to.  `extra` is the QUERY_CONTEXT set up by toku_c_del(); only its
// base part is used.  Returns the result of start_range_lock().
static int
c_del_callback(DBT const *key, DBT const *val, void *extra) {
    // toku_c_del() passes a QUERY_CONTEXT_S, so read `extra` as that type.
    QUERY_CONTEXT      super_context = extra;
    QUERY_CONTEXT_BASE context       = &super_context->base;

    int r;

    assert(context->do_locking);
    assert(context->is_write_op);
    assert(key!=NULL);
    assert(val!=NULL);

    //Lock:
    //  left(key,val)==right(key,val) == (key, val);
    r = start_range_lock(context->db, context->txn, key, key, LOCK_REQUEST_WRITE, &context->lock_request);

    //Give brt-layer an error (if any) to return from toku_c_getf_current_binding
    return r;
}

3753 3754
static int c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

3755 3756
// Initialize a cursor query context, choosing between the read and write
// variants: the query is a write op when DB_RMW is set in `flag` or the
// cursor was created with DB_RMW.
static void
c_query_context_init(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    BOOL is_write_op = FALSE;
    // grab write locks if the DB_RMW flag is set or the cursor was created with the DB_RMW flag
    if ((flag & DB_RMW) || dbc_struct_i(c)->rmw)
        is_write_op = TRUE;
    if (is_write_op)
        query_context_init_write(context, c, flag, f, extra);
    else
        query_context_init_read(context, c, flag, f, extra);
}

3767 3768 3769 3770 3771
// Tear down a context set up by c_query_context_init() by destroying its base part.
static void
c_query_context_destroy(QUERY_CONTEXT context) {
    QUERY_CONTEXT_BASE base = &context->base;
    query_context_base_destroy(base);
}

3772 3773 3774
//Run a "first" query on the cursor and deliver the result through f(key, val, extra).
//Returns the result of the query: the brt-layer result, the application
//callback's own return value if the callback failed, or the error from
//waiting on a lock request.
static int
toku_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    num_point_queries++;   // accountability
    QUERY_CONTEXT_S query; //Describes the context of this query.
    c_query_context_init(&query, c, flag, f, extra);
    int rc;
    for (;;) {
        //toku_brt_cursor_first will call c_getf_first_callback(..., query) (if query is successful)
        rc = toku_brt_cursor_first(dbc_struct_i(c)->c, c_getf_first_callback, &query);
        if (rc != DB_LOCK_NOTGRANTED) {
            //Final answer.  A callback failure is reported with the callback's own code.
            if (rc == TOKUDB_USER_CALLBACK_ERROR) {
                rc = query.base.r_user_callback;
            }
            break;
        }
        //The lock was not granted: wait for it and retry the query only if the wait succeeded.
        rc = toku_lock_request_wait_with_default_timeout(&query.base.lock_request, c->dbp->i->lt);
        if (rc != 0) {
            break;
        }
    }
    c_query_context_destroy(&query);
    return rc;
}

//Callback handed to toku_brt_cursor_first.
//key==NULL means that nothing was found.  The return value is the result of
//the query as seen by the brt layer: a lock error, the application
//callback's return value, or 0.
static int
c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;
    DBT found_key = { .data = (void *) key, .size = keylen };
    DBT found_val = { .data = (void *) val, .size = vallen };
    int rc = 0;

    if (base->do_locking) {
        //Lock from -infinity up to the found key (or up to +infinity if nothing was found).
        const DBT *lock_left  = toku_lt_neg_infinity;
        const DBT *lock_right = (key == NULL) ? toku_lt_infinity : &found_key;
        rc = start_range_lock(base->db, base->txn, lock_left, lock_right,
                              base->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &base->lock_request);
    }

    //Nothing to hand to the application if the lock failed or nothing was found.
    if (rc != 0 || key == NULL) {
        return rc;
    }

    //Record the application's result in the context; toku_c_getf_first reads it back
    //when the brt layer reports TOKUDB_USER_CALLBACK_ERROR.
    base->r_user_callback = base->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback;
}

static int c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Run a "last" query on the cursor and deliver the result through f(key, val, extra).
//Returns the result of the query: the brt-layer result, the application
//callback's own return value if the callback failed, or the error from
//waiting on a lock request.
static int
toku_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    num_point_queries++;   // accountability
    QUERY_CONTEXT_S query; //Describes the context of this query.
    c_query_context_init(&query, c, flag, f, extra);
    int rc;
    for (;;) {
        //toku_brt_cursor_last will call c_getf_last_callback(..., query) (if query is successful)
        rc = toku_brt_cursor_last(dbc_struct_i(c)->c, c_getf_last_callback, &query);
        if (rc != DB_LOCK_NOTGRANTED) {
            //Final answer.  A callback failure is reported with the callback's own code.
            if (rc == TOKUDB_USER_CALLBACK_ERROR) {
                rc = query.base.r_user_callback;
            }
            break;
        }
        //The lock was not granted: wait for it and retry the query only if the wait succeeded.
        rc = toku_lock_request_wait_with_default_timeout(&query.base.lock_request, c->dbp->i->lt);
        if (rc != 0) {
            break;
        }
    }
    c_query_context_destroy(&query);
    return rc;
}

//Callback handed to toku_brt_cursor_last.
//key==NULL means that nothing was found.  The return value is the result of
//the query as seen by the brt layer: a lock error, the application
//callback's return value, or 0.
static int
c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;
    DBT found_key = { .data = (void *) key, .size = keylen };
    DBT found_val = { .data = (void *) val, .size = vallen };
    int rc = 0;

    if (base->do_locking) {
        //Lock from the found key (or from -infinity if nothing was found) up to +infinity.
        const DBT *lock_left  = (key == NULL) ? toku_lt_neg_infinity : &found_key;
        const DBT *lock_right = toku_lt_infinity;
        rc = start_range_lock(base->db, base->txn, lock_left, lock_right,
                              base->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &base->lock_request);
    }

    //Nothing to hand to the application if the lock failed or nothing was found.
    if (rc != 0 || key == NULL) {
        return rc;
    }

    //Record the application's result in the context; toku_c_getf_last reads it back
    //when the brt layer reports TOKUDB_USER_CALLBACK_ERROR.
    base->r_user_callback = base->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback;
}

static int c_getf_next_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Advance the cursor and deliver the pair through f(key, val, extra).
//An uninitialized cursor is treated as a "first" query.
//Returns the result of the query: the brt-layer result, the application
//callback's own return value if the callback failed, or the error from
//waiting on a lock request.
static int
toku_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) {
        //No current position yet: "next" degenerates to "first".
        return toku_c_getf_first(c, flag, f, extra);
    }

    QUERY_CONTEXT_S query; //Describes the context of this query.
    c_query_context_init(&query, c, flag, f, extra);
    int rc;
    for (;;) {
        //toku_brt_cursor_next will call c_getf_next_callback(..., query) (if query is successful)
        rc = toku_brt_cursor_next(dbc_struct_i(c)->c, c_getf_next_callback, &query);
        if (rc != DB_LOCK_NOTGRANTED) {
            //Final answer.  A callback failure is reported with the callback's own code.
            if (rc == TOKUDB_USER_CALLBACK_ERROR) {
                rc = query.base.r_user_callback;
            }
            break;
        }
        //The lock was not granted: wait for it and retry the query only if the wait succeeded.
        rc = toku_lock_request_wait_with_default_timeout(&query.base.lock_request, c->dbp->i->lt);
        if (rc != 0) {
            break;
        }
    }
    c_query_context_destroy(&query);
    return rc;
}

//Callback handed to toku_brt_cursor_next.
//key==NULL means that nothing was found.  The return value is the result of
//the query as seen by the brt layer: a lock error, the application
//callback's return value, or 0.
static int
c_getf_next_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;
    DBT found_key = { .data = (void *) key, .size = keylen };
    DBT found_val = { .data = (void *) val, .size = vallen };
    int rc = 0;

    num_sequential_queries++;   // accountability

    if (base->do_locking) {
        //Lock from the key the cursor was on before this step up to the found key
        //(or up to +infinity if nothing was found).
        const DBT *prevkey, *prevval;
        toku_brt_cursor_peek(base->c, &prevkey, &prevval);
        const DBT *lock_right = (key == NULL) ? toku_lt_infinity : &found_key;
        rc = start_range_lock(base->db, base->txn, prevkey, lock_right,
                              base->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &base->lock_request);
    }

    //Nothing to hand to the application if the lock failed or nothing was found.
    if (rc != 0 || key == NULL) {
        return rc;
    }

    //Record the application's result in the context; toku_c_getf_next reads it back
    //when the brt layer reports TOKUDB_USER_CALLBACK_ERROR.
    base->r_user_callback = base->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback;
}

static int c_getf_prev_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Step the cursor backwards and deliver the pair through f(key, val, extra).
//An uninitialized cursor is treated as a "last" query.
//Returns the result of the query: the brt-layer result, the application
//callback's own return value if the callback failed, or the error from
//waiting on a lock request.
static int
toku_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) {
        //No current position yet: "prev" degenerates to "last".
        return toku_c_getf_last(c, flag, f, extra);
    }

    QUERY_CONTEXT_S query; //Describes the context of this query.
    c_query_context_init(&query, c, flag, f, extra);
    int rc;
    for (;;) {
        //toku_brt_cursor_prev will call c_getf_prev_callback(..., query) (if query is successful)
        rc = toku_brt_cursor_prev(dbc_struct_i(c)->c, c_getf_prev_callback, &query);
        if (rc != DB_LOCK_NOTGRANTED) {
            //Final answer.  A callback failure is reported with the callback's own code.
            if (rc == TOKUDB_USER_CALLBACK_ERROR) {
                rc = query.base.r_user_callback;
            }
            break;
        }
        //The lock was not granted: wait for it and retry the query only if the wait succeeded.
        rc = toku_lock_request_wait_with_default_timeout(&query.base.lock_request, c->dbp->i->lt);
        if (rc != 0) {
            break;
        }
    }
    c_query_context_destroy(&query);
    return rc;
}

3966 3967 3968 3969 3970 3971 3972
//Callback handed to toku_brt_cursor_prev.
//key==NULL means that nothing was found.  The return value is the result of
//the query as seen by the brt layer: a lock error, the application
//callback's return value, or 0.
static int
c_getf_prev_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;
    DBT found_key = { .data = (void *) key, .size = keylen };
    DBT found_val = { .data = (void *) val, .size = vallen };
    int rc = 0;

    num_sequential_queries++;   // accountability

    if (base->do_locking) {
        //Lock from the found key (or from -infinity if nothing was found) up to
        //the key the cursor was on before this step.
        const DBT *prevkey, *prevval;
        toku_brt_cursor_peek(base->c, &prevkey, &prevval);
        const DBT *lock_left = (key == NULL) ? toku_lt_neg_infinity : &found_key;
        rc = start_range_lock(base->db, base->txn, lock_left, prevkey,
                              base->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &base->lock_request);
    }

    //Nothing to hand to the application if the lock failed or nothing was found.
    if (rc != 0 || key == NULL) {
        return rc;
    }

    //Record the application's result in the context; toku_c_getf_prev reads it back
    //when the brt layer reports TOKUDB_USER_CALLBACK_ERROR.
    base->r_user_callback = base->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback;
}

3997 3998 3999 4000 4001
static int c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Deliver the pair at the cursor's current position (DB_CURRENT) through f(key, val, extra).
//Returns the brt-layer result, or the application callback's own return
//value if the callback failed.  There is no lock-wait retry here.
static int
toku_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_S query; //Describes the context of this query.
    num_sequential_queries++;   // accountability
    c_query_context_init(&query, c, flag, f, extra);

    //toku_brt_cursor_current will call c_getf_current_callback(..., query) (if query is successful)
    int rc = toku_brt_cursor_current(dbc_struct_i(c)->c, DB_CURRENT, c_getf_current_callback, &query);
    if (rc == TOKUDB_USER_CALLBACK_ERROR) {
        //Report the callback's own error code rather than the generic marker.
        rc = query.base.r_user_callback;
    }

    c_query_context_destroy(&query);
    return rc;
}

//Callback handed to toku_brt_cursor_current.
//No locking is done here.  If a pair was found (key!=NULL) it is passed to
//the application callback and that callback's result is returned (and
//recorded in the context); otherwise 0 is returned.
static int
c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT      query = extra;
    QUERY_CONTEXT_BASE base  = &query->base;

    //Nothing found: nothing to deliver.
    if (key == NULL) {
        return 0;
    }

    DBT found_key = { .data = (void *) key, .size = keylen };
    DBT found_val = { .data = (void *) val, .size = vallen };

    //Give brt-layer the application's result (if any) to return from toku_brt_cursor_current
    base->r_user_callback = base->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback;
}

//Same as toku_c_getf_current, except that the brt cursor is queried with
//DB_CURRENT_BINDING instead of DB_CURRENT.
//Returns the brt-layer result, or the application callback's own return
//value if the callback failed.
static int
toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_S query; //Describes the context of this query.
    num_sequential_queries++;   // accountability
    c_query_context_init(&query, c, flag, f, extra);

    //toku_brt_cursor_current will call c_getf_current_callback(..., query) (if query is successful)
    int rc = toku_brt_cursor_current(dbc_struct_i(c)->c, DB_CURRENT_BINDING, c_getf_current_callback, &query);
    if (rc == TOKUDB_USER_CALLBACK_ERROR) {
        //Report the callback's own error code rather than the generic marker.
        rc = query.base.r_user_callback;
    }

    c_query_context_destroy(&query);
    return rc;
}

static int c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Look up key with the cursor and deliver the pair through f(key, val, extra).
//Returns the result of the query: the brt-layer result, the application
//callback's own return value if the callback failed, or the error from
//waiting on a lock request.
static int
toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S query; //Describes the context of this query.
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, NULL, f, extra);
    int rc;
    for (;;) {
        //toku_brt_cursor_set will call c_getf_set_callback(..., query) (if query is successful)
        rc = toku_brt_cursor_set(dbc_struct_i(c)->c, key, c_getf_set_callback, &query);
        if (rc != DB_LOCK_NOTGRANTED) {
            //Final answer.  A callback failure is reported with the callback's own code.
            if (rc == TOKUDB_USER_CALLBACK_ERROR) {
                rc = query.base.r_user_callback;
            }
            break;
        }
        //The lock was not granted: wait for it and retry the query only if the wait succeeded.
        rc = toku_lock_request_wait_with_default_timeout(&query.base.lock_request, c->dbp->i->lt);
        if (rc != 0) {
            break;
        }
    }
    query_context_base_destroy(&query.base);
    return rc;
}

//Callback handed to toku_brt_cursor_set.
//key==NULL means that nothing was found.  The return value is the result of
//the query as seen by the brt layer: a lock error, the application
//callback's return value, or 0.
static int
c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;
    DBT found_key = { .data = (void *) key, .size = keylen };
    DBT found_val = { .data = (void *) val, .size = vallen };
    int rc = 0;

    //Lock:
    //  left(key) == right(key) == input_key
    //The lock covers exactly the key that was asked for, whether or not it was found.
    if (base->do_locking) {
        rc = start_range_lock(base->db, base->txn, query->input_key, query->input_key,
                              base->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &base->lock_request);
    }

    //Nothing to hand to the application if the lock failed or nothing was found.
    if (rc != 0 || key == NULL) {
        return rc;
    }

    //Record the application's result in the context; toku_c_getf_set reads it back
    //when the brt layer reports TOKUDB_USER_CALLBACK_ERROR.
    base->r_user_callback = base->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback;
}

static int c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Run a "set range" query for key and deliver the pair through f(key, val, extra).
//Returns the result of the query: the brt-layer result, the application
//callback's own return value if the callback failed, or the error from
//waiting on a lock request.
static int
toku_c_getf_set_range(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S query; //Describes the context of this query.
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, NULL, f, extra);
    int rc;
    for (;;) {
        //toku_brt_cursor_set_range will call c_getf_set_range_callback(..., query) (if query is successful)
        rc = toku_brt_cursor_set_range(dbc_struct_i(c)->c, key, c_getf_set_range_callback, &query);
        if (rc != DB_LOCK_NOTGRANTED) {
            //Final answer.  A callback failure is reported with the callback's own code.
            if (rc == TOKUDB_USER_CALLBACK_ERROR) {
                rc = query.base.r_user_callback;
            }
            break;
        }
        //The lock was not granted: wait for it and retry the query only if the wait succeeded.
        rc = toku_lock_request_wait_with_default_timeout(&query.base.lock_request, c->dbp->i->lt);
        if (rc != 0) {
            break;
        }
    }
    query_context_base_destroy(&query.base);
    return rc;
}

//Callback handed to toku_brt_cursor_set_range.
//key==NULL means that nothing was found.  The return value is the result of
//the query as seen by the brt layer: a lock error, the application
//callback's return value, or 0.
static int
c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;
    DBT found_key = { .data = (void *) key, .size = keylen };
    DBT found_val = { .data = (void *) val, .size = vallen };
    int rc = 0;

    //Lock:
    //  left(key)  = input_key
    //  right(key) = found ? found_key : infinity
    if (base->do_locking) {
        const DBT *lock_left  = query->input_key;
        const DBT *lock_right = (key == NULL) ? toku_lt_infinity : &found_key;
        rc = start_range_lock(base->db, base->txn, lock_left, lock_right,
                              base->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &base->lock_request);
    }

    //Nothing to hand to the application if the lock failed or nothing was found.
    if (rc != 0 || key == NULL) {
        return rc;
    }

    //Record the application's result in the context; toku_c_getf_set_range reads it back
    //when the brt layer reports TOKUDB_USER_CALLBACK_ERROR.
    base->r_user_callback = base->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback;
}

4163 4164 4165 4166 4167 4168 4169
static int c_getf_set_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra);

//Run a reverse "set range" query for key and deliver the pair through f(key, val, extra).
//Returns the result of the query: the brt-layer result, the application
//callback's own return value if the callback failed, or the error from
//waiting on a lock request.
static int
toku_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S query; //Describes the context of this query.
    num_point_queries++;   // accountability
    query_context_with_input_init(&query, c, flag, key, NULL, f, extra);
    int rc;
    for (;;) {
        //toku_brt_cursor_set_range_reverse will call c_getf_set_range_reverse_callback(..., query) (if query is successful)
        rc = toku_brt_cursor_set_range_reverse(dbc_struct_i(c)->c, key, c_getf_set_range_reverse_callback, &query);
        if (rc != DB_LOCK_NOTGRANTED) {
            //Final answer.  A callback failure is reported with the callback's own code.
            if (rc == TOKUDB_USER_CALLBACK_ERROR) {
                rc = query.base.r_user_callback;
            }
            break;
        }
        //The lock was not granted: wait for it and retry the query only if the wait succeeded.
        rc = toku_lock_request_wait_with_default_timeout(&query.base.lock_request, c->dbp->i->lt);
        if (rc != 0) {
            break;
        }
    }
    query_context_base_destroy(&query.base);
    return rc;
}

//Callback handed to toku_brt_cursor_set_range_reverse.
//key==NULL means that nothing was found.  The return value is the result of
//the query as seen by the brt layer: a lock error, the application
//callback's return value, or 0.
static int
c_getf_set_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra) {
    QUERY_CONTEXT_WITH_INPUT query = extra;
    QUERY_CONTEXT_BASE       base  = &query->base;
    DBT found_key = { .data = (void *) key, .size = keylen };
    DBT found_val = { .data = (void *) val, .size = vallen };
    int rc = 0;

    //Lock:
    //  left(key)  = found ? found_key : -infinity
    //  right(key) = input_key
    if (base->do_locking) {
        const DBT *lock_left  = (key == NULL) ? toku_lt_neg_infinity : &found_key;
        const DBT *lock_right = query->input_key;
        rc = start_range_lock(base->db, base->txn, lock_left, lock_right,
                              base->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &base->lock_request);
    }

    //Nothing to hand to the application if the lock failed or nothing was found.
    if (rc != 0 || key == NULL) {
        return rc;
    }

    //Record the application's result in the context; toku_c_getf_set_range_reverse reads it back
    //when the brt layer reports TOKUDB_USER_CALLBACK_ERROR.
    base->r_user_callback = base->f(&found_key, &found_val, base->f_extra);
    return base->r_user_callback;
}

4221
//Close a cursor and release everything it owns.
//Returns the result of closing the underlying brt cursor; the cursor's
//memory is released regardless of that result.
static int
toku_c_close(DBC * c) {
    HANDLE_PANICKED_DB(c->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    int close_result = toku_brt_cursor_close(dbc_struct_i(c)->c);

    //Release the key and value buffers held in the cursor.
    toku_sdbt_cleanup(&dbc_struct_i(c)->skey_s);
    toku_sdbt_cleanup(&dbc_struct_i(c)->sval_s);
#if !TOKUDB_NATIVE_H
    //In this configuration the internal struct is freed separately from the DBC itself.
    toku_free(dbc_struct_i(c));
#endif
    toku_free(c);

    return close_result;
}

4234 4235 4236 4237
// Return the number of entries whose key matches the key currently 
// pointed to by the brt cursor.  
// On success returns 0 and sets *count to 1 (the key exists) or 0 (the key
// was not found).  Returns EINVAL if any flag other than the prelocked
// flags is set, or the error from reading the current key, opening the
// temporary cursor, or performing the lookup.  A lookup failure other than
// DB_NOTFOUND (for example a lock that could not be obtained) is returned
// to the caller instead of being reported as a successful count of 0.
static int 
toku_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags) {
    HANDLE_PANICKED_DB(cursor->dbp);
    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(cursor);
    int r;
    DBC *count_cursor = 0;
    DBT currentkey;

    init_dbt_realloc(&currentkey);
    u_int32_t lock_flags = get_cursor_prelocked_flags(flags, cursor);
    flags &= ~lock_flags;
    if (flags != 0) {
        r = EINVAL; goto finish;
    }

    r = toku_c_get_current_unconditional(cursor, lock_flags, &currentkey, NULL);
    if (r != 0) goto finish;

    //TODO: Optimization
    //if (do_locking) {
    //   do a lock from currentkey,-infinity to currentkey,infinity
    //   lock_flags |= DB_PRELOCKED
    //}

    // Use a separate cursor for the lookup so that the caller's cursor is not used for it.
    r = toku_db_cursor(cursor->dbp, dbc_struct_i(cursor)->txn, &count_cursor, DBC_DISABLE_PREFETCHING, 0);
    if (r != 0) goto finish;

    r = toku_c_getf_set(count_cursor, lock_flags, &currentkey, ydb_getf_do_nothing, NULL);
    if (r == 0) {
        *count = 1; // there is a key, so the count is one (since we don't have DUP dbs anymore, the only answers are 0 or 1.
    } else {
        *count = 0;
        // Only "not found" is a legitimate count of zero; any other error is propagated.
        if (r == DB_NOTFOUND) r = 0;
    }
finish:
    if (currentkey.data) toku_free(currentkey.data);
    if (count_cursor) {
        int rr = toku_c_close(count_cursor); assert(rr == 0);
    }
    return r;
}

Yoni Fogel's avatar
 
Yoni Fogel committed
4278

4279 4280
///////////
//db_getf_XXX is equivalent to c_getf_XXX, without a persistent cursor
Yoni Fogel's avatar
 
Yoni Fogel committed
4281

4282 4283
//Point query without a persistent cursor: open a temporary cursor, run
//toku_c_getf_set on it and close it again.
//Returns the error from opening the cursor, otherwise the result of the
//query, otherwise (if the query succeeded) the result of closing the cursor.
static int
db_getf_set(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    //The isolation flags (and DB_RMW) go to cursor creation; the isolation
    //flags are stripped from what the query itself receives.
    uint32_t cursor_flags = (flags & (DB_ISOLATION_FLAGS | DB_RMW)) | DBC_DISABLE_PREFETCHING;
    u_int32_t query_flags = flags & ~DB_ISOLATION_FLAGS;

    DBC *c;
    int open_result = toku_db_cursor(db, txn, &c, cursor_flags, 1);
    if (open_result != 0) {
        return open_result;
    }

    int query_result = toku_c_getf_set(c, query_flags, key, f, extra);
    int close_result = toku_c_close(c);
    //A query error takes precedence over a close error.
    return (query_result != 0) ? query_result : close_result;
}

4298 4299
//Delete key from db within txn.
//flags may contain DB_DELETE_ANY and the prelocked flags; anything else is EINVAL.
//Without DB_DELETE_ANY the key is looked up first and the lookup's error is
//returned if that fails.
//Every call is counted in num_deletes (returned 0) or num_deletes_fail (anything else).
static int
toku_db_del(DB *db, DB_TXN *txn, DBT *key, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    int r;
    u_int32_t lock_flags = get_prelocked_flags(flags);
    //Whatever remains after removing the flags understood here is illegal.
    u_int32_t unknown_flags = flags & ~DB_DELETE_ANY & ~lock_flags;

    if (unknown_flags != 0) {
        r = EINVAL;
        goto done;
    }
    if (!(lock_flags & DB_PRELOCKED_FILE_READ)) {
        r = toku_grab_read_lock_on_directory(db, txn);
        if (r != 0) goto done;
    }
    //DB_DELETE_ANY means delete regardless of whether it exists in the db.
    if (!(flags & DB_DELETE_ANY)) {
        //Check if the key exists in the db.
        r = db_getf_set(db, txn, lock_flags|DB_SERIALIZABLE|DB_RMW, key, ydb_getf_do_nothing, NULL);
        if (r != 0) goto done;
    }
    if (db->i->lt && !(lock_flags & DB_PRELOCKED_WRITE)) {
        //Do locking if necessary.
        r = get_point_write_lock(db, txn, key);
        if (r != 0) goto done;
    }
    //Do the actual deleting.
    r = toku_brt_delete(db->i->brt, key, txn ? db_txn_struct_i(txn)->tokutxn : 0);

done:
    if (r == 0) {
        num_deletes++;       // accountability 
    } else {
        num_deletes_fail++;
    }
    return r;
}

//Log the deletion of key from a single brt in the given transaction.
//Returns the result of toku_brt_log_del.
static int
log_del_single(DB_TXN *txn, BRT brt, const DBT *key) {
    return toku_brt_log_del(db_txn_struct_i(txn)->tokutxn, brt, key);
}

4347 4348 4349 4350 4351 4352 4353 4354
//Return the sum over the first num_keys entries of keys of (size + overhead).
//The sum is accumulated in a uint32_t.
static uint32_t
sum_size(uint32_t num_keys, DBT keys[], uint32_t overhead) {
    uint32_t total = 0;
    uint32_t remaining = num_keys;
    const DBT *k = keys;
    while (remaining > 0) {
        total += k->size + overhead;
        k++;
        remaining--;
    }
    return total;
}

4355
static int
4356
log_del_multiple(DB_TXN *txn, DB *src_db, const DBT *key, const DBT *val, uint32_t num_dbs, BRT brts[], DBT keys[]) {
4357 4358 4359 4360
    int r = 0;
    if (num_dbs > 0) {
        TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
        BRT src_brt  = src_db ? src_db->i->brt : NULL;
Rich Prohaska's avatar
Rich Prohaska committed
4361 4362
        uint32_t del_multiple_size = key->size + val->size + num_dbs*sizeof (uint32_t) + toku_log_enq_delete_multiple_overhead;
        uint32_t del_single_sizes = sum_size(num_dbs, keys, toku_log_enq_delete_any_overhead);
4363 4364 4365 4366 4367 4368
        if (del_single_sizes < del_multiple_size) {
            for (uint32_t i = 0; r == 0 && i < num_dbs; i++)
                r = log_del_single(txn, brts[i], &keys[i]);
        } else {
            r = toku_brt_log_del_multiple(ttxn, src_brt, brts, num_dbs, key, val);
        }
4369 4370 4371 4372
    }
    return r;
}

4373 4374 4375 4376 4377 4378 4379 4380 4381
//Return the index of the first entry of db_array that equals src_db,
//or num_dbs if there is no such entry.
static uint32_t 
lookup_src_db(uint32_t num_dbs, DB *db_array[], DB *src_db) {
    uint32_t idx = 0;
    while (idx < num_dbs && db_array[idx] != src_db) {
        idx++;
    }
    return idx;
}

4382
static int
4383 4384
do_del_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT keys[], DB *src_db, const DBT *src_key) {
    src_db = src_db; src_key = src_key;
4385 4386 4387 4388
    int r = 0;
    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
    for (uint32_t which_db = 0; r == 0 && which_db < num_dbs; which_db++) {
        DB *db = db_array[which_db];
4389 4390 4391

        // if db is being indexed by an indexer, then insert a delete message into the db if the src key is to the left or equal to the 
        // indexers cursor.  we have to get the src_db from the indexer and find it in the db_array.
4392 4393
	int do_delete = TRUE;
	DB_INDEXER *indexer = toku_db_get_indexer(db);
4394
	if (indexer) { // if this db is the index under construction
4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405
            DB *indexer_src_db = toku_indexer_get_src_db(indexer);
            invariant(indexer_src_db != NULL);
            const DBT *indexer_src_key;
            if (src_db == indexer_src_db)
                indexer_src_key = src_key;
            else {
                uint32_t which_src_db = lookup_src_db(num_dbs, db_array, indexer_src_db);
                invariant(which_src_db < num_dbs);
                indexer_src_key = &keys[which_src_db];
            }
            do_delete = !toku_indexer_is_key_right_of_le_cursor(indexer, indexer_src_db, indexer_src_key);
4406 4407 4408 4409
        }
	if (r == 0 && do_delete) {
            r = toku_brt_maybe_delete(db->i->brt, &keys[which_db], ttxn, FALSE, ZERO_LSN, FALSE);
        }
Zardosht Kasheff's avatar
Zardosht Kasheff committed
4410
    }
4411
    return r;
Rich Prohaska's avatar
Rich Prohaska committed
4412 4413
}

4414
static int
4415 4416 4417 4418
env_del_multiple(
    DB_ENV *env, 
    DB *src_db, 
    DB_TXN *txn, 
4419 4420
    const DBT *src_key, 
    const DBT *src_val, 
4421 4422 4423
    uint32_t num_dbs, 
    DB **db_array, 
    DBT *keys, 
4424
    uint32_t *flags_array) 
4425
{
4426
    int r;
4427
    DBT del_keys[num_dbs];
4428 4429 4430 4431

    HANDLE_PANICKED_ENV(env);

    if (!txn) {
4432 4433 4434
        r = EINVAL;
        goto cleanup;
    }
4435
    if (!env->i->generate_row_for_del) {
4436 4437 4438 4439
        r = EINVAL;
        goto cleanup;
    }

4440 4441 4442 4443 4444 4445 4446 4447
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);

    {
    uint32_t lock_flags[num_dbs];
    uint32_t remaining_flags[num_dbs];
    BRT brts[num_dbs];

    for (uint32_t which_db = 0; which_db < num_dbs; which_db++) {
4448
        DB *db = db_array[which_db];
4449 4450
        lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]);
        remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db];
4451

4452 4453 4454 4455 4456
        //Do locking if necessary.
        if (!(lock_flags[which_db] & DB_PRELOCKED_FILE_READ)) {
            r = toku_grab_read_lock_on_directory(db, txn);
            if (r != 0) goto cleanup;
        }
4457
        if (db == src_db) {
4458
            del_keys[which_db] = *src_key;
4459 4460
        }
        else {
4461
        //Generate the key
4462
            r = env->i->generate_row_for_del(db, src_db, &keys[which_db], src_key, src_val);
4463 4464 4465
            if (r != 0) goto cleanup;
            del_keys[which_db] = keys[which_db];
        }
4466 4467 4468 4469 4470 4471 4472 4473

        if (remaining_flags[which_db] & ~DB_DELETE_ANY) {
            r = EINVAL;
            goto cleanup;
        }
        BOOL error_if_missing = (BOOL)(!(remaining_flags[which_db]&DB_DELETE_ANY));
        if (error_if_missing) {
            //Check if the key exists in the db.
4474
            r = db_getf_set(db, txn, lock_flags[which_db]|DB_SERIALIZABLE|DB_RMW, &del_keys[which_db], ydb_getf_do_nothing, NULL);
4475
            if (r != 0) goto cleanup;
4476 4477 4478 4479 4480
        }

        //Do locking if necessary.
        if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) {
            //Needs locking
4481
            r = get_point_write_lock(db, txn, &del_keys[which_db]);
4482
            if (r != 0) goto cleanup;
4483 4484 4485
        }
        brts[which_db] = db->i->brt;
    }
4486 4487

    if (num_dbs == 1)
4488
        r = log_del_single(txn, brts[0], &del_keys[0]);
4489
    else
4490
        r = log_del_multiple(txn, src_db, src_key, src_val, num_dbs, brts, del_keys);
4491 4492

    if (r == 0) 
4493
        r = do_del_multiple(txn, num_dbs, db_array, del_keys, src_db, src_key);
4494 4495 4496
    }

cleanup:
4497 4498 4499 4500
    if (r == 0)
        num_multi_deletes += num_dbs;
    else
        num_multi_deletes_fail += num_dbs;
4501 4502 4503 4504
    return r;
}


4505 4506
// Cursor "get" entry point installed on DBC handles: calls toku_c_get()
// between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int 
locked_c_get(DBC * c, DBT * key, DBT * data, u_int32_t flag) {
    int rval;

    toku_ydb_lock();
    rval = toku_c_get(c, key, data, flag);
    toku_ydb_unlock();
    return rval;
}

4511 4512
// Cursor "close" entry point installed on DBC handles: calls toku_c_close()
// between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int 
locked_c_close(DBC * c) {
    int rval;

    toku_ydb_lock();
    rval = toku_c_close(c);
    toku_ydb_unlock();
    return rval;
}

4516 4517
// Cursor "count" entry point installed on DBC handles: calls toku_c_count()
// between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int 
locked_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags) {
    int rval;

    toku_ydb_lock();
    rval = toku_c_count(cursor, count, flags);
    toku_ydb_unlock();
    return rval;
}

4521 4522
// Cursor "delete" entry point installed on DBC handles: calls toku_c_del()
// between toku_ydb_lock() and toku_ydb_unlock() and returns its result.
static int 
locked_c_del(DBC * c, u_int32_t flags) {
    int rval;

    toku_ydb_lock();
    rval = toku_c_del(c, flags);
    toku_ydb_unlock();
    return rval;
}

4526
static int locked_c_pre_acquire_range_lock(DBC *dbc, const DBT *key_left, const DBT *key_right);
4527

4528 4529
// Create a cursor on db for txn and hand it back through *c.
//   flags: any combination of DB_SERIALIZABLE, DB_INHERIT_ISOLATION, DB_RMW and
//          DBC_DISABLE_PREFETCHING; any other bit is reported as EINVAL via
//          toku_ydb_do_error().
//   is_temporary_cursor: when nonzero the cursor borrows the db's skey/sval
//          buffers (db->i->skey, db->i->sval) instead of its own skey_s/sval_s.
// Returns 0 and stores the cursor in *c on success.  When toku_brt_cursor()
// fails, the freshly allocated cursor is freed, *c is not written and the
// error is returned.
static int 
toku_db_cursor(DB * db, DB_TXN * txn, DBC ** c, u_int32_t flags, int is_temporary_cursor) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    DB_ENV* env = db->dbenv;
    const u_int32_t allowed_flags = DB_SERIALIZABLE | DB_INHERIT_ISOLATION | DB_RMW | DBC_DISABLE_PREFETCHING;
    int r;

    if ((flags & ~allowed_flags) != 0) {
        return toku_ydb_do_error(env, EINVAL, "Invalid flags set for toku_db_cursor\n");
    }

    r = toku_grab_read_lock_on_directory(db, txn);
    if (r != 0) {
        return r;
    }

    struct __toku_dbc_external *XMALLOC(eresult); // so the internal stuff is stuck on the end
    memset(eresult, 0, sizeof(*eresult));
    DBC *result = &eresult->external_part;

    // Install the locked_* wrappers as the cursor's method table.
#define SCRS(name) result->name = locked_ ## name
    SCRS(c_get);
    SCRS(c_close);
    SCRS(c_del);
    SCRS(c_count);
    SCRS(c_getf_first);
    SCRS(c_getf_last);
    SCRS(c_getf_next);
    SCRS(c_getf_prev);
    SCRS(c_getf_current);
    SCRS(c_getf_current_binding);
    SCRS(c_getf_set);
    SCRS(c_getf_set_range);
    SCRS(c_getf_set_range_reverse);
    SCRS(c_pre_acquire_range_lock);
#undef SCRS

#if !TOKUDB_NATIVE_H
    MALLOC(result->i); // otherwise it is allocated as part of result->ii
    assert(result->i);
#endif
    result->dbp = db;

    dbc_struct_i(result)->txn = txn;
    dbc_struct_i(result)->skey_s = (struct simple_dbt){0,0};
    dbc_struct_i(result)->sval_s = (struct simple_dbt){0,0};
    if (is_temporary_cursor) {
        dbc_struct_i(result)->skey = &db->i->skey;
        dbc_struct_i(result)->sval = &db->i->sval;
    } else {
        dbc_struct_i(result)->skey = &dbc_struct_i(result)->skey_s;
        dbc_struct_i(result)->sval = &dbc_struct_i(result)->sval_s;
    }

    // Isolation: explicit DB_SERIALIZABLE or no txn => serializable,
    // otherwise whatever isolation the txn carries.
    if ((flags & DB_SERIALIZABLE) || !txn) {
        dbc_struct_i(result)->iso = TOKU_ISO_SERIALIZABLE;
    } else {
        dbc_struct_i(result)->iso = db_txn_struct_i(txn)->iso;
    }
    dbc_struct_i(result)->rmw = (flags & DB_RMW) != 0;

    // Snapshot reads are only requested when there is a txn.
    BOOL is_snapshot_read = (BOOL)(txn &&
                                   (dbc_struct_i(result)->iso == TOKU_ISO_READ_COMMITTED ||
                                    dbc_struct_i(result)->iso == TOKU_ISO_SNAPSHOT));

    r = toku_brt_cursor(
        db->i->brt,
        &dbc_struct_i(result)->c,
        txn ? db_txn_struct_i(txn)->tokutxn : NULL,
        is_snapshot_read,
        ((flags & DBC_DISABLE_PREFETCHING) != 0)
        );
    assert(r == 0 || r == TOKUDB_MVCC_DICTIONARY_TOO_NEW);
    if (r != 0) {
#if !TOKUDB_NATIVE_H
        toku_free(result->i); // otherwise it is allocated as part of result->ii
#endif
        toku_free(result);
        return r;
    }

    *c = result;
    return r;
}

4615 4616
// Returns 1 when dbt->flags has none of DB_DBT_MALLOC, DB_DBT_REALLOC or
// DB_DBT_USERMEM set, 0 otherwise.
static inline int 
db_thread_need_flags(DBT *dbt) {
    const u_int32_t memory_flags = DB_DBT_MALLOC+DB_DBT_REALLOC+DB_DBT_USERMEM;
    return !(dbt->flags & memory_flags);
}

4620 4621
// Point lookup of key in db, implemented with a temporary cursor positioned
// by DB_SET.
//   flags: only isolation flags (DB_ISOLATION_FLAGS) and the prelocked flags
//          (DB_PRELOCKED, DB_PRELOCKED_WRITE, DB_PRELOCKED_FILE_READ) are accepted.
// Returns EINVAL for any other flag, or when the db was opened with DB_THREAD
// and data carries none of the DBT memory-management flags.  Otherwise returns
// the result of the cursor get, or the cursor-close error if the get succeeded.
static int 
toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    const u_int32_t prelocked_mask = DB_PRELOCKED | DB_PRELOCKED_WRITE | DB_PRELOCKED_FILE_READ;
    const u_int32_t iso_flags  = flags & (DB_ISOLATION_FLAGS);
    const u_int32_t lock_flags = flags & prelocked_mask;

    if ((db->i->open_flags & DB_THREAD) && db_thread_need_flags(data)) {
        return EINVAL;
    }

    // Anything left after removing lock and isolation flags is unsupported.
    // And DB_GET_BOTH is no longer supported. #2862.
    if ((flags & ~(lock_flags | iso_flags)) != 0) {
        return EINVAL;
    }

    DBC *dbc;
    int r = toku_db_cursor(db, txn, &dbc, iso_flags | DBC_DISABLE_PREFETCHING, 1);
    if (r != 0) {
        return r;
    }

    r = toku_c_get(dbc, key, data, DB_SET | lock_flags);
    int close_r = toku_c_close(dbc);
    if (r != 0) {
        return r;
    }
    return close_r;
}

Rich Prohaska's avatar
Rich Prohaska committed
4646
#if 0
// Compiled out.  Unimplemented stub: after the usual handle checks it calls
// toku_ydb_barf() and aborts.
static int 
toku_db_key_range(DB * db, DB_TXN * txn, DBT * dbt, DB_KEY_RANGE * kr, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    // Parameters are intentionally unused.
    (void)txn;
    (void)dbt;
    (void)kr;
    (void)flags;
    toku_ydb_barf();
    abort();
}
#endif
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
4656

4657 4658
// Panic the environment of db because of lock tree error r (which must be
// nonzero).  Negative values are translated with toku_lt_strerror(); positive
// values get a generic message.  Returns the value of toku_ydb_do_error()
// for the same error and message.
static int 
toku_db_lt_panic(DB* db, int r) {
    assert(r!=0);
    assert(db && db->i && db->dbenv && db->dbenv->i);

    DB_ENV* env = db->dbenv;
    char * panic_string = (r < 0) ? toku_lt_strerror((TOKU_LT_ERROR)r)
                                  : "Error in locktree.\n";

    env_panic(env, r, panic_string);
    return toku_ydb_do_error(env, r, "%s", panic_string);
}

4672 4673
// Record lock tree lt in txn's lth table (db_txn_struct_i(txn)->lth).
// If lt is already recorded nothing changes; otherwise it is inserted and a
// reference is taken with toku_lt_add_ref().
// Returns 0 on success or the error from toku_lth_insert().
static int 
toku_txn_add_lt(DB_TXN* txn, toku_lock_tree* lt) {
    assert(txn && lt);
    toku_lth* lth = db_txn_struct_i(txn)->lth;
    assert(lth);

    toku_lock_tree* find = toku_lth_find(lth, lt);
    if (find) {
        // Already registered with this txn: no extra reference is taken.
        assert(find == lt);
        return 0;
    }

    int r = toku_lth_insert(lth, lt);
    if (r != 0) {
        return r;
    }

    toku_lt_add_ref(lt);
    return 0;
}

4694 4695
// Returns the key comparison function stored in db's brt (compare_fun).
static toku_dbt_cmp 
toku_db_get_compare_fun(DB* db) {
    BRT brt = db->i->brt;
    return brt->compare_fun;
}
Yoni Fogel committed
4698

4699

4700 4701 4702 4703 4704
// Open a named sub-database by joining fname and dbname into the single name
// "fname/dbname" and passing it to toku_db_open() with a NULL dbname.
// Returns EINVAL if either fname or dbname is NULL.
static int
db_open_subdb(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    if (fname == NULL || dbname == NULL) {
        return EINVAL;
    }

    // sizeof("/") counts the separator plus the terminating NUL.
    char subdb_full_name[strlen(fname) + sizeof("/") + strlen(dbname)];
    int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s", fname, dbname);
    assert(bytes==(int)sizeof(subdb_full_name)-1);

    const char *null_subdbname = NULL;
    return toku_db_open(db, txn, subdb_full_name, null_subdbname, dbtype, flags, mode);
}
4713

4714 4715 4716 4717 4718 4719 4720 4721
// Derive an iname hint from dname.
//Requires: size of hint array must be > strlen(dname)
//Copy alphanumeric characters only.
//Replace strings of non-alphanumeric characters with a single underscore.
// The result written to hint is always NUL-terminated.
static void
create_iname_hint(const char *dname, char *hint) {
    int underscored = 0;   // nonzero while inside a run that already produced '_'
    for (; *dname; dname++) {
        // <ctype.h> classifiers are only defined for values representable as
        // unsigned char (or EOF); a plain char may be negative for bytes >= 0x80.
        unsigned char ch = (unsigned char)*dname;
        if (isalnum(ch)) {
            *hint++ = (char)ch;
            underscored = 0;
        }
        else {
            if (!underscored)
                *hint++ = '_';
            underscored = 1;
        }
    }
    *hint = '\0';
}

4736

4737
// n < 0  means to ignore mark and ignore n
4738
// n >= 0 means to include mark ("_B_" or "_P_") with hex value of n in iname
4739
// (intended for use by loader, which will create many inames using one txnid).
4740
static char *
4741
create_iname(DB_ENV *env, u_int64_t id, char *hint, char *mark, int n) {
4742
    int bytes;
4743
    char inamebase[strlen(hint) +
4744 4745 4746
		   8 +  // hex file format version
		   16 + // hex id (normally the txnid)
		   8  + // hex value of n if non-neg
4747
		   sizeof("_B___.tokudb")]; // extra pieces
4748
    if (n < 0)
4749 4750 4751
	bytes = snprintf(inamebase, sizeof(inamebase),
                         "%s_%"PRIx64"_%"PRIx32            ".tokudb",
                         hint, id, BRT_LAYOUT_VERSION);
4752 4753
    else {
	invariant(strlen(mark) == 1);
4754
	bytes = snprintf(inamebase, sizeof(inamebase),
4755 4756 4757
                         "%s_%"PRIx64"_%"PRIx32"_%s_%"PRIx32".tokudb",
                         hint, id, BRT_LAYOUT_VERSION, mark, n);
    }
4758 4759 4760 4761
    assert(bytes>0);
    assert(bytes<=(int)sizeof(inamebase)-1);
    char *rval;
    if (env->i->data_dir)
Yoni Fogel's avatar
Yoni Fogel committed
4762
        rval = toku_construct_full_name(2, env->i->data_dir, inamebase);
4763
    else
Yoni Fogel's avatar
Yoni Fogel committed
4764
        rval = toku_construct_full_name(1, inamebase);
4765 4766
    assert(rval);
    return rval;
4767 4768
}

4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781

static int db_open_iname(DB * db, DB_TXN * txn, const char *iname, u_int32_t flags, int mode);


// inames are created here.
// algorithm:
//  begin txn
//  convert dname to iname (possibly creating new iname)
//  open file (toku_brt_open() will handle logging)
//  close txn
//  if created a new iname, take full range lock
//
// Parameters:
//   fname/dbname: when dbname is non-NULL the call is forwarded to
//                 db_open_subdb(); otherwise fname is used as the dname.
//   dbtype:       must be DB_BTREE or DB_UNKNOWN.
//   flags:        DB_EXCL, DB_CREATE, DB_IS_HOT_INDEX, DB_THREAD and the
//                 isolation flags DB_READ_UNCOMMITTED, DB_READ_COMMITTED,
//                 DB_SERIALIZABLE are accepted; anything else is EINVAL.
// Returns 0 on success; EINVAL for bad arguments (including a NULL fname) or
// an already-open handle; ENOENT if the dname is not in the directory and
// DB_CREATE was not given; EEXIST if it exists and DB_EXCL was given;
// otherwise the error from the failing step.
static int 
toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    if (dbname!=NULL) 
        return db_open_subdb(db, txn, fname, dbname, dbtype, flags, mode);

    // A NULL fname would be passed to strlen() below; reject it the same way
    // db_open_subdb() does.
    if (fname == NULL)
        return EINVAL;

    // at this point fname is the dname
    //This code ONLY supports single-db files.
    assert(dbname==NULL);
    const char * dname = fname;  // db_open_subdb() converts (fname, dbname) to dname

    ////////////////////////////// do some level of parameter checking.
    u_int32_t unused_flags = flags;
    int using_txns = db->dbenv->i->open_flags & DB_INIT_TXN;
    int r;
    if (dbtype!=DB_BTREE && dbtype!=DB_UNKNOWN) return EINVAL;
    int is_db_excl    = flags & DB_EXCL;    unused_flags&=~DB_EXCL;
    int is_db_create  = flags & DB_CREATE;  unused_flags&=~DB_CREATE;
    int is_db_hot_index  = flags & DB_IS_HOT_INDEX;  unused_flags&=~DB_IS_HOT_INDEX;

    //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided.
                                            unused_flags&=~DB_READ_UNCOMMITTED;
                                            unused_flags&=~DB_READ_COMMITTED;
                                            unused_flags&=~DB_SERIALIZABLE;
    if (unused_flags & ~DB_THREAD) return EINVAL; // unknown flags

    if (is_db_excl && !is_db_create) return EINVAL;
    if (dbtype==DB_UNKNOWN && is_db_excl) return EINVAL;

    /* tokudb supports no duplicates and sorted duplicates only */
    unsigned int tflags;
    r = toku_brt_get_flags(db->i->brt, &tflags);
    if (r != 0) 
        return r;

    if (db_opened(db))
        return EINVAL;              /* It was already open. */
    //////////////////////////////

    DB_TXN *child = NULL;
    // begin child (unless transactionless)
    if (using_txns) {
        r = toku_txn_begin(db->dbenv, txn, &child, DB_TXN_NOSYNC, 1);
        assert(r==0);
    }

    // convert dname to iname
    //  - look up dname, get iname
    //  - if dname does not exist, create iname and make entry in directory
    DBT dname_dbt;  // holds dname
    DBT iname_dbt;  // holds iname_in_env
    toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
    init_dbt_realloc(&iname_dbt);  // sets iname_dbt.data = NULL
    r = toku_db_get(db->dbenv->i->directory, child, &dname_dbt, &iname_dbt, DB_SERIALIZABLE);  // allocates memory for iname
    char *iname = iname_dbt.data;
    if (r==DB_NOTFOUND && !is_db_create)
        r = ENOENT;
    else if (r==0 && is_db_excl) {
        r = EEXIST;
    }
    else if (r==DB_NOTFOUND) {
        char hint[strlen(dname) + 1];

        // create iname and make entry in directory
        u_int64_t id = 0;

        if (using_txns) {
            id = toku_txn_get_txnid(db_txn_struct_i(child)->tokutxn);
        }
        create_iname_hint(dname, hint);
        iname = create_iname(db->dbenv, id, hint, NULL, -1);  // allocated memory for iname
        toku_fill_dbt(&iname_dbt, iname, strlen(iname) + 1);
        //
        // 0 for performance only, avoid unnecessary query
        // if we are creating a hot index, per #3166, we do not want the write lock  in directory grabbed.
        // directory read lock is grabbed in toku_db_get above
        //
        u_int32_t put_flags = 0 | ((is_db_hot_index) ? DB_PRELOCKED_WRITE : 0); 
        r = toku_db_put(db->dbenv->i->directory, child, &dname_dbt, &iname_dbt, put_flags);  
    }

    // we now have an iname
    if (r == 0) {
        r = db_open_iname(db, child, iname, flags, mode);
        if (r==0) {
            db->i->dname = toku_xstrdup(dname);
            env_note_db_opened(db->dbenv, db);  // tell env that a new db handle is open (using dname)
        }
    }

    // free string holding iname
    if (iname) toku_free(iname);

    if (using_txns) {
        // close txn
        if (r == 0) {  // commit
            r = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL, false);
            invariant(r==0);  // TODO panic
        }
        else {         // abort
            int r2 = toku_txn_abort(child, NULL, NULL, false);
            invariant(r2==0);  // TODO panic
        }
    }

    return r;
}

static int 
4890
db_open_iname(DB * db, DB_TXN * txn, const char *iname_in_env, u_int32_t flags, int mode) {
4891 4892 4893 4894 4895 4896 4897 4898
    int r;

    //Set comparison functions if not yet set.
    if (!db->i->key_compare_was_set && db->dbenv->i->bt_compare) {
        r = toku_brt_set_bt_compare(db->i->brt, db->dbenv->i->bt_compare);
        assert(r==0);
        db->i->key_compare_was_set = TRUE;
    }
4899 4900 4901 4902
    if (db->dbenv->i->update_function) {
        r = toku_brt_set_update(db->i->brt,db->dbenv->i->update_function);
        assert(r==0);
    }
4903 4904 4905
    BOOL need_locktree = (BOOL)((db->dbenv->i->open_flags & DB_INIT_LOCK) &&
                                (db->dbenv->i->open_flags & DB_INIT_TXN));

4906 4907
    int is_db_excl    = flags & DB_EXCL;    flags&=~DB_EXCL;
    int is_db_create  = flags & DB_CREATE;  flags&=~DB_CREATE;
4908
    //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided.
4909
                                            flags&=~DB_READ_UNCOMMITTED;
4910
                                            flags&=~DB_READ_COMMITTED;
4911
                                            flags&=~DB_SERIALIZABLE;
4912
                                            flags&=~DB_IS_HOT_INDEX;
4913
    if (flags & ~DB_THREAD) return EINVAL; // unknown flags
4914 4915

    if (is_db_excl && !is_db_create) return EINVAL;
4916

4917 4918 4919 4920 4921 4922
    /* tokudb supports no duplicates and sorted duplicates only */
    unsigned int tflags;
    r = toku_brt_get_flags(db->i->brt, &tflags);
    if (r != 0) 
        return r;

4923
    if (db_opened(db))
4924
        return EINVAL;              /* It was already open. */
Yoni Fogel's avatar
Yoni Fogel committed
4925
    
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
4926 4927
    db->i->open_flags = flags;
    db->i->open_mode = mode;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4928

Yoni Fogel's avatar
Yoni Fogel committed
4929
    r = toku_brt_open(db->i->brt, iname_in_env,
4930
		      is_db_create, is_db_excl,
4931
		      db->dbenv->i->cachetable,
4932
		      txn ? db_txn_struct_i(txn)->tokutxn : NULL_TXN,
4933
		      db);
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4934 4935 4936
    if (r != 0)
        goto error_cleanup;

4937
    db->i->opened = 1;
Yoni Fogel's avatar
Yoni Fogel committed
4938
    if (need_locktree) {
4939
	db->i->dict_id = toku_brt_get_dictionary_id(db->i->brt);
4940
        r = toku_ltm_get_lt(db->dbenv->i->ltm, &db->i->lt, db->i->dict_id, db);
Yoni Fogel's avatar
Yoni Fogel committed
4941
        if (r!=0) { goto error_cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
4942
    }
4943 4944 4945
    //Add to transaction's list of 'must close' if necessary.
    if (txn) {
        //Do last so we don't have to undo.
4946
        toku_list_push(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort,
4947 4948
                  &db->i->dbs_that_must_close_before_abort);
    }
Yoni Fogel's avatar
Yoni Fogel committed
4949

Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
4950
    return 0;
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4951 4952
 
error_cleanup:
4953
    db->i->dict_id = DICTIONARY_ID_NONE;
4954
    db->i->opened = 0;
Yoni Fogel's avatar
Yoni Fogel committed
4955
    if (db->i->lt) {
4956
        toku_lt_remove_db_ref(db->i->lt, db);
Yoni Fogel's avatar
Yoni Fogel committed
4957 4958
        db->i->lt = NULL;
    }
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
4959
    return r;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
4960
}
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
4961

4962 4963 4964 4965
//Return 0 if proposed pair do not violate size constraints of DB
//(insertion is legal)
//Return non zero otherwise.
static int
4966
db_put_check_size_constraints(DB *db, const DBT *key, const DBT *val) {
4967 4968 4969 4970 4971 4972 4973
    unsigned int klimit, vlimit;
    int r = 0;
    toku_brt_get_maximum_advised_key_value_lengths(&klimit, &vlimit);
    if (key->size > klimit)
	r = toku_ydb_do_error(db->dbenv, EINVAL, "The largest key allowed is %u bytes", klimit);
    else if (val->size > vlimit)
	r = toku_ydb_do_error(db->dbenv, EINVAL, "The largest value allowed is %u bytes", vlimit);
4974
    
4975 4976 4977
    return r;
}

4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998
//Return 0 if supported.
//Return ERANGE if out of range.
// The check is done by running db_put_check_size_constraints() on a key of
// the given size paired with an empty value.
static int
db_row_size_supported(DB *db, u_int32_t size) {
    DBT key, val;

    toku_fill_dbt(&key, NULL, size);
    toku_fill_dbt(&val, NULL, 0);
    return (db_put_check_size_constraints(db, &key, &val) != 0) ? ERANGE : 0;
}

// Calls db_row_size_supported() between toku_ydb_lock() and toku_ydb_unlock()
// and returns its result.
static int
locked_db_row_size_supported(DB *db, u_int32_t size) {
    int rval;

    toku_ydb_lock();
    rval = db_row_size_supported(db, size);
    toku_ydb_unlock();
    return rval;
}

4999 5000
//Return 0 if insert is legal
static int
5001
db_put_check_overwrite_constraint(DB *db, DB_TXN *txn, DBT *key,
5002 5003 5004
                                  u_int32_t lock_flags, u_int32_t overwrite_flag) {
    int r;

5005
    if (overwrite_flag == 0) { // 0 (yesoverwrite) does not impose constraints.
5006 5007
        r = 0;
    } else if (overwrite_flag == DB_NOOVERWRITE) {
5008 5009 5010 5011
        // Check if (key,anything) exists in dictionary.
        // If exists, fail.  Otherwise, do insert.
        // The DB_RMW flag causes the cursor to grab a write lock instead of a read lock on the key if it exists.
        r = db_getf_set(db, txn, lock_flags|DB_SERIALIZABLE|DB_RMW, key, ydb_getf_do_nothing, NULL);
5012 5013 5014 5015
        if (r == DB_NOTFOUND) 
            r = 0;
        else if (r == 0)      
            r = DB_KEYEXIST;
5016
        //Any other error is passed through.
5017
    } else if (overwrite_flag == DB_NOOVERWRITE_NO_ERROR) {
5018
        r = 0;
5019
    } else {
5020 5021
        //Other flags are not (yet) supported.
        r = EINVAL;
5022 5023 5024 5025
    }
    return r;
}

5026 5027
// Insert (key, val) into db under txn.
// Steps, each skipped once an earlier one has failed:
//   1. grab the directory read lock unless DB_PRELOCKED_FILE_READ was passed,
//   2. check key/value size limits,
//   3. enforce the overwrite policy given by the remaining flags,
//   4. take a point write lock when the db has a lock tree and
//      DB_PRELOCKED_WRITE was not passed,
//   5. send the insert to the brt (BRT_INSERT_NO_OVERWRITE when flags is
//      DB_NOOVERWRITE_NO_ERROR, BRT_INSERT otherwise).
// num_inserts or num_inserts_fail is bumped according to the outcome.
static int
toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    int r = 0;

    const u_int32_t lock_flags = get_prelocked_flags(flags);
    flags &= ~lock_flags;

    if ((lock_flags & DB_PRELOCKED_FILE_READ) == 0) {
        r = toku_grab_read_lock_on_directory(db, txn);
        if (r != 0) { goto done; }
    }

    r = db_put_check_size_constraints(db, key, val);
    if (r != 0) { goto done; }

    //Do any checking required by the flags.
    r = db_put_check_overwrite_constraint(db, txn, key, lock_flags, flags);
    if (r != 0) { goto done; }

    //Do locking if necessary.
    if (db->i->lt && !(lock_flags & DB_PRELOCKED_WRITE)) {
        r = get_point_write_lock(db, txn, key);
        if (r != 0) { goto done; }
    }

    {
        //Insert into the brt.
        TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL;
        enum brt_msg_type type = (flags == DB_NOOVERWRITE_NO_ERROR) ? BRT_INSERT_NO_OVERWRITE
                                                                     : BRT_INSERT;
        r = toku_brt_maybe_insert(db->i->brt, key, val, ttxn, FALSE, ZERO_LSN, TRUE, type);
    }

done:
    if (r != 0)
        num_inserts_fail++;
    else
        num_inserts++;

    return r;
}

5067 5068
// Take a write lock on this db's dname entry in the environment's directory
// dictionary, on behalf of txn.  Returns 0 without locking when the db has no
// dname.  directory_write_locks / directory_write_locks_fail count the outcome
// of each lock attempt.
static int toku_db_pre_acquire_fileops_lock(DB *db, DB_TXN *txn) {
    // bad hack because some environment dictionaries do not have a dname
    char *dname = db->i->dname;
    if (dname == NULL) {
        return 0;
    }

    DBT key_in_directory = { .data = dname, .size = strlen(dname)+1 };
    DB *directory = db->dbenv->i->directory;
    //Left end of range == right end of range (point lock)
    int r = get_range_lock(directory, txn, &key_in_directory, &key_in_directory, LOCK_REQUEST_WRITE);
    if (r != 0) {
        directory_write_locks_fail++;
    } else {
        directory_write_locks++;
    }
    return r;
}

// Send an update message for key, carrying update_function_extra, to db
// under txn via toku_brt_maybe_update().
// Before that: the directory read lock is taken unless DB_PRELOCKED_FILE_READ
// is set, key and update_function_extra are checked against the size limits,
// and a point write lock on key is taken when the db has a lock tree and
// DB_PRELOCKED_WRITE is not set.
// num_updates or num_updates_fail is bumped according to the outcome.
static int
toku_db_update(DB *db, DB_TXN *txn,
               const DBT *key,
               const DBT *update_function_extra,
               u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    int r = 0;

    const u_int32_t lock_flags = get_prelocked_flags(flags);
    flags &= ~lock_flags;

    if ((lock_flags & DB_PRELOCKED_FILE_READ) == 0) {
        r = toku_grab_read_lock_on_directory(db, txn);
        if (r != 0) { goto cleanup; }
    }

    r = db_put_check_size_constraints(db, key, update_function_extra);
    if (r != 0) { goto cleanup; }

    if (db->i->lt && !(lock_flags & DB_PRELOCKED_WRITE)) {
        r = get_point_write_lock(db, txn, key);
        if (r != 0) { goto cleanup; }
    }

    {
        TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL;
        r = toku_brt_maybe_update(db->i->brt, key, update_function_extra, ttxn,
                                  FALSE, ZERO_LSN, TRUE);
    }

cleanup:
    if (r != 0)
        num_updates_fail++;
    else
        num_updates++;
    return r;
}


// Send a broadcast update carrying update_function_extra to db under txn via
// toku_brt_maybe_update_broadcast().
//
// DB_IS_RESETTING_OP is true if the dictionary should be considered as if created by this transaction.
// For example, it will be true if toku_db_update_broadcast() is used to implement a schema change (such
// as adding a column), and will be false if used simply to update all the rows of a table (such as 
// incrementing a field).
//
// A resetting op requires a transaction with no parent: EINVAL is returned if
// txn is NULL or has a parent.  For a resetting op the fileops (directory
// write) lock is taken; otherwise the directory read lock is taken unless
// DB_PRELOCKED_FILE_READ is set.  When the db has a lock tree and
// DB_PRELOCKED_WRITE is not set, a table lock is pre-acquired.
// num_updates_broadcast or num_updates_broadcast_fail is bumped according to
// the outcome.
static int
toku_db_update_broadcast(DB *db, DB_TXN *txn,
                         const DBT *update_function_extra,
                         u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    int r = 0;

    u_int32_t lock_flags = get_prelocked_flags(flags);
    flags &= ~lock_flags;
    u_int32_t is_resetting_op_flag = flags & DB_IS_RESETTING_OP;
    // NOTE(review): flags is not consulted after this point, so any other
    // remaining bits are silently ignored rather than rejected.
    flags &= is_resetting_op_flag;
    BOOL is_resetting_op = (is_resetting_op_flag != 0);

    if (is_resetting_op) {
        // txn is treated as optional further down (txn ? ... : NULL), so guard
        // against NULL before looking at txn->parent.
        if (txn == NULL || txn->parent != NULL) {
            r = EINVAL; // cannot have a parent if you are a resetting op
            goto cleanup;
        }
        r = toku_db_pre_acquire_fileops_lock(db, txn);
        if (r != 0) { goto cleanup; }
    }
    else if (!(lock_flags & DB_PRELOCKED_FILE_READ)) {
        r = toku_grab_read_lock_on_directory(db, txn);
        if (r != 0) { goto cleanup; }
    }

    {
        // A broadcast has no key; only the extra is subject to the size limits.
        DBT null_key;
        toku_init_dbt(&null_key);
        r = db_put_check_size_constraints(db, &null_key, update_function_extra);
        if (r != 0) { goto cleanup; }
    }

    BOOL do_locking = (db->i->lt && !(lock_flags & DB_PRELOCKED_WRITE));
    if (do_locking) {
        r = toku_db_pre_acquire_table_lock(db, txn, TRUE);
        if (r != 0) { goto cleanup; }
    }

    TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL;
    r = toku_brt_maybe_update_broadcast(db->i->brt, update_function_extra, ttxn,
                                        FALSE, ZERO_LSN, TRUE, is_resetting_op);

cleanup:
    if (r == 0) 
        num_updates_broadcast++;
    else
        num_updates_broadcast_fail++;
    return r;
}

5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197
// Log a put of (key, val) into brt on behalf of txn via toku_brt_log_put().
// txn must be non-NULL: it is dereferenced to obtain its tokutxn.
static int
log_put_single(DB_TXN *txn, BRT brt, const DBT *key, const DBT *val) {
    return toku_brt_log_put(db_txn_struct_i(txn)->tokutxn, brt, key, val);
}

// Log a multi-dictionary put of (src_key, src_val) into the num_dbs
// dictionaries in brts via toku_brt_log_put_multiple().  src_db may be NULL,
// in which case a NULL source brt is passed.  Returns 0 without logging when
// num_dbs is 0.
static int
log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val, uint32_t num_dbs, BRT brts[]) {
    if (num_dbs == 0) {
        return 0;
    }

    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
    BRT src_brt = NULL;
    if (src_db) {
        src_brt = src_db->i->brt;
    }
    return toku_brt_log_put_multiple(ttxn, src_brt, brts, num_dbs, src_key, src_val);
}

static int
5198
do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT keys[], DBT vals[], DB *src_db, const DBT *src_key) {
5199 5200 5201 5202
    int r = 0;
    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
    for (uint32_t which_db = 0; r == 0 && which_db < num_dbs; which_db++) {
        DB *db = db_array[which_db];
5203 5204 5205

        // if db is being indexed by an indexer, then put into that db if the src key is to the left or equal to the 
        // indexers cursor.  we have to get the src_db from the indexer and find it in the db_array.
5206 5207
	int do_put = TRUE;
	DB_INDEXER *indexer = toku_db_get_indexer(db);
5208
	if (indexer) { // if this db is the index under construction
5209 5210 5211 5212 5213 5214 5215 5216 5217 5218
            DB *indexer_src_db = toku_indexer_get_src_db(indexer);
            invariant(indexer_src_db != NULL);
            const DBT *indexer_src_key;
            if (src_db == indexer_src_db)
                indexer_src_key = src_key;
            else {
                uint32_t which_src_db = lookup_src_db(num_dbs, db_array, indexer_src_db);
                invariant(which_src_db < num_dbs);
                indexer_src_key = &keys[which_src_db];
            }
5219
            do_put = !toku_indexer_is_key_right_of_le_cursor(indexer, indexer_src_db, indexer_src_key);
5220 5221 5222 5223
        }
        if (r == 0 && do_put) {
            r = toku_brt_maybe_insert(db->i->brt, &keys[which_db], &vals[which_db], ttxn, FALSE, ZERO_LSN, FALSE, BRT_INSERT);
        }
Zardosht Kasheff's avatar
Zardosht Kasheff committed
5224
    }
5225
    return r;
Bradley C. Kuszmaul's avatar
Rename  
Bradley C. Kuszmaul committed
5226
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
5227

5228
static int
5229 5230 5231 5232
env_put_multiple(
    DB_ENV *env, 
    DB *src_db, 
    DB_TXN *txn, 
5233 5234
    const DBT *src_key, 
    const DBT *src_val, 
5235 5236 5237 5238
    uint32_t num_dbs, 
    DB **db_array, 
    DBT *keys, 
    DBT *vals, 
5239
    uint32_t *flags_array) 
5240
{
5241
    int r;
5242 5243
    DBT put_keys[num_dbs];
    DBT put_vals[num_dbs];
5244 5245 5246 5247

    HANDLE_PANICKED_ENV(env);

    {
5248 5249 5250
    uint32_t lock_flags[num_dbs];
    uint32_t remaining_flags[num_dbs];
    BRT brts[num_dbs];
5251

5252 5253 5254 5255
    if (!txn || !num_dbs) {
        r = EINVAL;
        goto cleanup;
    }
5256
    if (!env->i->generate_row_for_put) {
5257 5258 5259 5260
        r = EINVAL;
        goto cleanup;
    }

5261 5262 5263
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);

    for (uint32_t which_db = 0; which_db < num_dbs; which_db++) {
5264
        DB *db = db_array[which_db];
5265

5266 5267 5268 5269 5270 5271 5272 5273 5274
        lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]);
        remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db];

        //Do locking if necessary.
        if (!(lock_flags[which_db] & DB_PRELOCKED_FILE_READ)) {
            r = toku_grab_read_lock_on_directory(db, txn);
            if (r != 0) goto cleanup;
        }

5275
        //Generate the row
5276
        if (db == src_db) {
5277 5278
            put_keys[which_db] = *src_key;
            put_vals[which_db] = *src_val;
5279 5280
        }
        else {
5281
            r = env->i->generate_row_for_put(db, src_db, &keys[which_db], &vals[which_db], src_key, src_val);
5282 5283 5284 5285
            if (r != 0) goto cleanup;
            put_keys[which_db] = keys[which_db];
            put_vals[which_db] = vals[which_db];            
        }
5286 5287

        // check size constraints
5288
        r = db_put_check_size_constraints(db, &put_keys[which_db], &put_vals[which_db]);
5289 5290
        if (r != 0) goto cleanup;

5291 5292
        //Check overwrite constraints
        r = db_put_check_overwrite_constraint(db, txn,
5293
                                              &put_keys[which_db],
5294
                                              lock_flags[which_db], remaining_flags[which_db]);
5295
        if (r != 0) goto cleanup;
5296 5297 5298 5299 5300
        if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) {
            //put_multiple does not support delaying the no error, since we would
            //have to log the flag in the put_multiple.
            r = EINVAL; goto cleanup;
        }
5301

5302 5303 5304
        //Do locking if necessary.
        if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) {
            //Needs locking
5305
            r = get_point_write_lock(db, txn, &put_keys[which_db]);
5306
            if (r != 0) goto cleanup;
5307 5308 5309
        }
        brts[which_db] = db->i->brt;
    }
5310 5311

    if (num_dbs == 1)
5312
        r = log_put_single(txn, brts[0], &put_keys[0], &put_vals[0]);
5313
    else
5314
        r = log_put_multiple(txn, src_db, src_key, src_val, num_dbs, brts);
5315 5316
    
    if (r == 0)
5317
        r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, src_db, src_key);
5318

5319 5320
    }

5321
cleanup:
5322
    if (r == 0)
5323
        num_multi_inserts += num_dbs;
5324
    else
5325
        num_multi_inserts_fail += num_dbs;
5326 5327 5328 5329 5330
    return r;
}

// Replace one source row by another in num_dbs dictionaries under a single
// transaction.
//
// Caller-provided scratch arrays:
//   keys[0..num_dbs-1]          receive the generated new keys
//   keys[num_dbs..2*num_dbs-1]  receive the generated old keys
//   vals[0..num_dbs-1]          receive the generated new vals
// ENOMEM is returned when num_keys / num_vals are too small for that layout.
//
// For each db the old and new keys are compared with the db's compare function.
// When they differ, the old key is queued for deletion (after overwrite checks
// and a write lock on the old key).  The new row is queued for insertion when
// the keys differ or the new val is non-empty.  Deletes are logged and applied
// first, then puts.
//
// Returns EINVAL when txn is NULL, no generate_row_for_put callback is
// installed, or a db whose key changed has flags DB_NOOVERWRITE_NO_ERROR;
// otherwise the first non-zero helper result, or 0.  The HANDLE_* macros are
// defined elsewhere and may return early.
//
// num_multi_updates / num_multi_updates_fail is bumped by num_dbs on every path
// that reaches the cleanup label.
static int
env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn,                                
                    DBT *old_src_key, DBT *old_src_data,
                    DBT *new_src_key, DBT *new_src_data,
                    uint32_t num_dbs, DB **db_array, uint32_t* flags_array, 
                    uint32_t num_keys, DBT keys[], 
                    uint32_t num_vals, DBT vals[]) {
    int r = 0;

    HANDLE_PANICKED_ENV(env);

    if (!txn) {
        r = EINVAL;
        goto cleanup;
    }
    if (!env->i->generate_row_for_put) {
        r = EINVAL;
        goto cleanup;
    }

    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);

    // Nothing to do for an empty db list.  Leaving here (with r == 0) avoids
    // declaring zero-length variable-length arrays below, which is undefined
    // behaviour.
    if (num_dbs == 0)
        goto cleanup;

    {
        uint32_t n_del_dbs = 0;
        DB *del_dbs[num_dbs];
        BRT del_brts[num_dbs];
        DBT del_keys[num_dbs];
        
        uint32_t n_put_dbs = 0;
        DB *put_dbs[num_dbs];
        BRT put_brts[num_dbs];
        DBT put_keys[num_dbs];
        DBT put_vals[num_dbs];

        uint32_t lock_flags[num_dbs];
        uint32_t remaining_flags[num_dbs];

        for (uint32_t which_db = 0; which_db < num_dbs; which_db++) {
            DB *db = db_array[which_db];
            DBT curr_old_key, curr_new_key, curr_new_val;
            
            lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]);
            remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db];

            if (!(lock_flags[which_db] & DB_PRELOCKED_FILE_READ)) {
                r = toku_grab_read_lock_on_directory(db, txn);
                if (r != 0) goto cleanup;
            }

            // Generate the old key (the old val is never generated, see below)
            if (which_db + num_dbs >= num_keys) {
                r = ENOMEM; goto cleanup;
            }
            if (db == src_db) {
                curr_old_key = *old_src_key;
            }
            else {
                r = env->i->generate_row_for_put(db, src_db, &keys[which_db + num_dbs], NULL, old_src_key, old_src_data);
                if (r != 0) goto cleanup;
                curr_old_key = keys[which_db + num_dbs];
            }

            // Generate the new key and val
            if (which_db >= num_keys || which_db >= num_vals) {
                r = ENOMEM; goto cleanup;
            }
            if (db == src_db) {
                curr_new_key = *new_src_key;
                curr_new_val = *new_src_data;
            }
            else {
                r = env->i->generate_row_for_put(db, src_db, &keys[which_db], &vals[which_db], new_src_key, new_src_data);
                if (r != 0) goto cleanup;
                curr_new_key = keys[which_db];
                curr_new_val = vals[which_db];
            }

            toku_dbt_cmp cmpfun = toku_db_get_compare_fun(db);
            BOOL key_eq = cmpfun(db, &curr_old_key, &curr_new_key) == 0;
            if (!key_eq) {
                // Check overwrite constraints only when the keys are not equal.
                // If the keys are equal, we do not care whether the flag is
                // DB_NOOVERWRITE or 0.
                r = db_put_check_overwrite_constraint(db, txn,
                                                      &curr_new_key,
                                                      lock_flags[which_db], remaining_flags[which_db]);
                if (r != 0) goto cleanup;
                if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) {
                    //update_multiple does not support delaying the no error, since we would
                    //have to log the flag in the put_multiple.
                    r = EINVAL; goto cleanup;
                }

                // lock old key
                if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) {
                    r = get_point_write_lock(db, txn, &curr_old_key);
                    if (r != 0) goto cleanup;
                }
                del_dbs[n_del_dbs] = db;
                del_brts[n_del_dbs] = db->i->brt;
                del_keys[n_del_dbs] = curr_old_key;
                n_del_dbs++;
            }

            // We take a shortcut and avoid generating the old val: any new val
            // with size > 0 is assumed to differ from the old val.
            if (!key_eq || curr_new_val.size > 0) {
                r = db_put_check_size_constraints(db, &curr_new_key, &curr_new_val);
                if (r != 0) goto cleanup;

                // lock new key
                // NOTE(review): unlike the old-key lock above, this does not
                // consult DB_PRELOCKED_WRITE - confirm that is intentional.
                if (db->i->lt) {
                    r = get_point_write_lock(db, txn, &curr_new_key);
                    if (r != 0) goto cleanup;
                }
                put_dbs[n_put_dbs] = db;
                put_brts[n_put_dbs] = db->i->brt;
                put_keys[n_put_dbs] = curr_new_key;
                put_vals[n_put_dbs] = curr_new_val;
                n_put_dbs++;
            }
        }

        // Log and apply the deletes; a single target is logged as a plain delete.
        if (r == 0 && n_del_dbs > 0) {
            if (n_del_dbs == 1)
                r = log_del_single(txn, del_brts[0], &del_keys[0]);
            else
                r = log_del_multiple(txn, src_db, old_src_key, old_src_data, n_del_dbs, del_brts, del_keys);
            if (r == 0)
                r = do_del_multiple(txn, n_del_dbs, del_dbs, del_keys, src_db, old_src_key);
        }

        // Log and apply the puts; a single target is logged as a plain put.
        if (r == 0 && n_put_dbs > 0) {
            if (n_put_dbs == 1)
                r = log_put_single(txn, put_brts[0], &put_keys[0], &put_vals[0]);
            else
                r = log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_brts);
            if (r == 0)
                r = do_put_multiple(txn, n_put_dbs, put_dbs, put_keys, put_vals, src_db, new_src_key);
        }
    }

cleanup:
    if (r == 0)
        num_multi_updates += num_dbs;
    else
        num_multi_updates_fail += num_dbs;
    return r;
}

static int toku_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags);
5483 5484

//We do not (yet?) support deleting subdbs by deleting the enclosing 'fname'
5485
static int
5486 5487 5488 5489
env_dbremove_subdb(DB_ENV * env, DB_TXN * txn, const char *fname, const char *dbname, int32_t flags) {
    int r;
    if (!fname || !dbname) r = EINVAL;
    else {
5490 5491 5492 5493
        char subdb_full_name[strlen(fname) + sizeof("/") + strlen(dbname)];
        int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s", fname, dbname);
        assert(bytes==(int)sizeof(subdb_full_name)-1);
        const char *null_subdbname = NULL;
5494
        r = toku_env_dbremove(env, txn, subdb_full_name, null_subdbname, flags);
5495
    }
5496 5497
    return r;
}
5498

5499 5500 5501 5502

//Called during committing an fdelete ONLY IF you still have an fd AND it is not connected to /dev/null
//Called during aborting an fcreate (harmless to do, and definitely correct)
static void
5503
finalize_file_removal(DICTIONARY_ID dict_id, void * extra) {
5504 5505 5506
    toku_ltm *ltm = (toku_ltm*) extra;
    if (ltm) {
        //Poison the lock tree to prevent a future file from re-using it.
5507
        toku_ltm_invalidate_lt(ltm, dict_id);
5508 5509 5510
    }
}

5511
//static int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn);
5512

5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538
// Remove the dictionary named (fname, dbname) from the environment.
//
// A non-NULL dbname is delegated to env_dbremove_subdb(), which folds the pair
// into a single dname and calls back here.  Otherwise fname is the dname: its
// (dname, iname) entry is deleted from the environment's directory and the
// underlying file is removed - on commit of a child transaction when the
// environment was opened with DB_INIT_TXN, immediately otherwise.
//
// Returns EINVAL if the environment is not open, fname is NULL, flags is
// non-zero, or a handle on the dictionary is open; ENOENT if the directory has
// no entry for dname; otherwise the first non-zero helper result, or 0.
// The HANDLE_* macros are defined elsewhere and may return early.
static int
toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags) {
    int r;
    HANDLE_PANICKED_ENV(env);
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);
    if (!env_opened(env)) return EINVAL;
    if (dbname!=NULL) 
        return env_dbremove_subdb(env, txn, fname, dbname, flags);
    // env_dbremove_subdb() converts (fname, dbname) to dname

    const char * dname = fname;
    assert(dbname == NULL);

    // dname is passed to strlen() below; reject NULL the same way the subdb
    // path does.
    if (dname == NULL) return EINVAL;
    if (flags!=0) return EINVAL;
    if (env_is_db_with_dname_open(env, dname))
        return toku_ydb_do_error(env, EINVAL, "Cannot remove dictionary with an open handle.\n");
    
    DBT dname_dbt;  
    DBT iname_dbt;  
    toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
    init_dbt_realloc(&iname_dbt);  // sets iname_dbt.data = NULL

    int using_txns = env->i->open_flags & DB_INIT_TXN;
    DB_TXN *child = NULL;
    // begin child (unless transactionless)
    if (using_txns) {
        r = toku_txn_begin(env, txn, &child, DB_TXN_NOSYNC, 1);
        assert(r==0);
    }

    // get iname
    r = toku_db_get(env->i->directory, child, &dname_dbt, &iname_dbt, DB_SERIALIZABLE);  // allocates memory for iname
    char *iname = iname_dbt.data;
    if (r==DB_NOTFOUND)
        r = ENOENT;
    else if (r==0) {
        // remove (dname,iname) from directory
        r = toku_db_del(env->i->directory, child, &dname_dbt, DB_DELETE_ANY);
        if (r == 0) {
            if (using_txns) {
                r = toku_brt_remove_on_commit(db_txn_struct_i(child)->tokutxn, &iname_dbt);
                assert(r==0);
                //Now that we have a writelock on dname, verify that there are still no handles open. (to prevent race conditions)
                if (r==0 && env_is_db_with_dname_open(env, dname))
                    r = toku_ydb_do_error(env, EINVAL, "Cannot remove dictionary with an open handle.\n");
                if (r==0) {
                    DB* zombie = env_get_zombie_db_with_dname(env, dname);
                    if (zombie)
                        r = toku_db_pre_acquire_table_lock(zombie, child, TRUE);
                    if (r!=0 && r!=DB_LOCK_NOTGRANTED)
                        toku_ydb_do_error(env, r, "Cannot remove dictionary.\n");
                }
            }
            else {
                r = toku_brt_remove_now(env->i->cachetable, &iname_dbt);
                assert(r==0);
            }
        }
    }

    if (using_txns) {
        // close txn
        if (r == 0) {  // commit
            r = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL, false);
            invariant(r==0);  // TODO panic
        }
        else {         // abort
            int r2 = toku_txn_abort(child, NULL, NULL, false);
            invariant(r2==0);  // TODO panic
        }
    }

    if (iname) toku_free(iname);
    return r;
}

5590 5591 5592

// DB-handle form of dbremove: remove (fname, dbname) through the handle's
// environment with no transaction, then close the handle regardless of the
// outcome.  The removal's error takes precedence over the close's.
static int
toku_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    int remove_result = toku_env_dbremove(db->dbenv, NULL, fname, dbname, flags);
    int close_result  = toku_db_close(db, 0);
    return (remove_result != 0) ? remove_result : close_result;
}

// Rename the sub-database `dbname` of `fname` to `newname` by building the
// combined names "fname/dbname" and "fname/newname" and renaming those as
// top-level dictionaries.
// Returns EINVAL if fname, dbname or newname is NULL, otherwise whatever
// toku_env_dbrename() returns.
static int
env_dbrename_subdb(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) {
    int r;
    if (!fname || !dbname || !newname) r = EINVAL;
    else {
        // sizeof("/") is 2: one byte for the separator, one for the terminating NUL.
        char subdb_full_name[strlen(fname) + sizeof("/") + strlen(dbname)];
        {
            int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s", fname, dbname);
            assert(bytes==(int)sizeof(subdb_full_name)-1);
        }
        // The target name is built from newname (it was previously built from
        // dbname, which made the target identical to the source).
        char new_full_name[strlen(fname) + sizeof("/") + strlen(newname)];
        {
            int bytes = snprintf(new_full_name, sizeof(new_full_name), "%s/%s", fname, newname);
            assert(bytes==(int)sizeof(new_full_name)-1);
        }
        const char *null_subdbname = NULL;
        r = toku_env_dbrename(env, txn, subdb_full_name, null_subdbname, new_full_name, flags);
    }
    return r;
}


// Rename the dictionary named (fname, dbname) to newname.
//
// A non-NULL dbname is delegated to env_dbrename_subdb(), which folds the names
// into single dnames and calls back here.  Otherwise fname is the old dname:
// its directory entry is deleted and a (newname, iname) entry is inserted,
// inside a child transaction when the environment was opened with DB_INIT_TXN.
// A zombie DB registered under the old dname is re-registered under newname.
//
// Returns EINVAL if the environment is not open, fname or newname is NULL,
// flags is non-zero, or either name has an open handle; ENOENT if the old name
// is not in the directory; EEXIST if the new name already is; otherwise the
// first non-zero helper result, or 0.
// The HANDLE_* macros are defined elsewhere and may return early.
static int
toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) {
    int r;
    HANDLE_PANICKED_ENV(env);
    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn);
    if (!env_opened(env)) return EINVAL;
    if (dbname!=NULL) 
        return env_dbrename_subdb(env, txn, fname, dbname, newname, flags);
    // env_dbrename_subdb() converts (fname, dbname) to dname and (fname, newname) to newdname

    const char * dname = fname;
    assert(dbname == NULL);

    // Both names are passed to strlen() below; reject NULL the same way the
    // subdb path does.
    if (dname == NULL || newname == NULL) return EINVAL;
    if (flags!=0) return EINVAL;
    if (env_is_db_with_dname_open(env, dname))
        return toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary with an open handle.\n");
    if (env_is_db_with_dname_open(env, newname))
        return toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary; Dictionary with target name has an open handle.\n");
    
    DBT old_dname_dbt;  
    DBT new_dname_dbt;  
    DBT iname_dbt;  
    toku_fill_dbt(&old_dname_dbt, dname, strlen(dname)+1);
    toku_fill_dbt(&new_dname_dbt, newname, strlen(newname)+1);
    init_dbt_realloc(&iname_dbt);  // sets iname_dbt.data = NULL

    int using_txns = env->i->open_flags & DB_INIT_TXN;
    DB_TXN *child = NULL;
    // begin child (unless transactionless)
    if (using_txns) {
        r = toku_txn_begin(env, txn, &child, DB_TXN_NOSYNC, 1);
        assert(r==0);
    }

    r = toku_db_get(env->i->directory, child, &old_dname_dbt, &iname_dbt, DB_SERIALIZABLE);  // allocates memory for iname
    char *iname = iname_dbt.data;
    if (r==DB_NOTFOUND)
        r = ENOENT;
    else if (r==0) {
        // verify that newname does not already exist
        r = db_getf_set(env->i->directory, child, DB_SERIALIZABLE, &new_dname_dbt, ydb_getf_do_nothing, NULL);
        if (r == 0) 
            r = EEXIST;
        else if (r == DB_NOTFOUND) {
            // remove old (dname,iname) and insert (newname,iname) in directory
            r = toku_db_del(env->i->directory, child, &old_dname_dbt, DB_DELETE_ANY);
            if (r == 0)
                r = toku_db_put(env->i->directory, child, &new_dname_dbt, &iname_dbt, 0);
            //Now that we have writelocks on both dnames, verify that there are still no handles open. (to prevent race conditions)
            if (r==0 && env_is_db_with_dname_open(env, dname))
                r = toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary with an open handle.\n");
            DB* zombie = NULL;
            if (r==0) {
                zombie = env_get_zombie_db_with_dname(env, dname);
                if (zombie)
                    r = toku_db_pre_acquire_table_lock(zombie, child, TRUE);
                if (r!=0 && r!=DB_LOCK_NOTGRANTED)
                    toku_ydb_do_error(env, r, "Cannot rename dictionary.\n");
            }
            if (r==0 && env_is_db_with_dname_open(env, newname))
                r = toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary; Dictionary with target name has an open handle.\n");
            if (r==0 && zombie) {
                //Update zombie in list if exists.
                env_note_zombie_db_closed(env, zombie);  // tell env that this db is no longer a zombie (it is completely closed)
                toku_free(zombie->i->dname);
                zombie->i->dname = toku_xstrdup(newname);
                env_note_zombie_db(env, zombie);  // tell env that this db is a zombie
            }
        }
    }

    if (using_txns) {
        // close txn
        if (r == 0) {  // commit
            r = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL, false);
            invariant(r==0);  // TODO panic
        }
        else {         // abort
            int r2 = toku_txn_abort(child, NULL, NULL, false);
            invariant(r2==0);  // TODO panic
        }
    }

    if (iname) toku_free(iname);
    return r;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
5710

5711 5712
// DB-handle form of dbrename: rename (fname, dbname) to newname through the
// handle's environment with no transaction, then close the handle regardless
// of the outcome.  The rename's error takes precedence over the close's.
static int
toku_db_rename(DB * db, const char *fname, const char *dbname, const char *newname, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
    int rename_result = toku_env_dbrename(db->dbenv, NULL, fname, dbname, newname, flags);
    int close_result  = toku_db_close(db, 0);
    return (rename_result != 0) ? rename_result : close_result;
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
5720

5721 5722 5723
//
// This function is the only way to set a descriptor of a DB.
//
5724
static int 
5725
toku_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t flags) {
5726
    HANDLE_PANICKED_DB(db);
5727
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
5728
    int r;
5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758
    TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL;
    DBT old_descriptor;
    BOOL is_db_hot_index  = ((flags & DB_IS_HOT_INDEX) != 0);

    toku_init_dbt(&old_descriptor);
    if (!db_opened(db) || !txn || !descriptor || (descriptor->size>0 && !descriptor->data)){
        r = EINVAL;
        goto cleanup;
    }
    if (txn->parent != NULL) {
        r = EINVAL; // cannot have a parent if you are a resetting op
        goto cleanup;
    }
    //
    // If the DB is created for the purpose of being a hot index, 
    // then do not grab a write lock on the directory when setting the
    // descriptor, because the hot index DB must not have a write
    // lock grabbed in order to work
    //
    if (is_db_hot_index) {
        r = toku_grab_read_lock_on_directory(db, txn);
        if (r != 0) { goto cleanup; }    
    }
    else {
        r = toku_db_pre_acquire_fileops_lock(db, txn);
        if (r != 0) { goto cleanup; }    
    }
    
    old_descriptor.size = db->descriptor->dbt.size;
    old_descriptor.data = toku_memdup(db->descriptor->dbt.data, db->descriptor->dbt.size);
5759
    r = toku_brt_change_descriptor(db->i->brt, &old_descriptor, descriptor, TRUE, ttxn);
5760 5761
cleanup:
    if (old_descriptor.data) toku_free(old_descriptor.data);
5762 5763 5764
    return r;
}

5765 5766
// Accept (and ignore) DB flags.  Matching BDB, setting any non-zero flag on an
// already opened db is rejected with EINVAL.
static int 
toku_db_set_flags(DB *db, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);

    /* the following matches BDB */
    return (db_opened(db) && flags != 0) ? EINVAL : 0;
}

5775 5776
// Report the db's flags, which are always 0.  Returns EINVAL if pflags is NULL.
static int 
toku_db_get_flags(DB *db, u_int32_t *pflags) {
    HANDLE_PANICKED_DB(db);
    if (pflags != NULL) {
        *pflags = 0;
        return 0;
    }
    return EINVAL;
}

5783 5784
// Set the db's page size by forwarding it to the brt as its node size.
static int 
toku_db_set_pagesize(DB *db, u_int32_t pagesize) {
    HANDLE_PANICKED_DB(db);
    return toku_brt_set_nodesize(db->i->brt, pagesize);
}
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
5789

5790 5791 5792 5793 5794 5795 5796
// Fetch the db's page size (the brt's node size) into *pagesize_ptr.
static int 
toku_db_get_pagesize(DB *db, u_int32_t *pagesize_ptr) {
    HANDLE_PANICKED_DB(db);
    return toku_brt_get_nodesize(db->i->brt, pagesize_ptr);
}

5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810
// Set the db's read page size by forwarding it to the brt as its basement
// node size.
static int 
toku_db_set_readpagesize(DB *db, u_int32_t readpagesize) {
    HANDLE_PANICKED_DB(db);
    return toku_brt_set_basementnodesize(db->i->brt, readpagesize);
}

static int 
toku_db_get_readpagesize(DB *db, u_int32_t *readpagesize_ptr) {
    HANDLE_PANICKED_DB(db);
    int r = toku_brt_get_basementnodesize(db->i->brt, readpagesize_ptr);
    return r;
}

5811 5812
// Fill *s with the brt's 64-bit statistics (key/data counts, data and file
// sizes, and the create/modify/verify timestamps).  txn may be NULL.
// *s is only written when toku_brt_stat64() succeeds; its result is returned.
static int 
toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    struct brtstat64_s brtstat;
    TOKUTXN tokutxn = (txn != NULL) ? db_txn_struct_i(txn)->tokutxn : NULL;

    int r = toku_brt_stat64(db->i->brt, tokutxn, &brtstat);
    if (r != 0)
        return r;

    s->bt_nkeys = brtstat.nkeys;
    s->bt_ndata = brtstat.ndata;
    s->bt_dsize = brtstat.dsize;
    s->bt_fsize = brtstat.fsize;
    // 4018 (presumably a ticket number - confirm)
    s->bt_create_time_sec = brtstat.create_time_sec;
    s->bt_modify_time_sec = brtstat.modify_time_sec;
    s->bt_verify_time_sec = brtstat.verify_time_sec;
    return r;
}
5833 5834 5835

// toku_db_stat64() run while holding the ydb lock.
static int 
locked_db_stat64 (DB *db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
    int result;
    toku_ydb_lock();
    result = toku_db_stat64(db, txn, s);
    toku_ydb_unlock();
    return result;
}

5842 5843
// Estimate how many keys are less than, equal to and greater than `key`.
//
// toku_brt_keyrange does not have a txn param; this will be fixed later.
// Temporarily, because the caller, locked_db_keyrange, has the ydb lock, we
// are ok.  On success *is_exact is set to 0, because brt_keyrange does not
// report exactness.
static int 
toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    int r = toku_brt_keyrange(db->i->brt, key, less, equal, greater);
    if (r == 0)
        *is_exact = 0;
    return r;
}

5859
static int
5860 5861 5862
toku_c_pre_acquire_range_lock(DBC *dbc, const DBT *key_left, const DBT *key_right) {
    DB *db = dbc->dbp;
    DB_TXN *txn = dbc_struct_i(dbc)->txn;
Yoni Fogel's avatar
Yoni Fogel committed
5863
    HANDLE_PANICKED_DB(db);
5864 5865 5866 5867 5868
    toku_brt_cursor_set_range_lock(dbc_struct_i(dbc)->c, key_left, key_right,
                                   (key_left == toku_lt_neg_infinity),
                                   (key_right == toku_lt_infinity));
    if (!db->i->lt || !txn)
        return 0;
5869
    //READ_UNCOMMITTED and READ_COMMITTED transactions do not need read locks.
5870 5871
    if (!dbc_struct_i(dbc)->rmw && dbc_struct_i(dbc)->iso != TOKU_ISO_SERIALIZABLE)
        return 0;
Yoni Fogel's avatar
Yoni Fogel committed
5872

5873 5874
    toku_lock_type lock_type = dbc_struct_i(dbc)->rmw ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ;
    int r = get_range_lock(db, txn, key_left, key_right, lock_type);
Yoni Fogel's avatar
Yoni Fogel committed
5875 5876 5877
    return r;
}

5878 5879
//static int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) {
// needed by loader.c
5880 5881
int 
toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn, BOOL just_lock) {
Yoni Fogel's avatar
Yoni Fogel committed
5882
    HANDLE_PANICKED_DB(db);
5883
    if (!db->i->lt || !txn) return 0;
Yoni Fogel's avatar
Yoni Fogel committed
5884 5885 5886

    int r;

5887
    r = get_range_lock(db, txn, toku_lt_neg_infinity, toku_lt_infinity, LOCK_REQUEST_WRITE);
5888

5889 5890
    if (r==0 && !just_lock &&
        !toku_brt_is_recovery_logging_suppressed(db->i->brt) &&
5891
        toku_brt_is_empty_fast(db->i->brt)
5892 5893 5894 5895 5896 5897
    ) {
        //Try to suppress both rollback and recovery logs
        DB_LOADER *loader;
        DB *dbs[1] = {db};
        uint32_t db_flags[1]  = {DB_NOOVERWRITE};
        uint32_t dbt_flags[1] = {0};
5898
        uint32_t loader_flags = DB_PRELOCKED_WRITE; //Don't recursively prelock
5899
        DB_ENV *env = db->dbenv;
5900 5901 5902 5903 5904 5905 5906
	DB_TXN *child = NULL;
	
	{
	    // begin child
	    int rt = toku_txn_begin(env, txn, &child, DB_TXN_NOSYNC, 1);
	    assert(rt==0);
	}
5907 5908

        toku_ydb_unlock(); //Cannot hold ydb lock when creating loader
5909 5910
	
        int r_loader = env->create_loader(env, child, &loader, NULL, 1, dbs, db_flags, dbt_flags, loader_flags);
5911
        if (r_loader==0) {
5912 5913 5914 5915
            r_loader = loader->set_error_callback(loader, NULL, NULL);
            assert(r_loader==0);
            r_loader = loader->set_poll_function(loader, NULL, NULL);
            assert(r_loader==0);
5916
            // close the loader
5917 5918 5919 5920
            r_loader = loader->close(loader);
	    if (r_loader==0) {
		toku_brt_suppress_recovery_logs(db->i->brt, db_txn_struct_i(child)->tokutxn);
	    }
5921 5922 5923 5924 5925 5926 5927
        }
        else if (r_loader != DB_LOCK_NOTGRANTED) {
            //Lock not granted is not an error.
            //It just means we cannot use the loader optimization.
            assert(r==0);
            r = r_loader;
        }
5928 5929 5930
	if (r_loader == 0) { // commit
	    r = locked_txn_commit(child, 0);
	    assert(r==0);
5931
	    logsuppress++;
5932 5933 5934 5935
	}
	else {  // abort
	    r = locked_txn_abort(child);
	    assert(r==0);
5936
	    logsuppressfail++;
5937
	}
5938
        toku_ydb_lock(); //Reaquire ydb lock.
5939 5940
    }

Yoni Fogel's avatar
Yoni Fogel committed
5941 5942
    return r;
}
Zardosht Kasheff's avatar
Zardosht Kasheff committed
5943

Yoni Fogel's avatar
Yoni Fogel committed
5944 5945 5946
//TODO: DB_AUTO_COMMIT.
//TODO: Nowait only conditionally?
//TODO: NOSYNC change to SYNC if DB_ENV has something in set_flags
5947 5948
// Begin an implicit transaction for a single operation when the caller did not
// supply one.
//
// If *txn is already set, or the environment was opened without DB_INIT_TXN,
// nothing is started and *changed is set to FALSE.  Otherwise a DB_TXN_NOWAIT
// transaction is begun into *txn (also DB_TXN_NOSYNC unless force_auto_commit
// is set or the environment has DB_AUTO_COMMIT) and *changed is set to TRUE.
// When toku_txn_begin() fails its error is returned and *changed is not written.
static inline int 
toku_db_construct_autotxn(DB* db, DB_TXN **txn, BOOL* changed, BOOL force_auto_commit) {
    assert(db && txn && changed);
    DB_ENV* env = db->dbenv;

    if (*txn || !(env->i->open_flags & DB_INIT_TXN)) {
        *changed = FALSE;
        return 0;
    }

    u_int32_t txn_flags = DB_TXN_NOWAIT;
    if (!force_auto_commit && !(env->i->open_flags & DB_AUTO_COMMIT))
        txn_flags |= DB_TXN_NOSYNC;

    int r = toku_txn_begin(env, NULL, txn, txn_flags, 1);
    if (r == 0)
        *changed = TRUE;
    return r;
}

5963 5964
// Finish an implicit transaction started by toku_db_construct_autotxn().
//
// When `changed` is false nothing is done and r is passed through.  Otherwise
// the transaction is committed if r is 0 (returning the commit's result), or
// aborted if r is non-zero (returning r; the abort's result is ignored).
static inline int 
toku_db_destruct_autotxn(DB_TXN *txn, int r, BOOL changed) {
    if (changed) {
        if (r != 0)
            toku_txn_abort(txn, NULL, NULL, false);
        else
            r = toku_txn_commit(txn, 0, NULL, NULL, false);
    }
    return r; 
}

5971 5972
// toku_db_close() run while holding the ydb lock.
static int 
locked_db_close(DB * db, u_int32_t flags) {
    int result;
    toku_ydb_lock();
    result = toku_db_close(db, flags);
    toku_ydb_unlock();
    return result;
}

5979 5980
// Open a cursor on db.  Cursors get no implicit transaction: in an environment
// opened with DB_INIT_TXN a NULL txn is rejected with EINVAL.
static inline int 
autotxn_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) {
    if (txn || !(db->dbenv->i->open_flags & DB_INIT_TXN))
        return toku_db_cursor(db, txn, c, flags, 0);

    return toku_ydb_do_error(db->dbenv, EINVAL,
          "Cursors in a transaction environment must have transactions.\n");
}

5988 5989
// Calls autotxn_db_cursor while holding the ydb lock.
static int
locked_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_cursor(db, txn, c, flags);
    toku_ydb_unlock();
    return rc;
}

5993 5994
// toku_db_del wrapped in an implicit transaction when the caller passed none
// (see toku_db_construct_autotxn / toku_db_destruct_autotxn).
static inline int
autotxn_db_del(DB* db, DB_TXN* txn, DBT* key, u_int32_t flags) {
    BOOL began_txn;
    int rc = toku_db_construct_autotxn(db, &txn, &began_txn, FALSE);
    if (rc == 0) {
        rc = toku_db_del(db, txn, key, flags);
        rc = toku_db_destruct_autotxn(txn, rc, began_txn);
    }
    return rc;
}

6002 6003
// Calls autotxn_db_del while holding the ydb lock.
static int
locked_db_del(DB * db, DB_TXN * txn, DBT * key, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_del(db, txn, key, flags);
    toku_ydb_unlock();
    return rc;
}

6007 6008
// toku_db_get wrapped in an implicit transaction when the caller passed none.
static inline int
autotxn_db_get(DB* db, DB_TXN* txn, DBT* key, DBT* data, u_int32_t flags) {
    BOOL began_txn;
    int rc = toku_db_construct_autotxn(db, &txn, &began_txn, FALSE);
    if (rc == 0) {
        rc = toku_db_get(db, txn, key, data, flags);
        rc = toku_db_destruct_autotxn(txn, rc, began_txn);
    }
    return rc;
}

6016 6017
// Calls autotxn_db_get while holding the ydb lock.
static int
locked_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_get(db, txn, key, data, flags);
    toku_ydb_unlock();
    return rc;
}

6021 6022
// db_getf_set wrapped in an implicit transaction when the caller passed none.
static inline int
autotxn_db_getf_set (DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    BOOL began_txn;
    int rc = toku_db_construct_autotxn(db, &txn, &began_txn, FALSE);
    if (rc == 0) {
        rc = db_getf_set(db, txn, flags, key, f, extra);
        rc = toku_db_destruct_autotxn(txn, rc, began_txn);
    }
    return rc;
}

6030 6031
// Calls autotxn_db_getf_set while holding the ydb lock.
static int
locked_db_getf_set (DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_getf_set(db, txn, flags, key, f, extra);
    toku_ydb_unlock();
    return rc;
}

6035
static int 
6036
locked_c_pre_acquire_range_lock(DBC *dbc, const DBT *key_left, const DBT *key_right) {
Yoni Fogel's avatar
Yoni Fogel committed
6037
    toku_ydb_lock();
6038
    int r = toku_c_pre_acquire_range_lock(dbc, key_left, key_right);
Yoni Fogel's avatar
Yoni Fogel committed
6039 6040 6041 6042
    toku_ydb_unlock();
    return r;
}

6043 6044
// Calls toku_db_pre_acquire_table_lock (third argument FALSE) while holding
// the ydb lock.
static int
locked_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_pre_acquire_table_lock(db, txn, FALSE);
    toku_ydb_unlock();
    return rc;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
6051 6052 6053 6054 6055 6056 6057
// Calls toku_db_pre_acquire_fileops_lock while holding the ydb lock.
static int
locked_db_pre_acquire_fileops_lock(DB *db, DB_TXN *txn) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_pre_acquire_fileops_lock(db, txn);
    toku_ydb_unlock();
    return rc;
}

6058 6059 6060 6061 6062 6063 6064
// Calls toku_grab_read_lock_on_directory while holding the ydb lock.
static int
locked_db_pre_acquire_fileops_shared_lock(DB *db, DB_TXN *txn) {
    int rc;
    toku_ydb_lock();
    rc = toku_grab_read_lock_on_directory(db, txn);
    toku_ydb_unlock();
    return rc;
}

6065 6066
// truncate a database
// effect: remove all of the rows from a database
6067 6068
static int 
toku_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) {
6069
    HANDLE_PANICKED_DB(db);
6070
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
6071 6072
    int r;

6073 6074 6075 6076 6077 6078 6079
    u_int32_t unhandled_flags = flags;
    int ignore_cursors = 0;
    if (flags & DB_TRUNCATE_WITHCURSORS) {
        ignore_cursors = 1;
        unhandled_flags &= ~DB_TRUNCATE_WITHCURSORS;
    }

6080
    // dont support flags (yet)
6081
    if (unhandled_flags)
6082
        return EINVAL;
6083 6084
    // dont support cursors unless explicitly told to
    if (!ignore_cursors && toku_brt_get_cursor_count(db->i->brt) > 0)
6085 6086 6087 6088
        return EINVAL;

    // acquire a table lock
    if (txn) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
6089 6090 6091 6092
        r = toku_db_pre_acquire_fileops_lock(db, txn);
        if (r != 0) {
            return r;
        }
6093
        r = toku_db_pre_acquire_table_lock(db, txn, TRUE);
Zardosht Kasheff's avatar
Zardosht Kasheff committed
6094
        if (r != 0) {
6095
            return r;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
6096
        }
6097 6098 6099 6100
    }

    *row_count = 0;

6101
    r = toku_brt_truncate(db->i->brt);
6102 6103 6104 6105

    return r;
}

6106 6107
// toku_db_open wrapped in an implicit transaction when the caller passed none.
// DB_AUTO_COMMIT in flags is passed to the autotxn constructor as
// force_auto_commit and is stripped before calling toku_db_open.
static inline int
autotxn_db_open(DB* db, DB_TXN* txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    BOOL began_txn;
    BOOL force_auto_commit = (BOOL)((flags & DB_AUTO_COMMIT) != 0);
    int rc = toku_db_construct_autotxn(db, &txn, &began_txn, force_auto_commit);
    if (rc == 0) {
        rc = toku_db_open(db, txn, fname, dbname, dbtype, flags & ~DB_AUTO_COMMIT, mode);
        rc = toku_db_destruct_autotxn(txn, rc, began_txn);
    }
    return rc;
}

6115 6116
// Calls autotxn_db_open while holding the multi-operation client lock and,
// nested inside it, the ydb lock.
static int
locked_db_open(DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    int rc;
    toku_multi_operation_client_lock();   // Cannot begin checkpoint
    toku_ydb_lock();
    rc = autotxn_db_open(db, txn, fname, dbname, dbtype, flags, mode);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); // Can now begin checkpoint
    return rc;
}

6123 6124
// toku_db_put wrapped in an implicit transaction when the caller passed none.
static inline int
autotxn_db_put(DB* db, DB_TXN* txn, DBT* key, DBT* data, u_int32_t flags) {
    BOOL began_txn;
    int rc = toku_db_construct_autotxn(db, &txn, &began_txn, FALSE);
    if (rc == 0) {
        rc = toku_db_put(db, txn, key, data, flags);
        rc = toku_db_destruct_autotxn(txn, rc, began_txn);
    }
    return rc;
}

6133 6134
// Checks available file system space (before taking the ydb lock); if that
// check returns nonzero it is returned and nothing is written.  Otherwise
// calls autotxn_db_put while holding the ydb lock.
static int
locked_db_put(DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) {
    int rc = env_check_avail_fs_space(db->dbenv);
    if (rc != 0) {
        return rc;
    }
    toku_ydb_lock();
    rc = autotxn_db_put(db, txn, key, data, flags);
    toku_ydb_unlock();
    return rc;
}

6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193
// toku_db_update wrapped in an implicit transaction when the caller passed none.
static inline int
autotxn_db_update(DB *db, DB_TXN *txn,
                  const DBT *key,
                  const DBT *update_function_extra,
                  u_int32_t flags) {
    BOOL began_txn;
    int rc = toku_db_construct_autotxn(db, &txn, &began_txn, FALSE);
    if (rc == 0) {
        rc = toku_db_update(db, txn, key, update_function_extra, flags);
        rc = toku_db_destruct_autotxn(txn, rc, began_txn);
    }
    return rc;
}

// Checks available file system space (before taking the ydb lock); on a
// nonzero result returns it.  Otherwise calls autotxn_db_update while holding
// the ydb lock.
static int
locked_db_update(DB *db, DB_TXN *txn,
                 const DBT *key,
                 const DBT *update_function_extra,
                 u_int32_t flags) {
    int rc = env_check_avail_fs_space(db->dbenv);
    if (rc == 0) {
        toku_ydb_lock();
        rc = autotxn_db_update(db, txn, key, update_function_extra, flags);
        toku_ydb_unlock();
    }
    return rc;
}

// toku_db_update_broadcast wrapped in an implicit transaction when the caller
// passed none.
static inline int
autotxn_db_update_broadcast(DB *db, DB_TXN *txn,
                            const DBT *update_function_extra,
                            u_int32_t flags) {
    BOOL began_txn;
    int rc = toku_db_construct_autotxn(db, &txn, &began_txn, FALSE);
    if (rc == 0) {
        rc = toku_db_update_broadcast(db, txn, update_function_extra, flags);
        rc = toku_db_destruct_autotxn(txn, rc, began_txn);
    }
    return rc;
}

// Checks available file system space (before taking the ydb lock); on a
// nonzero result returns it.  Otherwise calls autotxn_db_update_broadcast
// while holding the ydb lock.
static int
locked_db_update_broadcast(DB *db, DB_TXN *txn,
                           const DBT *update_function_extra,
                           u_int32_t flags) {
    int rc = env_check_avail_fs_space(db->dbenv);
    if (rc == 0) {
        toku_ydb_lock();
        rc = autotxn_db_update_broadcast(db, txn, update_function_extra, flags);
        toku_ydb_unlock();
    }
    return rc;
}

6194 6195
// Calls toku_db_remove while holding the multi-operation client lock and,
// nested inside it, the ydb lock.
static int
locked_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags) {
    int rc;
    toku_multi_operation_client_lock();   // Cannot begin checkpoint
    toku_ydb_lock();
    rc = toku_db_remove(db, fname, dbname, flags);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); // Can now begin checkpoint
    return rc;
}

6204 6205
// Calls toku_db_rename while holding the multi-operation client lock and,
// nested inside it, the ydb lock.
static int
locked_db_rename(DB * db, const char *namea, const char *nameb, const char *namec, u_int32_t flags) {
    int rc;
    toku_multi_operation_client_lock();   // Cannot begin checkpoint
    toku_ydb_lock();
    rc = toku_db_rename(db, namea, nameb, namec, flags);
    toku_ydb_unlock();
    toku_multi_operation_client_unlock(); // Can now begin checkpoint
    return rc;
}

6214
static int 
6215
locked_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t flags) {
6216
    toku_ydb_lock();
6217
    int r = toku_db_change_descriptor(db, txn, descriptor, flags);
6218 6219
    toku_ydb_unlock();
    return r;
6220 6221
}

6222 6223
// Forwards to the owning environment's set_errfile method.  Despite the
// "locked_" prefix this function does not itself take the ydb lock.
static void
locked_db_set_errfile (DB *db, FILE *errfile) {
    DB_ENV *owner = db->dbenv;
    owner->set_errfile(owner, errfile);
}

6227 6228
// Calls toku_db_set_flags while holding the ydb lock.
static int
locked_db_set_flags(DB *db, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_flags(db, flags);
    toku_ydb_unlock();
    return rc;
}

6232 6233
// Calls toku_db_get_flags while holding the ydb lock.
static int
locked_db_get_flags(DB *db, u_int32_t *flags) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_get_flags(db, flags);
    toku_ydb_unlock();
    return rc;
}

6237 6238
// Calls toku_db_set_pagesize while holding the ydb lock.
static int
locked_db_set_pagesize(DB *db, u_int32_t pagesize) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_pagesize(db, pagesize);
    toku_ydb_unlock();
    return rc;
}

6242 6243 6244 6245 6246
// Calls toku_db_get_pagesize while holding the ydb lock.
static int
locked_db_get_pagesize(DB *db, u_int32_t *pagesize_ptr) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_get_pagesize(db, pagesize_ptr);
    toku_ydb_unlock();
    return rc;
}

6247 6248 6249 6250 6251 6252 6253 6254 6255 6256
// Calls toku_db_set_readpagesize while holding the ydb lock.
static int
locked_db_set_readpagesize(DB *db, u_int32_t readpagesize) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_readpagesize(db, readpagesize);
    toku_ydb_unlock();
    return rc;
}

// Calls toku_db_get_readpagesize while holding the ydb lock.
static int
locked_db_get_readpagesize(DB *db, u_int32_t *readpagesize_ptr) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_get_readpagesize(db, readpagesize_ptr);
    toku_ydb_unlock();
    return rc;
}

6257
// TODO 2216 delete this
6258 6259
static int 
locked_db_fd(DB * UU(db), int * UU(fdp)) {
6260 6261 6262 6263 6264
    //    toku_ydb_lock(); 
    // int r = toku_db_fd(db, fdp); 
    //    toku_ydb_unlock(); 
    //    return r;
    return 0;
Rich Prohaska's avatar
Rich Prohaska committed
6265 6266
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
6267

Yoni Fogel's avatar
Yoni Fogel committed
6268 6269 6270 6271 6272
// Calls toku_db_key_range64 while holding the ydb lock.
static int
locked_db_key_range64(DB* db, DB_TXN* txn, DBT* dbt, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_key_range64(db, txn, dbt, less, equal, greater, is_exact);
    toku_ydb_unlock();
    return rc;
}

static const DBT* toku_db_dbt_pos_infty(void) __attribute__((pure));
6273 6274
static const DBT*
toku_db_dbt_pos_infty(void) {
Yoni Fogel's avatar
Yoni Fogel committed
6275 6276 6277 6278
    return toku_lt_infinity;
}

static const DBT* toku_db_dbt_neg_infty(void) __attribute__((pure));
6279 6280
static const DBT* 
toku_db_dbt_neg_infty(void) {
Yoni Fogel's avatar
Yoni Fogel committed
6281
    return toku_lt_neg_infinity;
Zardosht Kasheff's avatar
Zardosht Kasheff committed
6282 6283
}

6284 6285
// Calls toku_db_truncate while holding the checkpoint-safe client lock and,
// nested inside it, the ydb lock.
static int
locked_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) {
    int rc;
    toku_checkpoint_safe_client_lock();
    toku_ydb_lock();
    rc = toku_db_truncate(db, txn, row_count, flags);
    toku_ydb_unlock();
    toku_checkpoint_safe_client_unlock();
    return rc;
}

6294 6295 6296 6297 6298 6299 6300
// Runs toku_brt_optimize on the db's brt and returns its result
// (after the HANDLE_PANICKED_DB check).
static int
toku_db_optimize(DB *db) {
    HANDLE_PANICKED_DB(db);
    return toku_brt_optimize(db->i->brt);
}

6301 6302 6303
// Runs toku_brt_flatten on the db's brt (after the HANDLE_PANICKED_DB check).
// The TOKUTXN of txn is passed through, or NULL when txn is NULL.
static int
toku_db_flatten(DB *db, DB_TXN *txn) {
    HANDLE_PANICKED_DB(db);
    TOKUTXN ttxn = NULL;
    if (txn) {
        ttxn = db_txn_struct_i(txn)->tokutxn;
    }
    return toku_brt_flatten(db->i->brt, ttxn);
}

6309 6310
// toku_db_flatten wrapped in an implicit transaction when the caller passed none.
static inline int
autotxn_db_flatten(DB* db, DB_TXN* txn) {
    BOOL began_txn;
    int rc = toku_db_construct_autotxn(db, &txn, &began_txn, FALSE);
    if (rc == 0) {
        rc = toku_db_flatten(db, txn);
        rc = toku_db_destruct_autotxn(txn, rc, began_txn);
    }
    return rc;
}


6319 6320
// Calls autotxn_db_flatten while holding the ydb lock.
static int
locked_db_flatten(DB *db, DB_TXN *txn) {
    int rc;
    toku_ydb_lock();
    rc = autotxn_db_flatten(db, txn);
    toku_ydb_unlock();
    return rc;
}

6324 6325
// Calls toku_db_optimize while holding the ydb lock.
static int
locked_db_optimize(DB *db) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_optimize(db);
    toku_ydb_unlock();
    return rc;
}

6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350
// Fill in a fragmentation report for db via toku_brt_get_fragmentation.
// A db that is not open is rejected with EINVAL (reported through
// toku_ydb_do_error).
static int
db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) {
    HANDLE_PANICKED_DB(db);
    if (db_opened(db)) {
        return toku_brt_get_fragmentation(db->i->brt, report);
    }
    return toku_ydb_do_error(db->dbenv, EINVAL, "Fragmentation report available only on open DBs.\n");
}

// Calls db_get_fragmentation while holding the ydb lock.
static int
locked_db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) {
    int rc;
    toku_ydb_lock();
    rc = db_get_fragmentation(db, report);
    toku_ydb_unlock();
    return rc;
}

6351 6352
// Attach indexer to db, or detach the current one by passing NULL.
// Returns EINVAL (leaving the db unchanged) if an indexer is already attached
// and a non-NULL replacement is offered; returns 0 otherwise.
int
toku_db_set_indexer(DB *db, DB_INDEXER * indexer) {
    if (db->i->indexer != NULL && indexer != NULL) {
        // refusing to overwrite a valid indexer
        return EINVAL;
    }
    db->i->indexer = indexer;
    return 0;
}

// Calls toku_db_set_indexer while holding the ydb lock.
static int
locked_db_set_indexer(DB *db, DB_INDEXER *indexer) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_set_indexer(db, indexer);
    toku_ydb_unlock();
    return rc;
}

// Returns the indexer currently stored in db->i->indexer (may be NULL).
DB_INDEXER *
toku_db_get_indexer(DB *db) {
    DB_INDEXER *current = db->i->indexer;
    return current;
}

// Stores the db's current indexer in *indexer_ptr; the read is done while
// holding the ydb lock.
static void
locked_db_get_indexer(DB *db, DB_INDEXER **indexer_ptr) {
    toku_ydb_lock();
    *indexer_ptr = toku_db_get_indexer(db);
    toku_ydb_unlock();
}

6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403
// Carries the caller's progress callback and its argument through
// toku_verify_brt_with_progress to ydb_verify_progress_callback.
struct ydb_verify_context {
    int (*progress_callback)(void *extra, float progress); // user callback; may be NULL
    void *progress_extra;                                  // passed as `extra` to progress_callback
};

// Progress hook handed to toku_verify_brt_with_progress.
// Releases the ydb lock (toku_ydb_unlock_and_yield(1000)), invokes the user's
// callback if one was supplied, then re-takes the ydb lock before returning.
// Returns the user callback's result, or 0 when there is no callback.
static int
ydb_verify_progress_callback(void *extra, float progress) {
    struct ydb_verify_context *ctx = (struct ydb_verify_context *) extra;
    toku_ydb_unlock_and_yield(1000);
    int user_r = ctx->progress_callback
                     ? ctx->progress_callback(ctx->progress_extra, progress)
                     : 0;
    toku_ydb_lock();
    return user_r;
}

// Calls toku_verify_brt_with_progress on the db's brt while holding the ydb
// lock.  The user's callback is reached through ydb_verify_progress_callback.
static int
locked_db_verify_with_progress(DB *db, int (*progress_callback)(void *extra, float progress), void *progress_extra, int verbose, int keep_going) {
    struct ydb_verify_context ctx = {
        .progress_callback = progress_callback,
        .progress_extra    = progress_extra,
    };
    int rc;
    toku_ydb_lock();
    rc = toku_verify_brt_with_progress(db->i->brt, ydb_verify_progress_callback, &ctx, verbose, keep_going);
    toku_ydb_unlock();
    return rc;
}

6404 6405
// Allocate and initialize a DB handle belonging to env.
// Returns EINVAL if flags is nonzero, env is NULL, or env is not open;
// ENOMEM if either allocation fails; or the error from toku_brt_create.
// On success stores the new handle in *db and returns 0.  On any failure
// everything allocated here is freed and *db is not written.
static int
toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags) {
    if (flags || env == NULL || !env_opened(env)) {
        return EINVAL;
    }

    DB *MALLOC(result);
    if (result == 0) {
        return ENOMEM;
    }
    memset(result, 0, sizeof *result);
    result->dbenv = env;

    // Install the method table: each DB method points at its locked_db_*
    // wrapper.
#define SDB(name) result->name = locked_db_ ## name
    SDB(key_range64);
    SDB(close);
    SDB(cursor);
    SDB(del);
    SDB(get);
    //    SDB(key_range);
    SDB(open);
    SDB(put);
    SDB(update);
    SDB(update_broadcast);
    SDB(remove);
    SDB(rename);
    SDB(change_descriptor);
    SDB(set_errfile);
    SDB(set_pagesize);
    SDB(get_pagesize);
    SDB(set_readpagesize);
    SDB(get_readpagesize);
    SDB(set_flags);
    SDB(get_flags);
    SDB(stat64);
    SDB(fd);
    SDB(pre_acquire_table_lock);
    SDB(pre_acquire_fileops_lock);
    SDB(pre_acquire_fileops_shared_lock);
    SDB(truncate);
    SDB(row_size_supported);
    SDB(getf_set);
    SDB(flatten);
    SDB(optimize);
    SDB(get_fragmentation);
    SDB(set_indexer);
    SDB(get_indexer);
    SDB(verify_with_progress);
#undef SDB
    result->dbt_pos_infty = toku_db_dbt_pos_infty;
    result->dbt_neg_infty = toku_db_dbt_neg_infty;

    // Internal state starts zeroed, then the non-zero defaults are filled in.
    MALLOC(result->i);
    if (result->i == 0) {
        toku_free(result);
        return ENOMEM;
    }
    memset(result->i, 0, sizeof *result->i);
    result->i->dict_id = DICTIONARY_ID_NONE;
    result->i->opened = 0;
    result->i->open_flags = 0;
    result->i->open_mode = 0;
    result->i->brt = 0;
    result->i->indexer = NULL;
    result->i->refs = 1;
    toku_list_init(&result->i->dbs_that_must_close_before_abort);

    int r = toku_brt_create(&result->i->brt);
    if (r != 0) {
        toku_free(result->i);
        toku_free(result);
        return r;
    }

    *db = result;
    return 0;
}
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
6481

6482 6483
// Public db_create entry point: calls toku_db_create while holding the ydb lock.
int
DB_CREATE_FUN (DB ** db, DB_ENV * env, u_int32_t flags) {
    int rc;
    toku_ydb_lock();
    rc = toku_db_create(db, env, flags);
    toku_ydb_unlock();
    return rc;
}

/* need db_strerror_r for multiple threads */

6492 6493
char *
db_strerror(int error) {
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
6494 6495 6496 6497 6498 6499 6500
    char *errorstr;
    if (error >= 0) {
        errorstr = strerror(error);
        if (errorstr)
            return errorstr;
    }
    
6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515
    switch (error) {
        case DB_BADFORMAT:
            return "Database Bad Format (probably a corrupted database)";
        case DB_NOTFOUND:
            return "Not found";
        case TOKUDB_OUT_OF_LOCKS:
            return "Out of locks";
        case TOKUDB_DICTIONARY_TOO_OLD:
            return "Dictionary too old for this version of TokuDB";
        case TOKUDB_DICTIONARY_TOO_NEW:
            return "Dictionary too new for this version of TokuDB";
        case TOKUDB_CANCELED:
            return "User cancelled operation";
        case TOKUDB_NO_DATA:
            return "Ran out of data (not EOF)";
6516
    }
6517

Rich Prohaska's avatar
Rich Prohaska committed
6518
    static char unknown_result[100];    // Race condition if two threads call this at the same time. However even in a bad case, it should be some sort of null-terminated string.
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
6519 6520 6521 6522 6523
    errorstr = unknown_result;
    snprintf(errorstr, sizeof unknown_result, "Unknown error code: %d", error);
    return errorstr;
}

6524 6525
const char *
db_version(int *major, int *minor, int *patch) {
Bradley C. Kuszmaul's avatar
up  
Bradley C. Kuszmaul committed
6526 6527 6528 6529 6530 6531
    if (major)
        *major = DB_VERSION_MAJOR;
    if (minor)
        *minor = DB_VERSION_MINOR;
    if (patch)
        *patch = DB_VERSION_PATCH;
6532 6533 6534 6535 6536
#if defined(TOKUDB_REVISION)
#define xstr(X) str(X)
#define str(X) #X
    return "tokudb " xstr(DB_VERSION_MAJOR) "." xstr(DB_VERSION_MINOR) "." xstr(DB_VERSION_PATCH) " build " xstr(TOKUDB_REVISION);
#else
6537
#error
6538
#endif
Bradley C. Kuszmaul's avatar
Bradley C. Kuszmaul committed
6539
}
6540
 
6541 6542
// Forwards fsync_function to toku_set_func_fsync and returns its result.
int
db_env_set_func_fsync (int (*fsync_function)(int)) {
    int rc = toku_set_func_fsync(fsync_function);
    return rc;
}
Yoni Fogel's avatar
Yoni Fogel committed
6545

6546 6547
// Forwards pwrite_function to toku_set_func_pwrite and returns its result.
int
db_env_set_func_pwrite (ssize_t (*pwrite_function)(int, const void *, size_t, toku_off_t)) {
    int rc = toku_set_func_pwrite(pwrite_function);
    return rc;
}
6550 6551 6552

// Forwards pwrite_function to toku_set_func_full_pwrite and returns its result.
int
db_env_set_func_full_pwrite (ssize_t (*pwrite_function)(int, const void *, size_t, toku_off_t)) {
    int rc = toku_set_func_full_pwrite(pwrite_function);
    return rc;
}
6555 6556 6557

// Forwards write_function to toku_set_func_write and returns its result.
int
db_env_set_func_write (ssize_t (*write_function)(int, const void *, size_t)) {
    int rc = toku_set_func_write(write_function);
    return rc;
}
6560 6561 6562

// Forwards write_function to toku_set_func_full_write and returns its result.
int
db_env_set_func_full_write (ssize_t (*write_function)(int, const void *, size_t)) {
    int rc = toku_set_func_full_write(write_function);
    return rc;
}
6565

6566 6567
// Forwards fdopen_function to toku_set_func_fdopen and returns its result.
int
db_env_set_func_fdopen (FILE * (*fdopen_function)(int, const char *)) {
    int rc = toku_set_func_fdopen(fdopen_function);
    return rc;
}
6570 6571 6572

// Forwards fopen_function to toku_set_func_fopen and returns its result.
int
db_env_set_func_fopen (FILE * (*fopen_function)(const char *, const char *)) {
    int rc = toku_set_func_fopen(fopen_function);
    return rc;
}
6575 6576 6577

// Forwards open_function to toku_set_func_open and returns its result.
int
db_env_set_func_open (int (*open_function)(const char *, int, int)) {
    int rc = toku_set_func_open(open_function);
    return rc;
}
6580 6581 6582

// Forwards fclose_function to toku_set_func_fclose and returns its result.
int
db_env_set_func_fclose (int (*fclose_function)(FILE*)) {
    int rc = toku_set_func_fclose(fclose_function);
    return rc;
}

6586 6587 6588 6589 6590
// Forwards fun to toku_set_func_pread and returns its result.
int
db_env_set_func_pread (ssize_t (*fun)(int, void *, size_t, off_t)) {
    int rc = toku_set_func_pread(fun);
    return rc;
}

6591 6592 6593 6594 6595
// Forwards fwrite_fun to brtloader_set_os_fwrite.
void
db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) {
    brtloader_set_os_fwrite(fwrite_fun);
    return;
}

6596 6597
// Passes f to toku_set_func_malloc.  Always returns 0.
int
db_env_set_func_malloc (void *(*f)(size_t)) {
    const int ok = 0;
    toku_set_func_malloc(f);
    return ok;
}
6601 6602 6603

// Passes f to toku_set_func_realloc.  Always returns 0.
int
db_env_set_func_realloc (void *(*f)(void*, size_t)) {
    const int ok = 0;
    toku_set_func_realloc(f);
    return ok;
}
6607 6608 6609

// Passes f to toku_set_func_free.  Always returns 0.
int
db_env_set_func_free (void (*f)(void*)) {
    const int ok = 0;
    toku_set_func_free(f);
    return ok;
}
6613

6614

6615
// Got to call dlmalloc, or else it won't get included.
6616 6617
void 
setup_dlmalloc (void) {
6618 6619 6620 6621
    db_env_set_func_malloc(dlmalloc);
    db_env_set_func_realloc(dlrealloc);
    db_env_set_func_free(dlfree);
}
6622 6623

// For test purposes only.
6624
// With this interface, all checkpoint users get the same callbacks and the same extras.
6625 6626
void 
db_env_set_checkpoint_callback (void (*callback_f)(void*), void* extra) {
6627
    toku_checkpoint_safe_client_lock();
6628 6629
    checkpoint_callback_f = callback_f;
    checkpoint_callback_extra = extra;
6630
    toku_checkpoint_safe_client_unlock();
6631
    //printf("set callback = %p, extra = %p\n", callback_f, extra);
6632
}
6633 6634 6635

// Stores callback_f / extra in the file-level checkpoint_callback2_f /
// checkpoint_callback2_extra variables while holding the checkpoint-safe
// client lock.
void
db_env_set_checkpoint_callback2 (void (*callback_f)(void*), void* extra) {
    toku_checkpoint_safe_client_lock();
    checkpoint_callback2_extra = extra;
    checkpoint_callback2_f = callback_f;
    toku_checkpoint_safe_client_unlock();
}
Yoni Fogel's avatar
Yoni Fogel committed
6642

6643 6644
// Forwards callback_f and extra to toku_recover_set_callback.
void
db_env_set_recover_callback (void (*callback_f)(void*), void* extra) {
    toku_recover_set_callback(callback_f, extra);
    return;
}

6648 6649
// Forwards callback_f and extra to toku_recover_set_callback2.
void
db_env_set_recover_callback2 (void (*callback_f)(void*), void* extra) {
    toku_recover_set_callback2(callback_f, extra);
    return;
}

Zardosht Kasheff's avatar
Zardosht Kasheff committed
6653 6654 6655 6656 6657
// Forwards callback_f and extra to toku_flusher_thread_set_callback.
void
db_env_set_flusher_thread_callback(void (*callback_f)(int, void*), void* extra) {
    toku_flusher_thread_set_callback(callback_f, extra);
    return;
}

6658 6659
// Forwards factor to toku_brtloader_set_size_factor.
void
db_env_set_loader_size_factor (uint32_t factor) {
    toku_brtloader_set_size_factor(factor);
    return;
}

6663 6664
void 
db_env_set_mvcc_garbage_collection_verification(u_int32_t verification_mode) {
Zardosht Kasheff's avatar
Zardosht Kasheff committed
6665 6666
    garbage_collection_debug = (verification_mode != 0);
}
6667

6668 6669 6670 6671 6672
// Purpose: allow test programs that expect to fail to suppress engine status output on failed assert.
// Copies enable into the engine_status_enable variable.
void
db_env_enable_engine_status(uint32_t enable) {
    engine_status_enable = enable;
    return;
}
6673

Yoni Fogel's avatar
Yoni Fogel committed
6674 6675 6676 6677 6678 6679 6680 6681 6682 6683
// HACK: To ensure toku_pthread_yield gets included in the .so
// non-static would require a prototype in a header
// static (since unused) would give a warning
// static + unused would not actually help toku_pthread_yield get in the .so
// static + used avoids all the warnings and makes sure toku_pthread_yield is in the .so
// Exists only to reference toku_pthread_yield (see the HACK note above).
static void __attribute__((__used__))
include_toku_pthread_yield (void) {
    toku_pthread_yield();
    return;
}

6684 6685 6686 6687

// For test purposes only, translate dname to iname
// Look up dname_dbt in the environment's directory DB and return the value in
// iname_dbt.  The lookup runs under the ydb lock, with no caller transaction
// and flags DB_SERIALIZABLE|DB_PRELOCKED.
static int
env_get_iname(DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt) {
    int rc;
    toku_ydb_lock();
    DB *dir_db = env->i->directory;
    rc = autotxn_db_get(dir_db, NULL, dname_dbt, iname_dbt, DB_SERIALIZABLE|DB_PRELOCKED); // allocates memory for iname
    toku_ydb_unlock();
    return rc;
}

6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707
/* Following functions (ydb_load_xxx()) are used by loader:
 */


// When the loader is created, it makes this call.
// For each dictionary to be loaded, replace old iname in directory
// with a newly generated iname.  This will also take a write lock
// on the directory entries.  The write lock will be released when
// the transaction of the loader is completed.
// If the transaction commits, the new inames are in place.
// If the transaction aborts, the old inames will be restored.
// The new inames are returned to the caller.  
// It is the caller's responsibility to free them.
6708 6709
// If "mark_as_loader" is true, then include a mark in the iname
// to indicate that the file is created by the brt loader.
6710 6711
// Return 0 on success (could fail if write lock not available).
int
6712
ydb_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[N], char * new_inames_in_env[N], LSN *load_lsn, BOOL mark_as_loader) {
6713 6714 6715 6716 6717
    int rval;
    int i;
    
    int using_txns = env->i->open_flags & DB_INIT_TXN;
    DB_TXN * child = NULL;
6718
    TXNID xid = 0;
6719 6720 6721
    DBT dname_dbt;  // holds dname
    DBT iname_dbt;  // holds new iname
    
6722 6723 6724
    char * mark;

    if (mark_as_loader)
6725
	mark = "B";
6726 6727 6728
    else
	mark = "P";

6729 6730 6731 6732
    for (i=0; i<N; i++) {
	new_inames_in_env[i] = NULL;
    }

6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744
    // begin child (unless transactionless)
    if (using_txns) {
	rval = toku_txn_begin(env, txn, &child, DB_TXN_NOSYNC, 1);
	assert(rval == 0);
	xid = toku_txn_get_txnid(db_txn_struct_i(child)->tokutxn);
    }
    for (i = 0; i < N; i++) {
	char * dname = dbs[i]->i->dname;
	toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
	// now create new iname
	char hint[strlen(dname) + 1];
	create_iname_hint(dname, hint);
6745
	char * new_iname = create_iname(env, xid, hint, mark, i);               // allocates memory for iname_in_env
6746
	new_inames_in_env[i] = new_iname;
6747
        toku_fill_dbt(&iname_dbt, new_iname, strlen(new_iname) + 1);      // iname_in_env goes in directory
6748
        rval = toku_db_put(env->i->directory, child, &dname_dbt, &iname_dbt, 0);
6749
	if (rval) break;
Yoni Fogel's avatar
Yoni Fogel committed
6750 6751 6752 6753 6754 6755
    }

    // Generate load log entries.
    if (!rval && using_txns) {
        TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
        int do_fsync = 0;
6756
        LSN *get_lsn = NULL;
Yoni Fogel's avatar
Yoni Fogel committed
6757 6758 6759
        for (i = 0; i < N; i++) {
            BRT brt  = dbs[i]->i->brt;
            //Fsync is necessary for the last one only.
6760 6761 6762 6763 6764
            if (i==N-1) {
                do_fsync = 1; //We only need a single fsync of logs.
                get_lsn  = load_lsn; //Set pointer to capture the last lsn.
            }
            rval = toku_brt_load(brt, ttxn, new_inames_in_env[i], do_fsync, get_lsn);
Yoni Fogel's avatar
Yoni Fogel committed
6765 6766
            if (rval) break;
        }
6767 6768 6769 6770 6771
    }
	
    if (using_txns) {
	// close txn
	if (rval == 0) {  // all well so far, commit child
6772
	    rval = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL, false);
6773 6774 6775
	    assert(rval==0);
	}
	else {         // abort child
6776
	    int r2 = toku_txn_abort(child, NULL, NULL, false);
6777 6778
	    assert(r2==0);
	    for (i=0; i<N; i++) {
6779 6780 6781 6782
		if (new_inames_in_env[i]) {
		    toku_free(new_inames_in_env[i]);
		    new_inames_in_env[i] = NULL;
		}
6783 6784 6785 6786 6787 6788 6789
	    }
	}
    }

    return rval;
}

6790
int
6791
locked_ydb_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[N], char * new_inames_in_env[N], LSN *load_lsn, BOOL mark_as_loader) {
6792
    toku_ydb_lock();
6793
    int r = ydb_load_inames(env, txn, N, dbs, new_inames_in_env, load_lsn, mark_as_loader);
6794 6795 6796 6797
    toku_ydb_unlock();
    return r;
}

6798 6799 6800 6801
// TODO 2216:  Patch out this (dangerous) function when loader is working and 
//             we don't need to test the low-level redirect anymore.
// for use by test programs only, just a wrapper around brt call:
int
6802
toku_test_db_redirect_dictionary(DB * db, char * dname_of_new_file, DB_TXN *dbtxn) {
6803 6804 6805 6806 6807 6808 6809 6810 6811 6812
    int r;
    DBT dname_dbt;
    DBT iname_dbt;
    char * new_iname_in_env;

    BRT brt = db->i->brt;
    TOKUTXN tokutxn = db_txn_struct_i(dbtxn)->tokutxn;

    toku_fill_dbt(&dname_dbt, dname_of_new_file, strlen(dname_of_new_file)+1);
    init_dbt_realloc(&iname_dbt);  // sets iname_dbt.data = NULL
6813
    r = toku_db_get(db->dbenv->i->directory, dbtxn, &dname_dbt, &iname_dbt, DB_SERIALIZABLE);  // allocates memory for iname
6814 6815 6816
    assert(r==0);
    new_iname_in_env = iname_dbt.data;

Yoni Fogel's avatar
Yoni Fogel committed
6817
    r = toku_dictionary_redirect(new_iname_in_env, brt, tokutxn);
6818 6819 6820 6821

    toku_free(new_iname_in_env);
    return r;
}
6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832

// Test-only function.
// Returns the numeric value of the logger's last LSN, or the value of
// ZERO_LSN when env is NULL or the environment has no logger.
uint64_t
toku_test_get_latest_lsn(DB_ENV *env) {
    LSN latest = ZERO_LSN;

    if (!env || !env->i->logger) {
        return latest.lsn;
    }

    latest = toku_logger_last_lsn(env->i->logger);
    return latest.lsn;
}

6833 6834
// Forwards to toku_cachetable_get_checkpointing_user_data_status() and
// returns its result unchanged.
int
toku_test_get_checkpointing_user_data_status (void) {
    int status = toku_cachetable_get_checkpointing_user_data_status();
    return status;
}
Zardosht Kasheff's avatar
Zardosht Kasheff committed
6837

6838 6839
// acquire a point write lock on the key for a given txn.
// this does not block the calling thread.
6840
int
6841 6842 6843 6844 6845 6846 6847 6848
toku_grab_write_lock (DB *db, DBT *key, TOKUTXN tokutxn) {
    DB_TXN *txn = toku_txn_get_container_db_txn(tokutxn);
    DB_TXN *txn_anc = toku_txn_ancestor(txn);
    int r = toku_txn_add_lt(txn_anc, db->i->lt);
    if (r == 0) {
        TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn);
        r = toku_lt_acquire_write_lock(db->i->lt, db, txn_anc_id, key);
    }
6849 6850
    return r;
}