Commit 1fd6c8ea authored by Rich Prohaska

#50 count long tail events like long fsyncs, checkpoints, and lock tree waits

parent 39a68da2
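The change applies one accounting pattern to three long-tail sources: fsync (toku_long_fsync_*), checkpoint begin (CP_LONG_BEGIN_*), and lock tree waits (LTM_LONG_WAIT_*). Each site times the operation in microseconds, adds the duration to a running total, and, when the duration reaches a one-second threshold (1,000,000 microseconds), also bumps a separate "long" count and time. A minimal standalone sketch of the pattern is below; the names and the gettimeofday()-based clock are illustrative stand-ins, not the tree's own functions.

```cpp
#include <stdint.h>
#include <sys/time.h>

// Illustrative counters; each subsystem in the diff keeps its own set.
static uint64_t op_count, op_time;                    // every operation
static uint64_t long_op_count, long_op_time;          // operations >= threshold
static const uint64_t long_threshold_usec = 1000000;  // 1 second, as in the diff

// Stand-in for toku_current_time_microsec().
static uint64_t now_usec(void) {
    struct timeval tv;
    gettimeofday(&tv, nullptr);
    return (uint64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}

// Time one operation and account for it, flagging the long tail separately.
template <typename F>
static void timed_op(F op) {
    uint64_t t_start = now_usec();
    op();
    uint64_t duration = now_usec() - t_start;
    op_count += 1;
    op_time += duration;
    if (duration >= long_threshold_usec) {
        long_op_count += 1;
        long_op_time += duration;
    }
    // usage: timed_op([&]{ fsync(fd); });
}
```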
......@@ -163,6 +163,11 @@ status_init(void) {
STATUS_INIT(CP_WAITERS_MAX, nullptr, UINT64, "waiters max", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_CLIENT_WAIT_ON_MO, nullptr, UINT64, "non-checkpoint client wait on mo lock", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_CLIENT_WAIT_ON_CS, nullptr, UINT64, "non-checkpoint client wait on cs lock", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_BEGIN_TIME, nullptr, UINT64, "checkpoint begin time", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_LONG_BEGIN_COUNT, nullptr, UINT64, "long checkpoint begin count", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_LONG_BEGIN_TIME, nullptr, UINT64, "long checkpoint begin time", TOKU_ENGINE_STATUS);
cp_status.initialized = true;
}
#undef STATUS_INIT
......@@ -188,12 +193,11 @@ static toku_pthread_rwlock_t low_priority_multi_operation_lock;
static bool initialized = false; // sanity check
static volatile bool locked_mo = false; // true when the multi_operation write lock is held (by checkpoint)
static volatile bool locked_cs = false; // true when the checkpoint_safe write lock is held (by checkpoint)
static volatile uint64_t toku_checkpoint_long_threshold = 1000000;
// Note following static functions are called from checkpoint internal logic only,
// and use the "writer" calls for locking and unlocking.
static void
multi_operation_lock_init(void) {
pthread_rwlockattr_t attr;
......@@ -335,7 +339,9 @@ toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger,
SET_CHECKPOINT_FOOTPRINT(30);
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN) = time(NULL);
uint64_t t_checkpoint_begin_start = toku_current_time_microsec();
toku_cachetable_begin_checkpoint(cp, logger);
uint64_t t_checkpoint_begin_end = toku_current_time_microsec();
toku_ft_open_close_unlock();
multi_operation_checkpoint_unlock();
......@@ -357,6 +363,12 @@ toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger,
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END) = time(NULL);
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN_COMPLETE) = STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN);
STATUS_VALUE(CP_CHECKPOINT_COUNT)++;
uint64_t duration = t_checkpoint_begin_end - t_checkpoint_begin_start;
STATUS_VALUE(CP_BEGIN_TIME) += duration;
if (duration >= toku_checkpoint_long_threshold) {
STATUS_VALUE(CP_LONG_BEGIN_TIME) += duration;
STATUS_VALUE(CP_LONG_BEGIN_COUNT) += 1;
}
STATUS_VALUE(CP_FOOTPRINT) = 0;
checkpoint_safe_checkpoint_unlock();
......
......@@ -185,7 +185,10 @@ typedef enum {
CP_WAITERS_MAX, // max threads ever simultaneously waiting for the checkpoint_safe lock to perform a checkpoint
CP_CLIENT_WAIT_ON_MO, // how many times a client thread waited to take the multi_operation lock, not for checkpoint
CP_CLIENT_WAIT_ON_CS, // how many times a client thread waited for the checkpoint_safe lock, not for checkpoint
CP_STATUS_NUM_ROWS // number of rows in this status array
CP_BEGIN_TIME,
CP_LONG_BEGIN_TIME,
CP_LONG_BEGIN_COUNT,
CP_STATUS_NUM_ROWS // number of rows in this status array. must be last.
} cp_status_entry;
typedef struct {
......
......@@ -245,6 +245,7 @@ void lock_request::calculate_cond_wakeup_time(struct timespec *ts) {
// sleep on the lock request until it becomes resolved or the wait time has elapsed.
int lock_request::wait(void) {
uint64_t t_start = toku_current_time_microsec();
toku_mutex_lock(&m_info->mutex);
while (m_state == state::PENDING) {
struct timespec ts;
......@@ -259,6 +260,14 @@ int lock_request::wait(void) {
complete(DB_LOCK_NOTGRANTED);
}
}
uint64_t t_end = toku_current_time_microsec();
uint64_t duration = t_end - t_start;
m_info->wait_count += 1;
m_info->wait_time += duration;
if (duration >= 1000000) {
m_info->long_wait_count += 1;
m_info->long_wait_time += duration;
}
toku_mutex_unlock(&m_info->mutex);
invariant(m_state == state::COMPLETE);
......
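In lock_request::wait(), the new counters are updated while the request-info mutex is already held for the wait loop, so no extra synchronization is added. A simplified model of that per-locktree bookkeeping, with std::mutex standing in for toku_mutex_t and the one-second threshold hard-coded as in the diff:

```cpp
#include <stdint.h>
#include <mutex>

// Simplified stand-in for the locktree's lock-request info struct.
struct lock_request_info {
    std::mutex mutex;
    uint64_t wait_count = 0, wait_time = 0;
    uint64_t long_wait_count = 0, long_wait_time = 0;
};

// Record one lock wait of `duration` microseconds, as lock_request::wait()
// now does while it already holds the info mutex.
static void record_lock_wait(lock_request_info *info, uint64_t duration) {
    std::lock_guard<std::mutex> guard(info->mutex);
    info->wait_count += 1;
    info->wait_time += duration;
    if (duration >= 1000000) {  // hard-coded 1 second threshold, as in the diff
        info->long_wait_count += 1;
        info->long_wait_time += duration;
    }
}
```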
......@@ -115,7 +115,11 @@ enum {
LTM_STO_NUM_ELIGIBLE,
LTM_STO_END_EARLY_COUNT,
LTM_STO_END_EARLY_TIME,
LTM_STATUS_NUM_ROWS
LTM_WAIT_COUNT,
LTM_WAIT_TIME,
LTM_LONG_WAIT_COUNT,
LTM_LONG_WAIT_TIME,
LTM_STATUS_NUM_ROWS // must be last
};
typedef struct {
......@@ -187,6 +191,8 @@ public:
omt<lock_request *> pending_lock_requests;
toku_mutex_t mutex;
bool should_retry_lock_requests;
uint64_t wait_count, wait_time;
uint64_t long_wait_count, long_wait_time;
};
// Private info struct for storing pending lock request state.
......
......@@ -368,7 +368,7 @@ bool locktree::manager::memory_tracker::out_of_locks(void) const {
void locktree::manager::status_init(void) {
STATUS_INIT(LTM_SIZE_CURRENT, LOCKTREE_MEMORY_SIZE, UINT64, "memory size", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_SIZE_LIMIT, LOCKTREE_MEMORY_SIZE_LIMIT, UINT64, "memory size limit", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
    STATUS_INIT(LTM_ESCALATION_COUNT, LOCKTREE_ESCALATION_NUM, UINT64, "number of times lock escalation ran", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_ESCALATION_TIME, LOCKTREE_ESCALATION_SECONDS, TOKUTIME, "time spent running escalation (seconds)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_ESCALATION_LATEST_RESULT, LOCKTREE_LATEST_POST_ESCALATION_MEMORY_SIZE, UINT64, "latest post-escalation memory size", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_NUM_LOCKTREES, LOCKTREE_OPEN_CURRENT, UINT64, "number of locktrees open now", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
......@@ -376,6 +376,12 @@ void locktree::manager::status_init(void) {
STATUS_INIT(LTM_STO_NUM_ELIGIBLE, LOCKTREE_STO_ELIGIBLE_NUM, UINT64, "number of locktrees eligible for the STO", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_STO_END_EARLY_COUNT, LOCKTREE_STO_ENDED_NUM, UINT64, "number of times a locktree ended the STO early", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_STO_END_EARLY_TIME, LOCKTREE_STO_ENDED_SECONDS, TOKUTIME, "time spent ending the STO early (seconds)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_WAIT_COUNT, LOCKTREE_WAIT_COUNT, UINT64, "number of wait locks", TOKU_ENGINE_STATUS);
STATUS_INIT(LTM_WAIT_TIME, LOCKTREE_WAIT_TIME, UINT64, "time waiting for locks", TOKU_ENGINE_STATUS);
STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, UINT64, "number of long wait locks ", TOKU_ENGINE_STATUS);
STATUS_INIT(LTM_LONG_WAIT_TIME, LOCKTREE_LONG_WAIT_TIME, UINT64, "long time waiting for locks", TOKU_ENGINE_STATUS);
status.initialized = true;
}
......@@ -399,6 +405,9 @@ void locktree::manager::get_status(LTM_STATUS statp) {
uint64_t sto_num_eligible = 0;
uint64_t sto_end_early_count = 0;
tokutime_t sto_end_early_time = 0;
uint64_t lock_wait_count = 0, lock_wait_time = 0;
uint64_t long_lock_wait_count = 0, long_lock_wait_time = 0;
size_t num_locktrees = m_locktree_map.size();
for (size_t i = 0; i < num_locktrees; i++) {
......@@ -407,7 +416,11 @@ void locktree::manager::get_status(LTM_STATUS statp) {
invariant_zero(r);
toku_mutex_lock(&lt->m_lock_request_info.mutex);
lock_requests_pending += lt->get_lock_request_info()->pending_lock_requests.size();
lock_requests_pending += lt->m_lock_request_info.pending_lock_requests.size();
lock_wait_count += lt->m_lock_request_info.wait_count;
lock_wait_time += lt->m_lock_request_info.wait_time;
long_lock_wait_count += lt->m_lock_request_info.long_wait_count;
long_lock_wait_time += lt->m_lock_request_info.long_wait_time;
toku_mutex_unlock(&lt->m_lock_request_info.mutex);
sto_num_eligible += lt->sto_txnid_is_valid_unsafe() ? 1 : 0;
......@@ -422,6 +435,10 @@ void locktree::manager::get_status(LTM_STATUS statp) {
STATUS_VALUE(LTM_STO_NUM_ELIGIBLE) = sto_num_eligible;
STATUS_VALUE(LTM_STO_END_EARLY_COUNT) = sto_end_early_count;
STATUS_VALUE(LTM_STO_END_EARLY_TIME) = sto_end_early_time;
STATUS_VALUE(LTM_WAIT_COUNT) = lock_wait_count;
STATUS_VALUE(LTM_WAIT_TIME) = lock_wait_time;
STATUS_VALUE(LTM_LONG_WAIT_COUNT) = long_lock_wait_count;
STATUS_VALUE(LTM_LONG_WAIT_TIME) = long_lock_wait_time;
*statp = status;
}
#undef STATUS_VALUE
......
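get_status() then snapshots the per-locktree counters into the four new LTM_WAIT_* and LTM_LONG_WAIT_* rows by walking the open locktrees and summing under each tree's request-info mutex. A rough standalone sketch of that aggregation, where lt_wait_info is an illustrative stand-in for the lock-request info struct:

```cpp
#include <stdint.h>
#include <mutex>
#include <vector>

// Minimal per-locktree counters (see the lock_request::wait() sketch above).
struct lt_wait_info {
    std::mutex mutex;
    uint64_t wait_count = 0, wait_time = 0;
    uint64_t long_wait_count = 0, long_wait_time = 0;
};

// Sum the counters across all open locktrees under each tree's mutex,
// mirroring how get_status() fills the LTM_WAIT_* / LTM_LONG_WAIT_* rows.
static void sum_lock_waits(const std::vector<lt_wait_info *> &trees,
                           uint64_t *wait_count, uint64_t *wait_time,
                           uint64_t *long_wait_count, uint64_t *long_wait_time) {
    *wait_count = *wait_time = *long_wait_count = *long_wait_time = 0;
    for (lt_wait_info *info : trees) {
        std::lock_guard<std::mutex> guard(info->mutex);
        *wait_count += info->wait_count;
        *wait_time += info->wait_time;
        *long_wait_count += info->long_wait_count;
        *long_wait_time += info->long_wait_time;
    }
}
```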
......@@ -439,23 +439,22 @@ void toku_os_recursive_delete(const char *path) {
assert_zero(r);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// fsync logic:
// t_fsync exists for testing purposes only
static int (*t_fsync)(int) = 0;
static uint64_t toku_fsync_count;
static uint64_t toku_fsync_time;
static uint64_t sched_fsync_count;
static uint64_t sched_fsync_time;
static uint64_t toku_long_fsync_threshold = 1000000;
static uint64_t toku_long_fsync_count;
static uint64_t toku_long_fsync_time;
void toku_set_func_fsync(int (*fsync_function)(int)) {
t_fsync = fsync_function;
}
// keep trying if fsync fails because of EINTR
static void file_fsync_internal (int fd, uint64_t *duration_p) {
static void file_fsync_internal (int fd) {
uint64_t tstart = toku_current_time_microsec();
int r = -1;
while (r != 0) {
......@@ -471,13 +470,14 @@ static void file_fsync_internal (int fd, uint64_t *duration_p) {
toku_sync_fetch_and_add(&toku_fsync_count, 1);
uint64_t duration = toku_current_time_microsec() - tstart;
toku_sync_fetch_and_add(&toku_fsync_time, duration);
if (duration_p) {
*duration_p = duration;
if (duration >= toku_long_fsync_threshold) {
toku_sync_fetch_and_add(&toku_long_fsync_count, 1);
toku_sync_fetch_and_add(&toku_long_fsync_time, duration);
}
}
void toku_file_fsync_without_accounting(int fd) {
file_fsync_internal(fd, NULL);
file_fsync_internal(fd);
}
void toku_fsync_dirfd_without_accounting(DIR *dir) {
......@@ -502,22 +502,16 @@ int toku_fsync_dir_by_name_without_accounting(const char *dir_name) {
// include fsync in scheduling accounting
void toku_file_fsync(int fd) {
uint64_t duration;
file_fsync_internal (fd, &duration);
toku_sync_fetch_and_add(&sched_fsync_count, 1);
toku_sync_fetch_and_add(&sched_fsync_time, duration);
file_fsync_internal (fd);
}
// for real accounting
void toku_get_fsync_times(uint64_t *fsync_count, uint64_t *fsync_time) {
void toku_get_fsync_times(uint64_t *fsync_count, uint64_t *fsync_time, uint64_t *long_fsync_threshold, uint64_t *long_fsync_count, uint64_t *long_fsync_time) {
*fsync_count = toku_fsync_count;
*fsync_time = toku_fsync_time;
}
// for scheduling algorithm only
void toku_get_fsync_sched(uint64_t *fsync_count, uint64_t *fsync_time) {
*fsync_count = sched_fsync_count;
*fsync_time = sched_fsync_time;
*long_fsync_threshold = toku_long_fsync_threshold;
*long_fsync_count = toku_long_fsync_count;
*long_fsync_time = toku_long_fsync_time;
}
int toku_fsync_directory(const char *fname) {
......
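The fsync counters, unlike the lock-wait ones, are bumped with toku_sync_fetch_and_add rather than under a mutex, since fsyncs are issued concurrently from many threads and there is no per-object lock to piggyback on. A sketch of the same idea using std::atomic; the names are illustrative (the tree uses plain uint64_t globals with the fetch-and-add builtin):

```cpp
#include <stdint.h>
#include <atomic>

// Illustrative counterparts of toku_fsync_count/time and the long variants.
static std::atomic<uint64_t> fsync_count{0}, fsync_time{0};
static std::atomic<uint64_t> long_fsync_count{0}, long_fsync_time{0};
static const uint64_t long_fsync_threshold_usec = 1000000;  // 1 second

// Account for one fsync of `duration_usec` microseconds, lock-free.
static void account_fsync(uint64_t duration_usec) {
    fsync_count.fetch_add(1, std::memory_order_relaxed);
    fsync_time.fetch_add(duration_usec, std::memory_order_relaxed);
    if (duration_usec >= long_fsync_threshold_usec) {
        long_fsync_count.fetch_add(1, std::memory_order_relaxed);
        long_fsync_time.fetch_add(duration_usec, std::memory_order_relaxed);
    }
}
```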
......@@ -1745,9 +1745,11 @@ typedef enum {
FS_ENOSPC_REDZONE_CTR, // number of operations rejected by enospc prevention (red zone)
FS_ENOSPC_MOST_RECENT, // most recent time that file system was completely full
FS_ENOSPC_COUNT, // total number of times ENOSPC was returned from an attempt to write
FS_FSYNC_TIME ,
FS_FSYNC_TIME,
FS_FSYNC_COUNT,
FS_STATUS_NUM_ROWS
FS_LONG_FSYNC_TIME,
FS_LONG_FSYNC_COUNT,
FS_STATUS_NUM_ROWS, // must be last
} fs_status_entry;
typedef struct {
......@@ -1766,8 +1768,10 @@ fs_status_init(void) {
FS_STATUS_INIT(FS_ENOSPC_REDZONE_CTR, nullptr, UINT64, "number of operations rejected by enospc prevention (red zone)", TOKU_ENGINE_STATUS);
FS_STATUS_INIT(FS_ENOSPC_MOST_RECENT, nullptr, UNIXTIME, "most recent disk full", TOKU_ENGINE_STATUS);
FS_STATUS_INIT(FS_ENOSPC_COUNT, nullptr, UINT64, "number of write operations that returned ENOSPC", TOKU_ENGINE_STATUS);
FS_STATUS_INIT(FS_FSYNC_TIME, FILESYSTEM_FSYNC_SECONDS, UINT64, "fsync time", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
FS_STATUS_INIT(FS_FSYNC_TIME, FILESYSTEM_FSYNC_TIME, UINT64, "fsync time", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
FS_STATUS_INIT(FS_FSYNC_COUNT, FILESYSTEM_FSYNC_NUM, UINT64, "fsync count", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
FS_STATUS_INIT(FS_LONG_FSYNC_TIME, FILESYSTEM_LONG_FSYNC_TIME, UINT64, "long fsync time", TOKU_ENGINE_STATUS);
FS_STATUS_INIT(FS_LONG_FSYNC_COUNT, FILESYSTEM_LONG_FSYNC_NUM, UINT64, "long fsync count", TOKU_ENGINE_STATUS);
fsstat.initialized = true;
}
#undef FS_STATUS_INIT
......@@ -1792,10 +1796,12 @@ fs_get_status(DB_ENV * env, fs_redzone_state * redzone_state) {
FS_STATUS_VALUE(FS_ENOSPC_MOST_RECENT) = enospc_most_recent_timestamp;
FS_STATUS_VALUE(FS_ENOSPC_COUNT) = enospc_total;
uint64_t fsync_count, fsync_time;
toku_get_fsync_times(&fsync_count, &fsync_time);
uint64_t fsync_count, fsync_time, long_fsync_threshold, long_fsync_count, long_fsync_time;
toku_get_fsync_times(&fsync_count, &fsync_time, &long_fsync_threshold, &long_fsync_count, &long_fsync_time);
FS_STATUS_VALUE(FS_FSYNC_COUNT) = fsync_count;
FS_STATUS_VALUE(FS_FSYNC_TIME) = fsync_time;
FS_STATUS_VALUE(FS_LONG_FSYNC_COUNT) = long_fsync_count;
FS_STATUS_VALUE(FS_LONG_FSYNC_TIME) = long_fsync_time;
}
#undef FS_STATUS_VALUE
......
......@@ -397,10 +397,7 @@ void toku_file_fsync(int fd);
int toku_fsync_directory(const char *fname);
// get the number of fsync calls and the fsync times (total)
void toku_get_fsync_times(uint64_t *fsync_count, uint64_t *fsync_time);
// get the number of fsync calls and the fsync times for use by scheduler (subset of total)
void toku_get_fsync_sched(uint64_t *fsync_count, uint64_t *fsync_time);
void toku_get_fsync_times(uint64_t *fsync_count, uint64_t *fsync_time, uint64_t *long_fsync_threshold, uint64_t *long_fsync_count, uint64_t *long_fsync_time);
void toku_set_func_fsync (int (*fsync_function)(int));
void toku_set_func_pwrite (ssize_t (*)(int, const void *, size_t, toku_off_t));
......