Commit bf05180b authored by Vadim Tkachenko's avatar Vadim Tkachenko

innodb_io_patches.patch merged

parent d2a38600
......@@ -187,6 +187,10 @@ buf_read_ahead_random(
ulint i;
ulint buf_read_ahead_random_area;
if (!(srv_read_ahead & 1)) {
return(0);
}
if (srv_startup_is_before_trx_rollback_phase) {
/* No read-ahead to avoid thread deadlocks */
return(0);
......@@ -412,6 +416,10 @@ buf_read_ahead_linear(
const ulint buf_read_ahead_linear_area
= BUF_READ_AHEAD_LINEAR_AREA;
if (!(srv_read_ahead & 2)) {
return(0);
}
if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
/* No read-ahead to avoid thread deadlocks */
return(0);
......
......@@ -137,6 +137,7 @@ static long innobase_mirrored_log_groups, innobase_log_files_in_group,
innobase_force_recovery, innobase_open_files,
innobase_autoinc_lock_mode;
static unsigned long innobase_read_io_threads, innobase_write_io_threads;
static long long innobase_buffer_pool_size, innobase_log_file_size;
/* The default values for the following char* start-up parameters
......@@ -2066,6 +2067,8 @@ innobase_init(
srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
srv_n_file_io_threads = (ulint) innobase_file_io_threads;
srv_n_read_io_threads = (ulint) innobase_read_io_threads;
srv_n_write_io_threads = (ulint) innobase_write_io_threads;
srv_force_recovery = (ulint) innobase_force_recovery;
......@@ -9558,6 +9561,31 @@ static MYSQL_SYSVAR_STR(version, innodb_version_str,
PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
"InnoDB version", NULL, NULL, INNODB_VERSION_STR);
static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
PLUGIN_VAR_RQCMDARG,
"Number of IO operations per second the server can do. Tunes background IO rate.",
NULL, NULL, 100, 100, 999999999, 0);
static MYSQL_SYSVAR_ULONG(read_ahead, srv_read_ahead,
PLUGIN_VAR_RQCMDARG,
"Enable/Diasable read aheads bit0:random bit1:linear",
NULL, NULL, 3, 0, 3, 0);
static MYSQL_SYSVAR_ULONG(adaptive_checkpoint, srv_adaptive_checkpoint,
PLUGIN_VAR_RQCMDARG,
"Enable/Diasable flushing along modified age 0:disable 1:enable",
NULL, NULL, 0, 0, 1, 0);
static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of background read I/O threads in InnoDB.",
NULL, NULL, 1, 1, 64, 0);
static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of background write I/O threads in InnoDB.",
NULL, NULL, 1, 1, 64, 0);
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(additional_mem_pool_size),
MYSQL_SYSVAR(autoextend_increment),
......@@ -9604,6 +9632,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(thread_sleep_delay),
MYSQL_SYSVAR(autoinc_lock_mode),
MYSQL_SYSVAR(version),
MYSQL_SYSVAR(io_capacity),
MYSQL_SYSVAR(read_ahead),
MYSQL_SYSVAR(adaptive_checkpoint),
MYSQL_SYSVAR(read_io_threads),
MYSQL_SYSVAR(write_io_threads),
NULL
};
......
......@@ -561,8 +561,10 @@ os_aio_init(
/*========*/
ulint n, /* in: maximum number of pending aio operations
allowed; n must be divisible by n_segments */
ulint n_segments, /* in: combined number of segments in the four
first aio arrays; must be >= 4 */
// ulint n_segments, /* in: combined number of segments in the four
// first aio arrays; must be >= 4 */
ulint n_read_threads, /* n_segments == 2 + n_read_threads + n_write_threads */
ulint n_write_threads, /**/
ulint n_slots_sync); /* in: number of slots in the sync aio array */
/***********************************************************************
Requests an asynchronous i/o operation. */
......
......@@ -100,6 +100,8 @@ extern ulint srv_mem_pool_size;
extern ulint srv_lock_table_size;
extern ulint srv_n_file_io_threads;
extern ulint srv_n_read_io_threads;
extern ulint srv_n_write_io_threads;
#ifdef UNIV_LOG_ARCHIVE
extern ibool srv_log_archive_on;
......@@ -144,6 +146,11 @@ extern ulong srv_max_buf_pool_modified_pct;
extern ulong srv_max_purge_lag;
extern ulong srv_replication_delay;
extern ulint srv_io_capacity;
extern ulint srv_read_ahead;
extern ulint srv_adaptive_checkpoint;
/*-------------------------------------------*/
extern ulint srv_n_rows_inserted;
......
......@@ -328,17 +328,7 @@ rw_lock_get_x_lock_count(
Accessor functions for rw lock. */
UNIV_INLINE
ulint
rw_lock_get_s_waiters(
/*==================*/
rw_lock_t* lock);
UNIV_INLINE
ulint
rw_lock_get_x_waiters(
/*==================*/
rw_lock_t* lock);
UNIV_INLINE
ulint
rw_lock_get_wx_waiters(
rw_lock_get_waiters(
/*================*/
rw_lock_t* lock);
UNIV_INLINE
......@@ -422,11 +412,6 @@ rw_lock_debug_print(
rw_lock_debug_t* info); /* in: debug struct */
#endif /* UNIV_SYNC_DEBUG */
#ifdef HAVE_GCC_ATOMIC_BUILTINS
/* This value means NOT_LOCKED */
#define RW_LOCK_BIAS 0x00100000
#endif
/* NOTE! The structure appears here only for the compiler to know its size.
Do not use its fields directly! The structure used in the spin lock
implementation of a read-write lock. Several threads may have a shared lock
......@@ -436,9 +421,9 @@ blocked by readers, a writer may queue for the lock by setting the writer
field. Then no new readers are allowed in. */
struct rw_lock_struct {
/* Used by sync0arr.c for thread queueing */
os_event_t s_event; /* Used for s_lock */
os_event_t x_event; /* Used for x_lock */
os_event_t event; /* Used by sync0arr.c for thread queueing */
#ifdef __WIN__
os_event_t wait_ex_event; /* This windows specific event is
used by the thread which has set the
lock state to RW_LOCK_WAIT_EX. The
......@@ -446,34 +431,30 @@ struct rw_lock_struct {
thread will be the next one to proceed
once the current the event gets
signalled. See LEMMA 2 in sync0sync.c */
#ifdef HAVE_GCC_ATOMIC_BUILTINS
volatile lint lock_word; /* Used by using atomic builtin */
#endif
volatile ulint reader_count; /* Number of readers who have locked this
ulint reader_count; /* Number of readers who have locked this
lock in the shared mode */
volatile ulint writer; /* This field is set to RW_LOCK_EX if there
ulint writer; /* This field is set to RW_LOCK_EX if there
is a writer owning the lock (in exclusive
mode), RW_LOCK_WAIT_EX if a writer is
queueing for the lock, and
RW_LOCK_NOT_LOCKED, otherwise. */
volatile os_thread_id_t writer_thread;
os_thread_id_t writer_thread;
/* Thread id of a possible writer thread */
volatile ulint writer_count; /* Number of times the same thread has
ulint writer_count; /* Number of times the same thread has
recursively locked the lock in the exclusive
mode */
#ifndef HAVE_GCC_ATOMIC_BUILTINS
mutex_t mutex; /* The mutex protecting rw_lock_struct */
#endif
ulint pass; /* Default value 0. This is set to some
value != 0 given by the caller of an x-lock
operation, if the x-lock is to be passed to
another thread to unlock (which happens in
asynchronous i/o). */
volatile ulint s_waiters; /* 1: there are waiters (s_lock) */
volatile ulint x_waiters; /* 1: there are waiters (x_lock) */
volatile ulint wait_ex_waiters; /* 1: there are waiters (wait_ex) */
ulint waiters; /* This ulint is set to 1 if there are
waiters (readers or writers) in the global
wait array, waiting for this rw_lock.
Otherwise, == 0. */
UT_LIST_NODE_T(rw_lock_t) list;
/* All allocated rw locks are put into a
list */
......@@ -486,7 +467,7 @@ struct rw_lock_struct {
const char* cfile_name;/* File name where lock created */
const char* last_s_file_name;/* File name where last s-locked */
const char* last_x_file_name;/* File name where last x-locked */
volatile ibool writer_is_wait_ex;
ibool writer_is_wait_ex;
/* This is TRUE if the writer field is
RW_LOCK_WAIT_EX; this field is located far
from the memory update hotspot fields which
......
This diff is collapsed.
......@@ -3258,6 +3258,15 @@ log_print(
log_sys->flushed_to_disk_lsn,
log_sys->last_checkpoint_lsn);
fprintf(file,
"Max checkpoint age %lu\n"
"Modified age %lu\n"
"Checkpoint age %lu\n",
(ulong) log_sys->max_checkpoint_age,
(ulong) ut_dulint_minus(log_sys->lsn,
log_buf_pool_get_oldest_modification()),
(ulong) ut_dulint_minus(log_sys->lsn, log_sys->last_checkpoint_lsn));
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
......
......@@ -2920,8 +2920,10 @@ os_aio_init(
/*========*/
ulint n, /* in: maximum number of pending aio operations
allowed; n must be divisible by n_segments */
ulint n_segments, /* in: combined number of segments in the four
first aio arrays; must be >= 4 */
// ulint n_segments, /* in: combined number of segments in the four
// first aio arrays; must be >= 4 */
ulint n_read_threads, /* n_segments == 2 + n_read_threads + n_write_threads*/
ulint n_write_threads, /**/
ulint n_slots_sync) /* in: number of slots in the sync aio array */
{
ulint n_read_segs;
......@@ -2929,6 +2931,8 @@ os_aio_init(
ulint n_per_seg;
ulint i;
ulint n_segments = 2 + n_read_threads + n_write_threads;
ut_ad(n % n_segments == 0);
ut_ad(n_segments >= 4);
......@@ -2939,8 +2943,8 @@ os_aio_init(
}
n_per_seg = n / n_segments;
n_write_segs = (n_segments - 2) / 2;
n_read_segs = n_segments - 2 - n_write_segs;
n_write_segs = n_write_threads;
n_read_segs = n_read_threads;
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
......@@ -3140,6 +3144,13 @@ os_aio_array_reserve_slot(
OVERLAPPED* control;
#endif
ulint i;
ulint prim_segment;
ulint n;
n = array->n_slots / array->n_segments;
/* 64 blocks' striping ( aligning max(BUF_READ_AHEAD_AREA) ) */
prim_segment = ( offset >> (UNIV_PAGE_SIZE_SHIFT + 6) ) % (array->n_segments);
loop:
os_mutex_enter(array->mutex);
......@@ -3158,6 +3169,16 @@ loop:
goto loop;
}
for (i = prim_segment * n; i < array->n_slots; i++) {
slot = os_aio_array_get_nth_slot(array, i);
if (slot->reserved == FALSE) {
break;
}
}
if (slot->reserved == TRUE){
/* Not found after the intended segment. So we should search before. */
for (i = 0;; i++) {
slot = os_aio_array_get_nth_slot(array, i);
......@@ -3165,6 +3186,7 @@ loop:
break;
}
}
}
array->n_reserved++;
......
......@@ -147,6 +147,8 @@ UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX;
ulint srv_n_read_io_threads = 1;
ulint srv_n_write_io_threads = 1;
#ifdef UNIV_LOG_ARCHIVE
UNIV_INTERN ibool srv_log_archive_on = FALSE;
......@@ -311,6 +313,15 @@ UNIV_INTERN int srv_query_thread_priority = 0;
UNIV_INTERN ulong srv_replication_delay = 0;
ulint srv_io_capacity = 100;
/* Returns the number of IO operations that is X percent of the capacity.
PCT_IO(5) -> returns the number of IO operations that is 5% of the max
where max is srv_io_capacity. */
#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) pct / 100.0)))
ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
ulint srv_adaptive_checkpoint = 0; /* 0:disable 1:enable */
/*-------------------------------------------*/
UNIV_INTERN ulong srv_n_spin_wait_rounds = 20;
UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500;
......@@ -2203,6 +2214,8 @@ srv_master_thread(
ibool skip_sleep = FALSE;
ulint i;
dulint oldest_lsn;
#ifdef UNIV_DEBUG_THREAD_CREATION
fprintf(stderr, "Master thread starts, id %lu\n",
os_thread_pf(os_thread_get_curr_id()));
......@@ -2293,7 +2306,7 @@ loop:
if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
srv_main_thread_op_info = "doing insert buffer merge";
ibuf_contract_for_n_pages(
TRUE, srv_insert_buffer_batch_size / 4);
TRUE, PCT_IO((srv_insert_buffer_batch_size / 4)));
srv_main_thread_op_info = "flushing log";
......@@ -2306,7 +2319,7 @@ loop:
/* Try to keep the number of modified pages in the
buffer pool under the limit wished by the user */
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
IB_ULONGLONG_MAX);
/* If we had to do the flush, it may have taken
......@@ -2315,6 +2328,44 @@ loop:
iteration of this loop. */
skip_sleep = TRUE;
} else if (srv_adaptive_checkpoint) {
/* Try to keep modified age not to exceed
max_checkpoint_age * 7/8 line */
mutex_enter(&(log_sys->mutex));
oldest_lsn = buf_pool_get_oldest_modification();
if (ut_dulint_is_zero(oldest_lsn)) {
mutex_exit(&(log_sys->mutex));
} else {
if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
> (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 4)) {
/* 2nd defence line (max_checkpoint_age * 3/4) */
mutex_exit(&(log_sys->mutex));
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
ut_dulint_max);
skip_sleep = TRUE;
} else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
> (log_sys->max_checkpoint_age)/2 ) {
/* 1st defence line (max_checkpoint_age * 1/2) */
mutex_exit(&(log_sys->mutex));
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
ut_dulint_max);
skip_sleep = TRUE;
} else {
mutex_exit(&(log_sys->mutex));
}
}
}
if (srv_activity_count == old_activity_count) {
......@@ -2341,10 +2392,10 @@ loop:
n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ buf_pool->n_pages_written;
if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
srv_main_thread_op_info = "flushing buffer pool pages";
buf_flush_batch(BUF_FLUSH_LIST, 100, IB_ULONGLONG_MAX);
if (n_pend_ios < 3 && (n_ios - n_ios_very_old < PCT_IO(200))) {
srv_main_thread_op_info = "flushing buffer pool pages";
buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), IB_ULONGLONG_MAX);
srv_main_thread_op_info = "flushing log";
log_buffer_flush_to_disk();
......@@ -2354,7 +2405,7 @@ loop:
even if the server were active */
srv_main_thread_op_info = "doing insert buffer merge";
ibuf_contract_for_n_pages(TRUE, srv_insert_buffer_batch_size / 4);
ibuf_contract_for_n_pages(TRUE, PCT_IO((srv_insert_buffer_batch_size / 4)));
srv_main_thread_op_info = "flushing log";
log_buffer_flush_to_disk();
......@@ -2394,14 +2445,14 @@ loop:
(> 70 %), we assume we can afford reserving the disk(s) for
the time it requires to flush 100 pages */
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
IB_ULONGLONG_MAX);
} else {
/* Otherwise, we only flush a small number of pages so that
we do not unnecessarily use much disk i/o capacity from
other work */
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
IB_ULONGLONG_MAX);
}
......@@ -2489,7 +2540,7 @@ background_loop:
n_bytes_merged = 0;
} else {
n_bytes_merged = ibuf_contract_for_n_pages(
TRUE, srv_insert_buffer_batch_size);
TRUE, PCT_IO((srv_insert_buffer_batch_size * 5)));
}
srv_main_thread_op_info = "reserving kernel mutex";
......@@ -2505,7 +2556,7 @@ flush_loop:
srv_main_thread_op_info = "flushing buffer pool pages";
if (srv_fast_shutdown < 2) {
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
IB_ULONGLONG_MAX);
} else {
/* In the fastest shutdown we do not flush the buffer pool
......
......@@ -1204,24 +1204,28 @@ innobase_start_or_create_for_mysql(void)
return(DB_ERROR);
}
/* over write innodb_file_io_threads */
srv_n_file_io_threads = 2 + srv_n_read_io_threads + srv_n_write_io_threads;
/* Restrict the maximum number of file i/o threads */
if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
srv_n_read_io_threads = srv_n_write_io_threads = (SRV_MAX_N_IO_THREADS - 2) / 2;
}
if (!os_aio_use_native_aio) {
/* In simulated aio we currently have use only for 4 threads */
srv_n_file_io_threads = 4;
/*srv_n_file_io_threads = 4;*/
os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
srv_n_file_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
srv_n_read_io_threads, srv_n_write_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS * 8);
} else {
os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
srv_n_file_io_threads,
srv_n_read_io_threads, srv_n_write_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
}
......
......@@ -307,13 +307,13 @@ sync_cell_event_reset(
{
if (type == SYNC_MUTEX) {
return(os_event_reset(((mutex_t *) object)->event));
#ifdef __WIN__
} else if (type == RW_LOCK_WAIT_EX) {
return(os_event_reset(
((rw_lock_t *) object)->wait_ex_event));
} else if (type == RW_LOCK_SHARED) {
return(os_event_reset(((rw_lock_t *) object)->s_event));
} else { /* RW_LOCK_EX */
return(os_event_reset(((rw_lock_t *) object)->x_event));
#endif
} else {
return(os_event_reset(((rw_lock_t *) object)->event));
}
}
......@@ -413,12 +413,15 @@ sync_array_wait_event(
if (cell->request_type == SYNC_MUTEX) {
event = ((mutex_t*) cell->wait_object)->event;
#ifdef __WIN__
/* On windows if the thread about to wait is the one which
has set the state of the rw_lock to RW_LOCK_WAIT_EX, then
it waits on a special event i.e.: wait_ex_event. */
} else if (cell->request_type == RW_LOCK_WAIT_EX) {
event = ((rw_lock_t*) cell->wait_object)->wait_ex_event;
} else if (cell->request_type == RW_LOCK_SHARED) {
event = ((rw_lock_t*) cell->wait_object)->s_event;
#endif
} else {
event = ((rw_lock_t*) cell->wait_object)->x_event;
event = ((rw_lock_t*) cell->wait_object)->event;
}
cell->waiting = TRUE;
......@@ -459,7 +462,6 @@ sync_array_cell_print(
mutex_t* mutex;
rw_lock_t* rwlock;
ulint type;
ulint writer;
type = cell->request_type;
......@@ -489,10 +491,12 @@ sync_array_cell_print(
(ulong) mutex->waiters);
} else if (type == RW_LOCK_EX
#ifdef __WIN__
|| type == RW_LOCK_WAIT_EX
#endif
|| type == RW_LOCK_SHARED) {
fputs(type == RW_LOCK_SHARED ? "S-lock on" : "X-lock on", file);
fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file);
rwlock = cell->old_wait_rw_lock;
......@@ -500,24 +504,22 @@ sync_array_cell_print(
" RW-latch at %p created in file %s line %lu\n",
(void*) rwlock, rwlock->cfile_name,
(ulong) rwlock->cline);
writer = rw_lock_get_writer(rwlock);
if (writer != RW_LOCK_NOT_LOCKED) {
if (rwlock->writer != RW_LOCK_NOT_LOCKED) {
fprintf(file,
"a writer (thread id %lu) has"
" reserved it in mode %s",
(ulong) os_thread_pf(rwlock->writer_thread),
writer == RW_LOCK_EX
rwlock->writer == RW_LOCK_EX
? " exclusive\n"
: " wait exclusive\n");
}
fprintf(file,
"number of readers %lu, s_waiters flag %lu, x_waiters flag %lu\n"
"number of readers %lu, waiters flag %lu\n"
"Last time read locked in file %s line %lu\n"
"Last time write locked in file %s line %lu\n",
(ulong) rwlock->reader_count,
(ulong) rwlock->s_waiters,
(ulong) (rwlock->x_waiters || rwlock->wait_ex_waiters),
(ulong) rwlock->waiters,
rwlock->last_s_file_name,
(ulong) rwlock->last_s_line,
rwlock->last_x_file_name,
......@@ -842,15 +844,11 @@ sync_array_object_signalled(
/*========================*/
sync_array_t* arr) /* in: wait array */
{
#ifdef HAVE_GCC_ATOMIC_BUILTINS
__sync_fetch_and_add(&(arr->sg_count),1);
#else
sync_array_enter(arr);
arr->sg_count++;
sync_array_exit(arr);
#endif
}
/**************************************************************************
......@@ -891,23 +889,19 @@ sync_arr_wake_threads_if_sema_free(void)
mutex = cell->wait_object;
os_event_set(mutex->event);
#ifdef __WIN__
} else if (cell->request_type
== RW_LOCK_WAIT_EX) {
rw_lock_t* lock;
lock = cell->wait_object;
os_event_set(lock->wait_ex_event);
} else if (cell->request_type
== RW_LOCK_SHARED) {
rw_lock_t* lock;
lock = cell->wait_object;
os_event_set(lock->s_event);
#endif
} else {
rw_lock_t* lock;
rw_lock_t* lock;
lock = cell->wait_object;
os_event_set(lock->x_event);
os_event_set(lock->event);
}
}
}
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment