Applied innodb-5.1-ss1004

Fixes: - Bug #15815: Very poor performance with multiple queries running concurrently - Bug #22868: 'Thread thrashing' with > 50 concurrent conns under an upd-intensive workload - Bug #24089: Race condition in fil_flush_file_spaces()

Applied innodb-5.1-ss1004
Fixes: - Bug #15815: Very poor performance with multiple queries running concurrently - Bug #22868: 'Thread thrashing' with > 50 concurrent conns under an upd-intensive workload - Bug #24089: Race condition in fil_flush_file_spaces()
18080530 · tsmith@quadxeon.mysql.com · a77dde86 · 18080530 · 18080530 · 18080530
Commit 18080530 authored Nov 09, 2006 by tsmith@quadxeon.mysql.com
12 changed files
--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.c
@@ -1949,7 +1949,12 @@ btr_lift_page_up(
 	mtr_t*		mtr)	/* in: mtr */
 {
 	page_t*		father_page;
+	page_t*		iter_page;
+	page_t*		pages[BTR_MAX_LEVELS];
 	ulint		page_level;
+	ulint		root_page_no;
+	ulint		ancestors;
+	ulint		i;

 	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
 	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
@@ -1959,6 +1964,30 @@ btr_lift_page_up(
 		btr_page_get_father_node_ptr(index, page, mtr));

 	page_level = btr_page_get_level(page, mtr);
+	root_page_no = dict_index_get_page(index);
+
+	ancestors = 1;
+	pages[0] = father_page;
+
+	/* Store all ancestor pages so we can reset their levels later on.
+	We have to do all the searches on the tree now because later on,
+	after we've replaced the first level, the tree is in an inconsistent
+	state and can not be searched. */
+	iter_page = father_page;
+	for (;;) {
+		if (buf_block_get_page_no(buf_block_align(iter_page))
+		    == root_page_no) {
+
+			break;
+		}
+
+		ut_a(ancestors < BTR_MAX_LEVELS);
+
+		iter_page = buf_frame_align(
+			btr_page_get_father_node_ptr(index, iter_page, mtr));
+
+		pages[ancestors++] = iter_page;
+	}

 	btr_search_drop_page_hash_index(page);

@@ -1970,7 +1999,15 @@ btr_lift_page_up(
 			       index, mtr);
 	lock_update_copy_and_discard(father_page, page);

-	btr_page_set_level(father_page, page_level, mtr);
+	/* Go upward to root page, decreasing levels by one. */
+	for (i = 0; i < ancestors; i++) {
+		iter_page = pages[i];
+
+		ut_ad(btr_page_get_level(iter_page, mtr) == (page_level + 1));
+
+		btr_page_set_level(iter_page, page_level, mtr);
+		page_level++;
+	}

 	/* Free the file page */
 	btr_page_free(index, page, mtr);

--- a/storage/innobase/buf/buf0buf.c
+++ b/storage/innobase/buf/buf0buf.c
@@ -221,6 +221,9 @@ in the free list to the frames.
 5) When we have AWE enabled, we disable adaptive hash indexes.
 */

+/* Value in microseconds */
+static const int WAIT_FOR_READ	= 20000;
+
 buf_pool_t*	buf_pool = NULL; /* The buffer buf_pool of the database */

 #ifdef UNIV_DEBUG
@@ -539,6 +542,8 @@ buf_block_init(

 	block->n_pointers = 0;

+	mutex_create(&block->mutex, SYNC_BUF_BLOCK);
+
 	rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
 	ut_ad(rw_lock_validate(&(block->lock)));

@@ -813,8 +818,15 @@ buf_awe_map_page_to_frame(
 	bck = UT_LIST_GET_LAST(buf_pool->awe_LRU_free_mapped);

 	while (bck) {
-		if (bck->state == BUF_BLOCK_FILE_PAGE
-		    && (bck->buf_fix_count != 0 || bck->io_fix != 0)) {
+		ibool skip;
+
+		mutex_enter(&bck->mutex);
+
+		skip = (bck->state == BUF_BLOCK_FILE_PAGE
+			&& (bck->buf_fix_count != 0 || bck->io_fix != 0));
+
+		if (skip) {
+			mutex_exit(&bck->mutex);

 			/* We have to skip this */
 			bck = UT_LIST_GET_PREV(awe_LRU_free_mapped, bck);
@@ -848,6 +860,8 @@ buf_awe_map_page_to_frame(

 			buf_pool->n_pages_awe_remapped++;

+			mutex_exit(&bck->mutex);
+
 			return;
 		}
 	}
@@ -886,13 +900,22 @@ buf_block_make_young(
 /*=================*/
 	buf_block_t*	block)	/* in: block to make younger */
 {
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!mutex_own(&(buf_pool->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Note that we read freed_page_clock's without holding any mutex:
+	this is allowed since the result is used only in heuristics */
+
 	if (buf_pool->freed_page_clock >= block->freed_page_clock
-	    + 1 + (buf_pool->curr_size / 1024)) {
+				+ 1 + (buf_pool->curr_size / 4)) {

+		mutex_enter(&buf_pool->mutex);
 		/* There has been freeing activity in the LRU list:
 		best to move to the head of the LRU list */

 		buf_LRU_make_block_young(block);
+		mutex_exit(&buf_pool->mutex);
 	}
 }

@@ -927,12 +950,16 @@ buf_block_free(
 /*===========*/
 	buf_block_t*	block)	/* in, own: block to be freed */
 {
-	ut_a(block->state != BUF_BLOCK_FILE_PAGE);
-
 	mutex_enter(&(buf_pool->mutex));

+	mutex_enter(&block->mutex);
+
+	ut_a(block->state != BUF_BLOCK_FILE_PAGE);
+
 	buf_LRU_block_free_non_file_page(block);

+	mutex_exit(&block->mutex);
+
 	mutex_exit(&(buf_pool->mutex));
 }

@@ -1151,9 +1178,8 @@ buf_page_get_gen(
 #endif
 	buf_pool->n_page_gets++;
 loop:
-	mutex_enter_fast(&(buf_pool->mutex));
-
 	block = NULL;
+	mutex_enter_fast(&(buf_pool->mutex));

 	if (guess) {
 		block = buf_block_align(guess);
@@ -1191,6 +1217,8 @@ loop:
 		goto loop;
 	}

+	mutex_enter(&block->mutex);
+
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

 	must_read = FALSE;
@@ -1200,9 +1228,9 @@ loop:
 		must_read = TRUE;

 		if (mode == BUF_GET_IF_IN_POOL) {
-
 			/* The page is only being read to buffer */
-			mutex_exit(&(buf_pool->mutex));
+			mutex_exit(&buf_pool->mutex);
+			mutex_exit(&block->mutex);

 			return(NULL);
 		}
@@ -1226,7 +1254,7 @@ loop:
 #else
 	buf_block_buf_fix_inc(block);
 #endif
-	buf_block_make_young(block);
+	mutex_exit(&buf_pool->mutex);

 	/* Check if this is the first access to the page */

@@ -1234,10 +1262,13 @@ loop:

 	block->accessed = TRUE;

+	mutex_exit(&block->mutex);
+
+	buf_block_make_young(block);
+
 #ifdef UNIV_DEBUG_FILE_ACCESSES
 	ut_a(block->file_page_was_freed == FALSE);
 #endif
-	mutex_exit(&(buf_pool->mutex));

 #ifdef UNIV_DEBUG
 	buf_dbg_counter++;
@@ -1262,13 +1293,14 @@ loop:
 		}

 		if (!success) {
-			mutex_enter(&(buf_pool->mutex));
+			mutex_enter(&block->mutex);

 			block->buf_fix_count--;
+
+			mutex_exit(&block->mutex);
 #ifdef UNIV_SYNC_DEBUG
 			rw_lock_s_unlock(&(block->debug_latch));
 #endif
-			mutex_exit(&(buf_pool->mutex));

 			return(NULL);
 		}
@@ -1279,18 +1311,16 @@ loop:
 			completes */

 			for (;;) {
-				mutex_enter(&(buf_pool->mutex));
+				mutex_enter(&block->mutex);

 				if (block->io_fix == BUF_IO_READ) {

-					mutex_exit(&(buf_pool->mutex));
-
-					/* Sleep 20 milliseconds */
+					mutex_exit(&block->mutex);

-					os_thread_sleep(20000);
+					os_thread_sleep(WAIT_FOR_READ);
 				} else {

-					mutex_exit(&(buf_pool->mutex));
+					mutex_exit(&block->mutex);

 					break;
 				}
@@ -1349,14 +1379,14 @@ buf_page_optimistic_get_func(
 	ut_ad(mtr && block);
 	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

-	mutex_enter(&(buf_pool->mutex));
-
 	/* If AWE is used, block may have a different frame now, e.g., NULL */

+	mutex_enter(&block->mutex);
+
 	if (UNIV_UNLIKELY(block->state != BUF_BLOCK_FILE_PAGE)
 	    || UNIV_UNLIKELY(block->frame != guess)) {
-exit_func:
-		mutex_exit(&(buf_pool->mutex));
+
+		mutex_exit(&block->mutex);

 		return(FALSE);
 	}
@@ -1366,15 +1396,14 @@ exit_func:
 #else
 	buf_block_buf_fix_inc(block);
 #endif
-	buf_block_make_young(block);
-
-	/* Check if this is the first access to the page */
-
 	accessed = block->accessed;
-
 	block->accessed = TRUE;

-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);
+
+	buf_block_make_young(block);
+
+	/* Check if this is the first access to the page */

 	ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset));

@@ -1389,13 +1418,16 @@ exit_func:
 	}

 	if (UNIV_UNLIKELY(!success)) {
-		mutex_enter(&(buf_pool->mutex));
+		mutex_enter(&block->mutex);

 		block->buf_fix_count--;
+
+		mutex_exit(&block->mutex);
+
 #ifdef UNIV_SYNC_DEBUG
 		rw_lock_s_unlock(&(block->debug_latch));
 #endif
-		goto exit_func;
+		return(FALSE);
 	}

 	if (UNIV_UNLIKELY(!UT_DULINT_EQ(modify_clock, block->modify_clock))) {
@@ -1408,13 +1440,16 @@ exit_func:
 			rw_lock_x_unlock(&(block->lock));
 		}

-		mutex_enter(&(buf_pool->mutex));
+		mutex_enter(&block->mutex);

 		block->buf_fix_count--;
+
+		mutex_exit(&block->mutex);
+
 #ifdef UNIV_SYNC_DEBUG
 		rw_lock_s_unlock(&(block->debug_latch));
 #endif
-		goto exit_func;
+		return(FALSE);
 	}

 	mtr_memo_push(mtr, block, fix_type);
@@ -1471,10 +1506,10 @@ buf_page_get_known_nowait(
 	ut_ad(mtr);
 	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

-	mutex_enter(&(buf_pool->mutex));
-
 	block = buf_block_align(guess);

+	mutex_enter(&block->mutex);
+
 	if (block->state == BUF_BLOCK_REMOVE_HASH) {
 		/* Another thread is just freeing the block from the LRU list
 		of the buffer pool: do not try to access this page; this
@@ -1483,7 +1518,7 @@ buf_page_get_known_nowait(
 		we have already removed it from the page address hash table
 		of the buffer pool. */

-		mutex_exit(&(buf_pool->mutex));
+		mutex_exit(&block->mutex);

 		return(FALSE);
 	}
@@ -1495,12 +1530,12 @@ buf_page_get_known_nowait(
 #else
 	buf_block_buf_fix_inc(block);
 #endif
+	mutex_exit(&block->mutex);
+
 	if (mode == BUF_MAKE_YOUNG) {
 		buf_block_make_young(block);
 	}

-	mutex_exit(&(buf_pool->mutex));
-
 	ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));

 	if (rw_latch == RW_S_LATCH) {
@@ -1514,13 +1549,15 @@ buf_page_get_known_nowait(
 	}

 	if (!success) {
-		mutex_enter(&(buf_pool->mutex));
+		mutex_enter(&block->mutex);

 		block->buf_fix_count--;
+
+		mutex_exit(&block->mutex);
+
 #ifdef UNIV_SYNC_DEBUG
 		rw_lock_s_unlock(&(block->debug_latch));
 #endif
-		mutex_exit(&(buf_pool->mutex));

 		return(FALSE);
 	}
@@ -1568,7 +1605,6 @@ buf_page_init_for_backup_restore(
 	block->offset		= offset;

 	block->lock_hash_val	= 0;
-	block->lock_mutex	= NULL;

 	block->freed_page_clock = 0;

@@ -1601,6 +1637,7 @@ buf_page_init(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&(block->mutex)));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(block->state != BUF_BLOCK_FILE_PAGE);

@@ -1615,7 +1652,6 @@ buf_page_init(
 	block->index		= NULL;

 	block->lock_hash_val	= lock_rec_hash(space, offset);
-	block->lock_mutex	= NULL;

 	/* Insert into the hash table of file pages */

@@ -1709,6 +1745,7 @@ buf_page_init_for_read(
 	ut_a(block);

 	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&block->mutex);

 	if (fil_tablespace_deleted_or_being_deleted_in_mem(
 		    space, tablespace_version)) {
@@ -1722,7 +1759,9 @@ buf_page_init_for_read(
 		    deleted or is being deleted, or the page is
 		    already in buf_pool, return */

+		mutex_exit(&block->mutex);
 		    mutex_exit(&(buf_pool->mutex));
+
 		    buf_block_free(block);

 		    if (mode == BUF_READ_IBUF_PAGES_ONLY) {
@@ -1742,6 +1781,7 @@ buf_page_init_for_read(
 	buf_LRU_add_block(block, TRUE);		/* TRUE == to old blocks */

 	block->io_fix = BUF_IO_READ;
+
 	buf_pool->n_pend_reads++;

 	/* We set a pass-type x-lock on the frame because then the same
@@ -1753,6 +1793,7 @@ buf_page_init_for_read(

 	rw_lock_x_lock_gen(&(block->lock), BUF_IO_READ);

+	mutex_exit(&block->mutex);
 	mutex_exit(&(buf_pool->mutex));

 	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
@@ -1817,6 +1858,8 @@ buf_page_create(

 	block = free_block;

+	mutex_enter(&block->mutex);
+
 	buf_page_init(space, offset, block);

 	/* The block must be put to the LRU list */
@@ -1827,13 +1870,15 @@ buf_page_create(
 #else
 	buf_block_buf_fix_inc(block);
 #endif
+	buf_pool->n_pages_created++;
+
+	mutex_exit(&(buf_pool->mutex));
+
 	mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);

 	block->accessed = TRUE;

-	buf_pool->n_pages_created++;
-
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);

 	/* Delete possible entries for the page from the insert buffer:
 	such can exist if the page belonged to an index which was dropped */
@@ -1885,6 +1930,12 @@ buf_page_io_complete(

 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

+	/* We do not need protect block->io_fix here by block->mutex to read
+	it because this is the only function where we can change the value
+	from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
+	ensures that this is the only thread that handles the i/o for this
+	block. */
+
 	io_type = block->io_fix;

 	if (io_type == BUF_IO_READ) {
@@ -1986,11 +2037,12 @@ buf_page_io_complete(
 		}
 	}

+	mutex_enter(&(buf_pool->mutex));
+	mutex_enter(&block->mutex);
+
 #ifdef UNIV_IBUF_DEBUG
 	ut_a(ibuf_count_get(block->space, block->offset) == 0);
 #endif
-	mutex_enter(&(buf_pool->mutex));
-
 	/* Because this thread which does the unlocking is not the same that
 	did the locking, we use a pass value != 0 in unlock, which simply
 	removes the newest lock debug record, without checking the thread
@@ -2033,6 +2085,7 @@ buf_page_io_complete(
 #endif /* UNIV_DEBUG */
 	}

+	mutex_exit(&block->mutex);
 	mutex_exit(&(buf_pool->mutex));

 #ifdef UNIV_DEBUG
@@ -2095,6 +2148,8 @@ buf_validate(void)

 		block = buf_pool_get_nth_block(buf_pool, i);

+		mutex_enter(&block->mutex);
+
 		if (block->state == BUF_BLOCK_FILE_PAGE) {

 			ut_a(buf_page_hash_get(block->space,
@@ -2139,6 +2194,8 @@ buf_validate(void)
 		} else if (block->state == BUF_BLOCK_NOT_USED) {
 			n_free++;
 		}
+
+		mutex_exit(&block->mutex);
 	}

 	if (n_lru + n_free > buf_pool->curr_size) {
@@ -2286,9 +2343,14 @@ buf_get_latched_pages_number(void)

 		block = buf_pool_get_nth_block(buf_pool, i);

-		if (((block->buf_fix_count != 0) || (block->io_fix != 0))
-		    && block->magic_n == BUF_BLOCK_MAGIC_N) {
-			fixed_pages_number++;
+		if (block->magic_n == BUF_BLOCK_MAGIC_N) {
+			mutex_enter(&block->mutex);
+
+			if (block->buf_fix_count != 0 || block->io_fix != 0) {
+				fixed_pages_number++;
+			}
+
+			mutex_exit(&block->mutex);
 		}
 	}

@@ -2458,6 +2520,8 @@ buf_all_freed(void)

 		block = buf_pool_get_nth_block(buf_pool, i);

+		mutex_enter(&block->mutex);
+
 		if (block->state == BUF_BLOCK_FILE_PAGE) {

 			if (!buf_flush_ready_for_replace(block)) {
@@ -2469,6 +2533,8 @@ buf_all_freed(void)
 				ut_error;
 			}
 		}
+
+		mutex_exit(&block->mutex);
 	}

 	mutex_exit(&(buf_pool->mutex));

--- a/storage/innobase/buf/buf0flu.c
+++ b/storage/innobase/buf/buf0flu.c
@@ -113,6 +113,7 @@ buf_flush_ready_for_replace(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	if (block->state != BUF_BLOCK_FILE_PAGE) {
 		ut_print_timestamp(stderr);
@@ -148,6 +149,7 @@ buf_flush_ready_for_flush(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&(block->mutex)));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

@@ -559,8 +561,15 @@ buf_flush_try_page(

 	ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

+	if (!block) {
+		mutex_exit(&(buf_pool->mutex));
+		return(0);
+	}
+
+	mutex_enter(&block->mutex);
+
 	if (flush_type == BUF_FLUSH_LIST
-	    && block && buf_flush_ready_for_flush(block, flush_type)) {
+	    && buf_flush_ready_for_flush(block, flush_type)) {

 		block->io_fix = BUF_IO_WRITE;

@@ -598,6 +607,7 @@ buf_flush_try_page(
 			locked = TRUE;
 		}

+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));

 		if (!locked) {
@@ -618,7 +628,7 @@ buf_flush_try_page(

 		return(1);

-	} else if (flush_type == BUF_FLUSH_LRU && block
+	} else if (flush_type == BUF_FLUSH_LRU
 		   && buf_flush_ready_for_flush(block, flush_type)) {

 		/* VERY IMPORTANT:
@@ -659,13 +669,14 @@ buf_flush_try_page(
 		buf_pool mutex: this ensures that the latch is acquired
 		immediately. */

+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));

 		buf_flush_write_block_low(block);

 		return(1);

-	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
+	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE
 		   && buf_flush_ready_for_flush(block, flush_type)) {

 		block->io_fix = BUF_IO_WRITE;
@@ -692,6 +703,7 @@ buf_flush_try_page(

 		(buf_pool->n_flush[flush_type])++;

+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));

 		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
@@ -709,11 +721,12 @@ buf_flush_try_page(
 		buf_flush_write_block_low(block);

 		return(1);
-	} else {
-		mutex_exit(&(buf_pool->mutex));
-
-		return(0);
 	}
+
+	mutex_exit(&block->mutex);
+	mutex_exit(&(buf_pool->mutex));
+
+	return(0);
 }

 /***************************************************************
@@ -758,34 +771,48 @@ buf_flush_try_neighbors(
 		block = buf_page_hash_get(space, i);
 		ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);

-		if (block && flush_type == BUF_FLUSH_LRU && i != offset
-		    && !block->old) {
+		if (!block) {
+
+			continue;
+
+		} else if (flush_type == BUF_FLUSH_LRU && i != offset
+			   && !block->old) {

 			/* We avoid flushing 'non-old' blocks in an LRU flush,
 			because the flushed blocks are soon freed */

 			continue;
-		}
+		} else {

-		if (block && buf_flush_ready_for_flush(block, flush_type)
-		    && (i == offset || block->buf_fix_count == 0)) {
-			/* We only try to flush those neighbors != offset
-			where the buf fix count is zero, as we then know that
-			we probably can latch the page without a semaphore
-			wait. Semaphore waits are expensive because we must
-			flush the doublewrite buffer before we start
-			waiting. */
+			mutex_enter(&block->mutex);

-			mutex_exit(&(buf_pool->mutex));
+			if (buf_flush_ready_for_flush(block, flush_type)
+			    && (i == offset || block->buf_fix_count == 0)) {
+				/* We only try to flush those
+				neighbors != offset where the buf fix count is
+				zero, as we then know that we probably can
+				latch the page without a semaphore wait.
+				Semaphore waits are expensive because we must
+				flush the doublewrite buffer before we start
+				waiting. */

-			/* Note: as we release the buf_pool mutex above, in
-			buf_flush_try_page we cannot be sure the page is still
-			in a flushable state: therefore we check it again
-			inside that function. */
+				mutex_exit(&block->mutex);

-			count += buf_flush_try_page(space, i, flush_type);
+				mutex_exit(&(buf_pool->mutex));

-			mutex_enter(&(buf_pool->mutex));
+				/* Note: as we release the buf_pool mutex
+				above, in buf_flush_try_page we cannot be sure
+				the page is still in a flushable state:
+				therefore we check it again inside that
+				function. */
+
+				count += buf_flush_try_page(space, i,
+							    flush_type);
+
+				mutex_enter(&(buf_pool->mutex));
+			} else {
+				mutex_exit(&block->mutex);
+			}
 		}
 	}

@@ -879,12 +906,15 @@ buf_flush_batch(
 		while ((block != NULL) && !found) {
 			ut_a(block->state == BUF_BLOCK_FILE_PAGE);

+			mutex_enter(&block->mutex);
+
 			if (buf_flush_ready_for_flush(block, flush_type)) {

 				found = TRUE;
 				space = block->space;
 				offset = block->offset;

+				mutex_exit(&block->mutex);
 				mutex_exit(&(buf_pool->mutex));

 				old_page_count = page_count;
@@ -901,10 +931,14 @@ buf_flush_batch(

 			} else if (flush_type == BUF_FLUSH_LRU) {

+				mutex_exit(&block->mutex);
+
 				block = UT_LIST_GET_PREV(LRU, block);
 			} else {
 				ut_ad(flush_type == BUF_FLUSH_LIST);

+				mutex_exit(&block->mutex);
+
 				block = UT_LIST_GET_PREV(flush_list, block);
 			}
 		}
@@ -986,10 +1020,14 @@ buf_flush_LRU_recommendation(void)
 		   + BUF_FLUSH_EXTRA_MARGIN)
 	       && (distance < BUF_LRU_FREE_SEARCH_LEN)) {

+		mutex_enter(&block->mutex);
+
 		if (buf_flush_ready_for_replace(block)) {
 			n_replaceable++;
 		}

+		mutex_exit(&block->mutex);
+
 		distance++;

 		block = UT_LIST_GET_PREV(LRU, block);

--- a/storage/innobase/buf/buf0lru.c
+++ b/storage/innobase/buf/buf0lru.c
@@ -86,6 +86,11 @@ scan_again:
 	block = UT_LIST_GET_LAST(buf_pool->LRU);

 	while (block != NULL) {
+		buf_block_t*	prev_block;
+
+		mutex_enter(&block->mutex);
+		prev_block = UT_LIST_GET_PREV(LRU, block);
+
 		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

 		if (block->space == id
@@ -112,6 +117,8 @@ scan_again:
 			if (block->is_hashed) {
 				page_no = block->offset;

+				mutex_exit(&block->mutex);
+
 				mutex_exit(&(buf_pool->mutex));

 				/* Note that the following call will acquire
@@ -138,7 +145,8 @@ scan_again:
 			buf_LRU_block_free_hashed_page(block);
 		}
 next_page:
-		block = UT_LIST_GET_PREV(LRU, block);
+		mutex_exit(&block->mutex);
+		block = prev_block;
 	}

 	mutex_exit(&(buf_pool->mutex));
@@ -211,6 +219,9 @@ buf_LRU_search_and_free_block(

 	while (block != NULL) {
 		ut_a(block->in_LRU_list);
+
+		mutex_enter(&block->mutex);
+
 		if (buf_flush_ready_for_replace(block)) {

 #ifdef UNIV_DEBUG
@@ -226,6 +237,7 @@ buf_LRU_search_and_free_block(
 			buf_LRU_block_remove_hashed_page(block);

 			mutex_exit(&(buf_pool->mutex));
+			mutex_exit(&block->mutex);

 			/* Remove possible adaptive hash index built on the
 			page; in the case of AWE the block may not have a
@@ -234,15 +246,21 @@ buf_LRU_search_and_free_block(
 			if (block->frame) {
 				btr_search_drop_page_hash_index(block->frame);
 			}
-			mutex_enter(&(buf_pool->mutex));

 			ut_a(block->buf_fix_count == 0);

+			mutex_enter(&(buf_pool->mutex));
+			mutex_enter(&block->mutex);
+
 			buf_LRU_block_free_hashed_page(block);
 			freed = TRUE;
+			mutex_exit(&block->mutex);

 			break;
 		}
+
+		mutex_exit(&block->mutex);
+
 		block = UT_LIST_GET_PREV(LRU, block);
 		distance++;

@@ -428,8 +446,12 @@ loop:
 			}
 		}

+		mutex_enter(&block->mutex);
+
 		block->state = BUF_BLOCK_READY_FOR_USE;

+		mutex_exit(&block->mutex);
+
 		mutex_exit(&(buf_pool->mutex));

 		if (started_monitor) {
@@ -838,6 +860,7 @@ buf_LRU_block_free_non_file_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(block);

@@ -877,6 +900,7 @@ buf_LRU_block_remove_hashed_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(block);

@@ -939,6 +963,7 @@ buf_LRU_block_free_hashed_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(block->state == BUF_BLOCK_REMOVE_HASH);


--- a/storage/innobase/dict/dict0crea.c
+++ b/storage/innobase/dict/dict0crea.c
@@ -700,8 +700,10 @@ dict_truncate_index_tree(
 				/* out: new root page number, or
 				FIL_NULL on failure */
 	dict_table_t*	table,	/* in: the table the index belongs to */
-	rec_t*		rec,	/* in: record in the clustered index of
-				SYS_INDEXES table */
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
+				record in the clustered index of
+				SYS_INDEXES table. The cursor may be
+				repositioned in this call. */
 	mtr_t*		mtr)	/* in: mtr having the latch
 				on the record page. The mtr may be
 				committed and restarted in this call. */
@@ -710,6 +712,7 @@ dict_truncate_index_tree(
 	ulint		space;
 	ulint		type;
 	dulint		index_id;
+	rec_t*		rec;
 	byte*		ptr;
 	ulint		len;
 	ulint		comp;
@@ -720,6 +723,7 @@ dict_truncate_index_tree(
 #endif /* UNIV_SYNC_DEBUG */

 	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+	rec = btr_pcur_get_rec(pcur);
 	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);

 	ut_ad(len == 4);
@@ -785,10 +789,11 @@ dict_truncate_index_tree(
 	/* We will need to commit the mini-transaction in order to avoid
 	deadlocks in the btr_create() call, because otherwise we would
 	be freeing and allocating pages in the same mini-transaction. */
+	btr_pcur_store_position(pcur, mtr);
 	mtr_commit(mtr);
-	/* mtr_commit() will invalidate rec. */
-	rec = NULL;
+
 	mtr_start(mtr);
+	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

 	/* Find the index corresponding to this SYS_INDEXES record. */
 	for (index = UT_LIST_GET_FIRST(table->indexes);

--- a/storage/innobase/fil/fil0fil.c
+++ b/storage/innobase/fil/fil0fil.c
@@ -4416,29 +4416,47 @@ fil_flush_file_spaces(
 {
 	fil_system_t*	system	= fil_system;
 	fil_space_t*	space;
+	ulint*		space_ids;
+	ulint		n_space_ids;
+	ulint		i;

 	mutex_enter(&(system->mutex));

-	space = UT_LIST_GET_FIRST(system->unflushed_spaces);
+	n_space_ids = UT_LIST_GET_LEN(system->unflushed_spaces);
+	if (n_space_ids == 0) {

-	while (space) {
-		if (space->purpose == purpose && !space->is_being_deleted) {
+		mutex_exit(&system->mutex);
+		return;
+	}

-			space->n_pending_flushes++; /* prevent dropping of
-						    the space while we are
-						    flushing */
-			mutex_exit(&(system->mutex));
+	/* Assemble a list of space ids to flush.  Previously, we
+	traversed system->unflushed_spaces and called UT_LIST_GET_NEXT()
+	on a space that was just removed from the list by fil_flush().
+	Thus, the space could be dropped and the memory overwritten. */
+	space_ids = mem_alloc(n_space_ids * sizeof *space_ids);

-			fil_flush(space->id);
+	n_space_ids = 0;

-			mutex_enter(&(system->mutex));
+	for (space = UT_LIST_GET_FIRST(system->unflushed_spaces);
+	     space;
+	     space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {

-			space->n_pending_flushes--;
+		if (space->purpose == purpose && !space->is_being_deleted) {
+
+			space_ids[n_space_ids++] = space->id;
 		}
-		space = UT_LIST_GET_NEXT(unflushed_spaces, space);
 	}

-	mutex_exit(&(system->mutex));
+	mutex_exit(&system->mutex);
+
+	/* Flush the spaces.  It will not hurt to call fil_flush() on
+	a non-existing space id. */
+	for (i = 0; i < n_space_ids; i++) {
+
+		fil_flush(space_ids[i]);
+	}
+
+	mem_free(space_ids);
 }

 /**********************************************************************

--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -23,7 +23,17 @@ special big record storage structure */

 #define	BTR_PAGE_MAX_REC_SIZE	(UNIV_PAGE_SIZE / 2 - 200)

-/* Latching modes for the search function (in btr0cur.*) */
+/* Maximum depth of a B-tree in InnoDB. Note that this isn't a maximum as
+such; none of the tree operations avoid producing trees bigger than this. It
+is instead a "max depth that other code must work with", useful for e.g.
+fixed-size arrays that must store some information about each level in a
+tree. In other words: if a B-tree with bigger depth than this is
+encountered, it is not acceptable for it to lead to mysterious memory
+corruption, but it is acceptable for the program to die with a clear assert
+failure. */
+#define BTR_MAX_LEVELS		100
+
+/* Latching modes for btr_cur_search_to_nth_level(). */
 #define BTR_SEARCH_LEAF		RW_S_LATCH
 #define BTR_MODIFY_LEAF		RW_X_LATCH
 #define BTR_NO_LATCHES		RW_NO_LATCH

--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -461,8 +461,8 @@ Gets the mutex number protecting the page record lock hash chain in the lock
 table. */
 UNIV_INLINE
 mutex_t*
-buf_frame_get_lock_mutex(
-/*=====================*/
+buf_frame_get_mutex(
+/*================*/
 			/* out: mutex */
 	byte*	ptr);	/* in: pointer to within a buffer frame */
 /***********************************************************************
@@ -713,7 +713,10 @@ struct buf_block_struct{

 	ulint		magic_n;	/* magic number to check */
 	ulint		state;		/* state of the control block:
-					BUF_BLOCK_NOT_USED, ... */
+					BUF_BLOCK_NOT_USED, ...; changing
+					this is only allowed when a thread
+					has BOTH the buffer pool mutex AND
+					block->mutex locked */
 	byte*		frame;		/* pointer to buffer frame which
 					is of size UNIV_PAGE_SIZE, and
 					aligned to an address divisible by
@@ -731,8 +734,12 @@ struct buf_block_struct{
 	ulint		offset;		/* page number within the space */
 	ulint		lock_hash_val;	/* hashed value of the page address
 					in the record lock hash table */
-	mutex_t*	lock_mutex;	/* mutex protecting the chain in the
-					record lock hash table */
+	mutex_t		mutex;		/* mutex protecting this block:
+					state (also protected by the buffer
+					pool mutex), io_fix, buf_fix_count,
+					and accessed; we introduce this new
+					mutex in InnoDB-5.1 to relieve
+					contention on the buffer pool mutex */
 	rw_lock_t	lock;		/* read-write lock of the buffer
 					frame */
 	buf_block_t*	hash;		/* node used in chaining to the page
@@ -788,20 +795,27 @@ struct buf_block_struct{
 					in heuristic algorithms, because of
 					the possibility of a wrap-around! */
 	ulint		freed_page_clock;/* the value of freed_page_clock
-					buffer pool when this block was
-					last time put to the head of the
-					LRU list */
+					of the buffer pool when this block was
+					the last time put to the head of the
+					LRU list; a thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ibool		old;		/* TRUE if the block is in the old
 					blocks in the LRU list */
 	ibool		accessed;	/* TRUE if the page has been accessed
 					while in the buffer pool: read-ahead
 					may read in pages which have not been
-					accessed yet */
+					accessed yet; this is protected by
+					block->mutex; a thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ulint		buf_fix_count;	/* count of how manyfold this block
-					is currently bufferfixed */
+					is currently bufferfixed; this is
+					protected by block->mutex */
 	ulint		io_fix;		/* if a read is pending to the frame,
 					io_fix is BUF_IO_READ, in the case
-					of a write BUF_IO_WRITE, otherwise 0 */
+					of a write BUF_IO_WRITE, otherwise 0;
+					this is protected by block->mutex */
 	/* 4. Optimistic search field */

 	dulint		modify_clock;	/* this clock is incremented every
@@ -959,7 +973,9 @@ struct buf_pool_struct{
 					number of buffer blocks removed from
 					the end of the LRU list; NOTE that
 					this counter may wrap around at 4
-					billion! */
+					billion! A thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ulint		LRU_flush_ended;/* when an LRU flush ends for a page,
 					this is incremented by one; this is
 					set to zero when a buffer block is

--- a/storage/innobase/include/buf0buf.ic
+++ b/storage/innobase/include/buf0buf.ic
@@ -337,8 +337,8 @@ Gets the mutex number protecting the page record lock hash chain in the lock
 table. */
 UNIV_INLINE
 mutex_t*
-buf_frame_get_lock_mutex(
-/*=====================*/
+buf_frame_get_mutex(
+/*================*/
 			/* out: mutex */
 	byte*	ptr)	/* in: pointer to within a buffer frame */
 {
@@ -346,7 +346,7 @@ buf_frame_get_lock_mutex(

 	block = buf_block_align(ptr);

-	return(block->lock_mutex);
+	return(&block->mutex);
 }

 /*************************************************************************
@@ -519,6 +519,7 @@ buf_block_buf_fix_inc_debug(
 	ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line);

 	ut_ad(ret == TRUE);
+	ut_ad(mutex_own(&block->mutex));
 #endif
 	block->buf_fix_count++;
 }
@@ -531,6 +532,9 @@ buf_block_buf_fix_inc(
 /*==================*/
 	buf_block_t*	block)	/* in: block to bufferfix */
 {
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&block->mutex));
+#endif
 	block->buf_fix_count++;
 }
 #endif /* UNIV_SYNC_DEBUG */
@@ -625,23 +629,24 @@ buf_page_release(

 	ut_ad(block);

-	mutex_enter_fast(&(buf_pool->mutex));
-
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 	ut_a(block->buf_fix_count > 0);

 	if (rw_latch == RW_X_LATCH && mtr->modifications) {
-
+		mutex_enter(&buf_pool->mutex);
 		buf_flush_note_modification(block, mtr);
+		mutex_exit(&buf_pool->mutex);
 	}

+	mutex_enter(&block->mutex);
+
 #ifdef UNIV_SYNC_DEBUG
 	rw_lock_s_unlock(&(block->debug_latch));
 #endif
 	buf_fix_count = block->buf_fix_count;
 	block->buf_fix_count = buf_fix_count - 1;

-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);

 	if (rw_latch == RW_S_LATCH) {
 		rw_lock_s_unlock(&(block->lock));

--- a/storage/innobase/include/dict0crea.h
+++ b/storage/innobase/include/dict0crea.h
@@ -62,8 +62,10 @@ dict_truncate_index_tree(
 				/* out: new root page number, or
 				FIL_NULL on failure */
 	dict_table_t*	table,	/* in: the table the index belongs to */
-	rec_t*		rec,	/* in: record in the clustered index of
-				SYS_INDEXES table */
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
+				record in the clustered index of
+				SYS_INDEXES table. The cursor may be
+				repositioned in this call. */
 	mtr_t*		mtr);	/* in: mtr having the latch
 				on the record page. The mtr may be
 				committed and restarted in this call. */

--- a/storage/innobase/include/ut0lst.h
+++ b/storage/innobase/include/ut0lst.h
@@ -123,27 +123,36 @@ name, NODE1 and NODE2 are pointers to nodes. */
 	}\
 }\

+/* Invalidate the pointers in a list node. */
+#ifdef UNIV_DEBUG
+# define UT_LIST_REMOVE_CLEAR(NAME, N)		\
+((N)->NAME.prev = (N)->NAME.next = (void*) -1)
+#else
+# define UT_LIST_REMOVE_CLEAR(NAME, N) while (0)
+#endif
+
 /***********************************************************************
 Removes a node from a two-way linked list. BASE has to be the base node
 (not a pointer to it). N has to be the pointer to the node to be removed
 from the list. NAME is the list name. */

-#define UT_LIST_REMOVE(NAME, BASE, N)\
-{\
-	ut_ad(N);\
-	ut_a((BASE).count > 0);\
-	((BASE).count)--;\
-	if (((N)->NAME).next != NULL) {\
-		((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;\
-	} else {\
-		(BASE).end = ((N)->NAME).prev;\
-	}\
-	if (((N)->NAME).prev != NULL) {\
-		((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;\
-	} else {\
-		(BASE).start = ((N)->NAME).next;\
-	}\
-}\
+#define UT_LIST_REMOVE(NAME, BASE, N)					\
+do {									\
+	ut_ad(N);							\
+	ut_a((BASE).count > 0);						\
+	((BASE).count)--;						\
+	if (((N)->NAME).next != NULL) {					\
+		((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;	\
+	} else {							\
+		(BASE).end = ((N)->NAME).prev;				\
+	}								\
+	if (((N)->NAME).prev != NULL) {					\
+		((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;	\
+	} else {							\
+		(BASE).start = ((N)->NAME).next;			\
+	}								\
+	UT_LIST_REMOVE_CLEAR(NAME, N);					\
+} while (0)

 /************************************************************************
 Gets the next node in a two-way list. NAME is the name of the list

--- a/storage/innobase/row/row0mysql.c
+++ b/storage/innobase/row/row0mysql.c
@@ -2820,12 +2820,10 @@ row_truncate_table_for_mysql(
 			goto next_rec;
 		}

-		btr_pcur_store_position(&pcur, &mtr);
+		/* This call may commit and restart mtr
+		and reposition pcur. */
+		root_page_no = dict_truncate_index_tree(table, &pcur, &mtr);

-		/* This call may commit and restart mtr. */
-		root_page_no = dict_truncate_index_tree(table, rec, &mtr);
-
-		btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr);
 		rec = btr_pcur_get_rec(&pcur);

 		if (root_page_no != FIL_NULL) {