Applied innodb-5.1-ss1004

Fixes: - Bug #15815: Very poor performance with multiple queries running concurrently - Bug #22868: 'Thread thrashing' with > 50 concurrent conns under an upd-intensive workload - Bug #24089: Race condition in fil_flush_file_spaces()

Applied innodb-5.1-ss1004
Fixes: - Bug #15815: Very poor performance with multiple queries running concurrently - Bug #22868: 'Thread thrashing' with > 50 concurrent conns under an upd-intensive workload - Bug #24089: Race condition in fil_flush_file_spaces()
18080530 · tsmith@quadxeon.mysql.com · a77dde86 · 18080530 · 18080530 · 18080530
Commit 18080530 authored Nov 09, 2006 by tsmith@quadxeon.mysql.com
12 changed files
--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.c
@@ -1949,7 +1949,12 @@ btr_lift_page_up(
 	mtr_t*		mtr)	/* in: mtr */
 {
 	page_t*		father_page;
+	page_t*		iter_page;
+	page_t*		pages[BTR_MAX_LEVELS];
 	ulint		page_level;
+	ulint		root_page_no;
+	ulint		ancestors;
+	ulint		i;
 	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
 	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
@@ -1959,6 +1964,30 @@ btr_lift_page_up(
 		btr_page_get_father_node_ptr(index, page, mtr));
 	page_level = btr_page_get_level(page, mtr);
+	root_page_no = dict_index_get_page(index);
+	ancestors = 1;
+	pages[0] = father_page;
+	/* Store all ancestor pages so we can reset their levels later on.
+	We have to do all the searches on the tree now because later on,
+	after we've replaced the first level, the tree is in an inconsistent
+	state and can not be searched. */
+	iter_page = father_page;
+	for (;;) {
+		if (buf_block_get_page_no(buf_block_align(iter_page))
+		    == root_page_no) {
+			break;
+		}
+		ut_a(ancestors < BTR_MAX_LEVELS);
+		iter_page = buf_frame_align(
+			btr_page_get_father_node_ptr(index, iter_page, mtr));
+		pages[ancestors++] = iter_page;
+	}
 	btr_search_drop_page_hash_index(page);
@@ -1970,7 +1999,15 @@ btr_lift_page_up(
 			       index, mtr);
 	lock_update_copy_and_discard(father_page, page);
-	btr_page_set_level(father_page, page_level, mtr);
+	/* Go upward to root page, decreasing levels by one. */
+	for (i = 0; i < ancestors; i++) {
+		iter_page = pages[i];
+		ut_ad(btr_page_get_level(iter_page, mtr) == (page_level + 1));
+		btr_page_set_level(iter_page, page_level, mtr);
+		page_level++;
+	}
 	/* Free the file page */
 	btr_page_free(index, page, mtr);

--- a/storage/innobase/buf/buf0buf.c
+++ b/storage/innobase/buf/buf0buf.c
--- a/storage/innobase/buf/buf0flu.c
+++ b/storage/innobase/buf/buf0flu.c
@@ -113,6 +113,7 @@ buf_flush_ready_for_replace(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	if (block->state != BUF_BLOCK_FILE_PAGE) {
 		ut_print_timestamp(stderr);
@@ -148,6 +149,7 @@ buf_flush_ready_for_flush(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&(block->mutex)));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
@@ -559,8 +561,15 @@ buf_flush_try_page(
 	ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
+	if (!block) {
+		mutex_exit(&(buf_pool->mutex));
+		return(0);
+	}
+	mutex_enter(&block->mutex);
 	if (flush_type == BUF_FLUSH_LIST
-	    && block && buf_flush_ready_for_flush(block, flush_type)) {
+	    && buf_flush_ready_for_flush(block, flush_type)) {
 		block->io_fix = BUF_IO_WRITE;
@@ -598,6 +607,7 @@ buf_flush_try_page(
 			locked = TRUE;
 		}
+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));
 		if (!locked) {
@@ -618,7 +628,7 @@ buf_flush_try_page(
 		return(1);
-	} else if (flush_type == BUF_FLUSH_LRU && block
+	} else if (flush_type == BUF_FLUSH_LRU
 		   && buf_flush_ready_for_flush(block, flush_type)) {
 		/* VERY IMPORTANT:
@@ -659,13 +669,14 @@ buf_flush_try_page(
 		buf_pool mutex: this ensures that the latch is acquired
 		immediately. */
+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));
 		buf_flush_write_block_low(block);
 		return(1);
-	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
+	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE
 		   && buf_flush_ready_for_flush(block, flush_type)) {
 		block->io_fix = BUF_IO_WRITE;
@@ -692,6 +703,7 @@ buf_flush_try_page(
 		(buf_pool->n_flush[flush_type])++;
+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));
 		rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
@@ -709,11 +721,12 @@ buf_flush_try_page(
 		buf_flush_write_block_low(block);
 		return(1);
-	} else {
-		mutex_exit(&(buf_pool->mutex));
-		return(0);
 	}
+	mutex_exit(&block->mutex);
+	mutex_exit(&(buf_pool->mutex));
+	return(0);
 }
 /***************************************************************
@@ -758,34 +771,48 @@ buf_flush_try_neighbors(
 		block = buf_page_hash_get(space, i);
 		ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
-		if (block && flush_type == BUF_FLUSH_LRU && i != offset
+		if (!block) {
-		    && !block->old) {
+			continue;
+		} else if (flush_type == BUF_FLUSH_LRU && i != offset
+			   && !block->old) {
 			/* We avoid flushing 'non-old' blocks in an LRU flush,
 			because the flushed blocks are soon freed */
 			continue;
-		}
+		} else {
-		if (block && buf_flush_ready_for_flush(block, flush_type)
+			mutex_enter(&block->mutex);
-		    && (i == offset || block->buf_fix_count == 0)) {
-			/* We only try to flush those neighbors != offset
-			where the buf fix count is zero, as we then know that
-			we probably can latch the page without a semaphore
-			wait. Semaphore waits are expensive because we must
-			flush the doublewrite buffer before we start
-			waiting. */
-			mutex_exit(&(buf_pool->mutex));
+			if (buf_flush_ready_for_flush(block, flush_type)
+			    && (i == offset || block->buf_fix_count == 0)) {
+				/* We only try to flush those
+				neighbors != offset where the buf fix count is
+				zero, as we then know that we probably can
+				latch the page without a semaphore wait.
+				Semaphore waits are expensive because we must
+				flush the doublewrite buffer before we start
+				waiting. */
-			/* Note: as we release the buf_pool mutex above, in
+				mutex_exit(&block->mutex);
-			buf_flush_try_page we cannot be sure the page is still
-			in a flushable state: therefore we check it again
-			inside that function. */
-			count += buf_flush_try_page(space, i, flush_type);
+				mutex_exit(&(buf_pool->mutex));
-			mutex_enter(&(buf_pool->mutex));
+				/* Note: as we release the buf_pool mutex
+				above, in buf_flush_try_page we cannot be sure
+				the page is still in a flushable state:
+				therefore we check it again inside that
+				function. */
+				count += buf_flush_try_page(space, i,
+							    flush_type);
+				mutex_enter(&(buf_pool->mutex));
+			} else {
+				mutex_exit(&block->mutex);
+			}
 		}
 	}
@@ -879,12 +906,15 @@ buf_flush_batch(
 		while ((block != NULL) && !found) {
 			ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+			mutex_enter(&block->mutex);
 			if (buf_flush_ready_for_flush(block, flush_type)) {
 				found = TRUE;
 				space = block->space;
 				offset = block->offset;
+				mutex_exit(&block->mutex);
 				mutex_exit(&(buf_pool->mutex));
 				old_page_count = page_count;
@@ -901,10 +931,14 @@ buf_flush_batch(
 			} else if (flush_type == BUF_FLUSH_LRU) {
+				mutex_exit(&block->mutex);
 				block = UT_LIST_GET_PREV(LRU, block);
 			} else {
 				ut_ad(flush_type == BUF_FLUSH_LIST);
+				mutex_exit(&block->mutex);
 				block = UT_LIST_GET_PREV(flush_list, block);
 			}
 		}
@@ -986,10 +1020,14 @@ buf_flush_LRU_recommendation(void)
 		   + BUF_FLUSH_EXTRA_MARGIN)
 	       && (distance < BUF_LRU_FREE_SEARCH_LEN)) {
+		mutex_enter(&block->mutex);
 		if (buf_flush_ready_for_replace(block)) {
 			n_replaceable++;
 		}
+		mutex_exit(&block->mutex);
 		distance++;
 		block = UT_LIST_GET_PREV(LRU, block);

--- a/storage/innobase/buf/buf0lru.c
+++ b/storage/innobase/buf/buf0lru.c
@@ -86,6 +86,11 @@ scan_again:
 	block = UT_LIST_GET_LAST(buf_pool->LRU);
 	while (block != NULL) {
+		buf_block_t*	prev_block;
+		mutex_enter(&block->mutex);
+		prev_block = UT_LIST_GET_PREV(LRU, block);
 		ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 		if (block->space == id
@@ -112,6 +117,8 @@ scan_again:
 			if (block->is_hashed) {
 				page_no = block->offset;
+				mutex_exit(&block->mutex);
 				mutex_exit(&(buf_pool->mutex));
 				/* Note that the following call will acquire
@@ -138,7 +145,8 @@ scan_again:
 			buf_LRU_block_free_hashed_page(block);
 		}
 next_page:
-		block = UT_LIST_GET_PREV(LRU, block);
+		mutex_exit(&block->mutex);
+		block = prev_block;
 	}
 	mutex_exit(&(buf_pool->mutex));
@@ -211,6 +219,9 @@ buf_LRU_search_and_free_block(
 	while (block != NULL) {
 		ut_a(block->in_LRU_list);
+		mutex_enter(&block->mutex);
 		if (buf_flush_ready_for_replace(block)) {
 #ifdef UNIV_DEBUG
@@ -226,6 +237,7 @@ buf_LRU_search_and_free_block(
 			buf_LRU_block_remove_hashed_page(block);
 			mutex_exit(&(buf_pool->mutex));
+			mutex_exit(&block->mutex);
 			/* Remove possible adaptive hash index built on the
 			page; in the case of AWE the block may not have a
@@ -234,15 +246,21 @@ buf_LRU_search_and_free_block(
 			if (block->frame) {
 				btr_search_drop_page_hash_index(block->frame);
 			}
-			mutex_enter(&(buf_pool->mutex));
 			ut_a(block->buf_fix_count == 0);
+			mutex_enter(&(buf_pool->mutex));
+			mutex_enter(&block->mutex);
 			buf_LRU_block_free_hashed_page(block);
 			freed = TRUE;
+			mutex_exit(&block->mutex);
 			break;
 		}
+		mutex_exit(&block->mutex);
 		block = UT_LIST_GET_PREV(LRU, block);
 		distance++;
@@ -428,8 +446,12 @@ loop:
 			}
 		}
+		mutex_enter(&block->mutex);
 		block->state = BUF_BLOCK_READY_FOR_USE;
+		mutex_exit(&block->mutex);
 		mutex_exit(&(buf_pool->mutex));
 		if (started_monitor) {
@@ -838,6 +860,7 @@ buf_LRU_block_free_non_file_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(block);
@@ -877,6 +900,7 @@ buf_LRU_block_remove_hashed_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_ad(block);
@@ -939,6 +963,7 @@ buf_LRU_block_free_hashed_page(
 {
 #ifdef UNIV_SYNC_DEBUG
 	ut_ad(mutex_own(&(buf_pool->mutex)));
+	ut_ad(mutex_own(&block->mutex));
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(block->state == BUF_BLOCK_REMOVE_HASH);

--- a/storage/innobase/dict/dict0crea.c
+++ b/storage/innobase/dict/dict0crea.c
@@ -700,8 +700,10 @@ dict_truncate_index_tree(
 				/* out: new root page number, or
 				FIL_NULL on failure */
 	dict_table_t*	table,	/* in: the table the index belongs to */
-	rec_t*		rec,	/* in: record in the clustered index of
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
-				SYS_INDEXES table */
+				record in the clustered index of
+				SYS_INDEXES table. The cursor may be
+				repositioned in this call. */
 	mtr_t*		mtr)	/* in: mtr having the latch
 				on the record page. The mtr may be
 				committed and restarted in this call. */
@@ -710,6 +712,7 @@ dict_truncate_index_tree(
 	ulint		space;
 	ulint		type;
 	dulint		index_id;
+	rec_t*		rec;
 	byte*		ptr;
 	ulint		len;
 	ulint		comp;
@@ -720,6 +723,7 @@ dict_truncate_index_tree(
 #endif /* UNIV_SYNC_DEBUG */
 	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+	rec = btr_pcur_get_rec(pcur);
 	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
 	ut_ad(len == 4);
@@ -785,10 +789,11 @@ dict_truncate_index_tree(
 	/* We will need to commit the mini-transaction in order to avoid
 	deadlocks in the btr_create() call, because otherwise we would
 	be freeing and allocating pages in the same mini-transaction. */
+	btr_pcur_store_position(pcur, mtr);
 	mtr_commit(mtr);
-	/* mtr_commit() will invalidate rec. */
-	rec = NULL;
 	mtr_start(mtr);
+	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
 	/* Find the index corresponding to this SYS_INDEXES record. */
 	for (index = UT_LIST_GET_FIRST(table->indexes);

--- a/storage/innobase/fil/fil0fil.c
+++ b/storage/innobase/fil/fil0fil.c
@@ -4416,29 +4416,47 @@ fil_flush_file_spaces(
 {
 	fil_system_t*	system	= fil_system;
 	fil_space_t*	space;
+	ulint*		space_ids;
+	ulint		n_space_ids;
+	ulint		i;
 	mutex_enter(&(system->mutex));
-	space = UT_LIST_GET_FIRST(system->unflushed_spaces);
+	n_space_ids = UT_LIST_GET_LEN(system->unflushed_spaces);
+	if (n_space_ids == 0) {
-	while (space) {
+		mutex_exit(&system->mutex);
-		if (space->purpose == purpose && !space->is_being_deleted) {
+		return;
+	}
-			space->n_pending_flushes++; /* prevent dropping of
+	/* Assemble a list of space ids to flush.  Previously, we
-						    the space while we are
+	traversed system->unflushed_spaces and called UT_LIST_GET_NEXT()
-						    flushing */
+	on a space that was just removed from the list by fil_flush().
-			mutex_exit(&(system->mutex));
+	Thus, the space could be dropped and the memory overwritten. */
+	space_ids = mem_alloc(n_space_ids * sizeof *space_ids);
-			fil_flush(space->id);
+	n_space_ids = 0;
-			mutex_enter(&(system->mutex));
+	for (space = UT_LIST_GET_FIRST(system->unflushed_spaces);
+	     space;
+	     space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {
-			space->n_pending_flushes--;
+		if (space->purpose == purpose && !space->is_being_deleted) {
+			space_ids[n_space_ids++] = space->id;
 		}
-		space = UT_LIST_GET_NEXT(unflushed_spaces, space);
 	}
-	mutex_exit(&(system->mutex));
+	mutex_exit(&system->mutex);
+	/* Flush the spaces.  It will not hurt to call fil_flush() on
+	a non-existing space id. */
+	for (i = 0; i < n_space_ids; i++) {
+		fil_flush(space_ids[i]);
+	}
+	mem_free(space_ids);
 }
 /**********************************************************************

--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -23,7 +23,17 @@ special big record storage structure */
 #define	BTR_PAGE_MAX_REC_SIZE	(UNIV_PAGE_SIZE / 2 - 200)
-/* Latching modes for the search function (in btr0cur.*) */
+/* Maximum depth of a B-tree in InnoDB. Note that this isn't a maximum as
+such; none of the tree operations avoid producing trees bigger than this. It
+is instead a "max depth that other code must work with", useful for e.g.
+fixed-size arrays that must store some information about each level in a
+tree. In other words: if a B-tree with bigger depth than this is
+encountered, it is not acceptable for it to lead to mysterious memory
+corruption, but it is acceptable for the program to die with a clear assert
+failure. */
+#define BTR_MAX_LEVELS		100
+/* Latching modes for btr_cur_search_to_nth_level(). */
 #define BTR_SEARCH_LEAF		RW_S_LATCH
 #define BTR_MODIFY_LEAF		RW_X_LATCH
 #define BTR_NO_LATCHES		RW_NO_LATCH

--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -461,8 +461,8 @@ Gets the mutex number protecting the page record lock hash chain in the lock
 table. */
 UNIV_INLINE
 mutex_t*
-buf_frame_get_lock_mutex(
+buf_frame_get_mutex(
-/*=====================*/
+/*================*/
 			/* out: mutex */
 	byte*	ptr);	/* in: pointer to within a buffer frame */
 /***********************************************************************
@@ -713,7 +713,10 @@ struct buf_block_struct{
 	ulint		magic_n;	/* magic number to check */
 	ulint		state;		/* state of the control block:
-					BUF_BLOCK_NOT_USED, ... */
+					BUF_BLOCK_NOT_USED, ...; changing
+					this is only allowed when a thread
+					has BOTH the buffer pool mutex AND
+					block->mutex locked */
 	byte*		frame;		/* pointer to buffer frame which
 					is of size UNIV_PAGE_SIZE, and
 					aligned to an address divisible by
@@ -731,8 +734,12 @@ struct buf_block_struct{
 	ulint		offset;		/* page number within the space */
 	ulint		lock_hash_val;	/* hashed value of the page address
 					in the record lock hash table */
-	mutex_t*	lock_mutex;	/* mutex protecting the chain in the
+	mutex_t		mutex;		/* mutex protecting this block:
-					record lock hash table */
+					state (also protected by the buffer
+					pool mutex), io_fix, buf_fix_count,
+					and accessed; we introduce this new
+					mutex in InnoDB-5.1 to relieve
+					contention on the buffer pool mutex */
 	rw_lock_t	lock;		/* read-write lock of the buffer
 					frame */
 	buf_block_t*	hash;		/* node used in chaining to the page
@@ -788,20 +795,27 @@ struct buf_block_struct{
 					in heuristic algorithms, because of
 					the possibility of a wrap-around! */
 	ulint		freed_page_clock;/* the value of freed_page_clock
-					buffer pool when this block was
+					of the buffer pool when this block was
-					last time put to the head of the
+					the last time put to the head of the
-					LRU list */
+					LRU list; a thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ibool		old;		/* TRUE if the block is in the old
 					blocks in the LRU list */
 	ibool		accessed;	/* TRUE if the page has been accessed
 					while in the buffer pool: read-ahead
 					may read in pages which have not been
-					accessed yet */
+					accessed yet; this is protected by
+					block->mutex; a thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ulint		buf_fix_count;	/* count of how manyfold this block
-					is currently bufferfixed */
+					is currently bufferfixed; this is
+					protected by block->mutex */
 	ulint		io_fix;		/* if a read is pending to the frame,
 					io_fix is BUF_IO_READ, in the case
-					of a write BUF_IO_WRITE, otherwise 0 */
+					of a write BUF_IO_WRITE, otherwise 0;
+					this is protected by block->mutex */
 	/* 4. Optimistic search field */
 	dulint		modify_clock;	/* this clock is incremented every
@@ -959,7 +973,9 @@ struct buf_pool_struct{
 					number of buffer blocks removed from
 					the end of the LRU list; NOTE that
 					this counter may wrap around at 4
-					billion! */
+					billion! A thread is allowed to
+					read this for heuristic purposes
+					without holding any mutex or latch */
 	ulint		LRU_flush_ended;/* when an LRU flush ends for a page,
 					this is incremented by one; this is
 					set to zero when a buffer block is

--- a/storage/innobase/include/buf0buf.ic
+++ b/storage/innobase/include/buf0buf.ic
@@ -337,8 +337,8 @@ Gets the mutex number protecting the page record lock hash chain in the lock
 table. */
 UNIV_INLINE
 mutex_t*
-buf_frame_get_lock_mutex(
+buf_frame_get_mutex(
-/*=====================*/
+/*================*/
 			/* out: mutex */
 	byte*	ptr)	/* in: pointer to within a buffer frame */
 {
@@ -346,7 +346,7 @@ buf_frame_get_lock_mutex(
 	block = buf_block_align(ptr);
-	return(block->lock_mutex);
+	return(&block->mutex);
 }
 /*************************************************************************
@@ -519,6 +519,7 @@ buf_block_buf_fix_inc_debug(
 	ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line);
 	ut_ad(ret == TRUE);
+	ut_ad(mutex_own(&block->mutex));
 #endif
 	block->buf_fix_count++;
 }
@@ -531,6 +532,9 @@ buf_block_buf_fix_inc(
 /*==================*/
 	buf_block_t*	block)	/* in: block to bufferfix */
 {
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&block->mutex));
+#endif
 	block->buf_fix_count++;
 }
 #endif /* UNIV_SYNC_DEBUG */
@@ -625,23 +629,24 @@ buf_page_release(
 	ut_ad(block);
-	mutex_enter_fast(&(buf_pool->mutex));
 	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
 	ut_a(block->buf_fix_count > 0);
 	if (rw_latch == RW_X_LATCH && mtr->modifications) {
+		mutex_enter(&buf_pool->mutex);
 		buf_flush_note_modification(block, mtr);
+		mutex_exit(&buf_pool->mutex);
 	}
+	mutex_enter(&block->mutex);
 #ifdef UNIV_SYNC_DEBUG
 	rw_lock_s_unlock(&(block->debug_latch));
 #endif
 	buf_fix_count = block->buf_fix_count;
 	block->buf_fix_count = buf_fix_count - 1;
-	mutex_exit(&(buf_pool->mutex));
+	mutex_exit(&block->mutex);
 	if (rw_latch == RW_S_LATCH) {
 		rw_lock_s_unlock(&(block->lock));

--- a/storage/innobase/include/dict0crea.h
+++ b/storage/innobase/include/dict0crea.h
@@ -62,8 +62,10 @@ dict_truncate_index_tree(
 				/* out: new root page number, or
 				FIL_NULL on failure */
 	dict_table_t*	table,	/* in: the table the index belongs to */
-	rec_t*		rec,	/* in: record in the clustered index of
+	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
-				SYS_INDEXES table */
+				record in the clustered index of
+				SYS_INDEXES table. The cursor may be
+				repositioned in this call. */
 	mtr_t*		mtr);	/* in: mtr having the latch
 				on the record page. The mtr may be
 				committed and restarted in this call. */

--- a/storage/innobase/include/ut0lst.h
+++ b/storage/innobase/include/ut0lst.h
@@ -123,27 +123,36 @@ name, NODE1 and NODE2 are pointers to nodes. */
 	}\
 }\
+/* Invalidate the pointers in a list node. */
+#ifdef UNIV_DEBUG
+# define UT_LIST_REMOVE_CLEAR(NAME, N)		\
+((N)->NAME.prev = (N)->NAME.next = (void*) -1)
+#else
+# define UT_LIST_REMOVE_CLEAR(NAME, N) while (0)
+#endif
 /***********************************************************************
 Removes a node from a two-way linked list. BASE has to be the base node
 (not a pointer to it). N has to be the pointer to the node to be removed
 from the list. NAME is the list name. */
-#define UT_LIST_REMOVE(NAME, BASE, N)\
+#define UT_LIST_REMOVE(NAME, BASE, N)					\
-{\
+do {									\
-	ut_ad(N);\
+	ut_ad(N);							\
-	ut_a((BASE).count > 0);\
+	ut_a((BASE).count > 0);						\
-	((BASE).count)--;\
+	((BASE).count)--;						\
-	if (((N)->NAME).next != NULL) {\
+	if (((N)->NAME).next != NULL) {					\
-		((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;\
+		((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;	\
-	} else {\
+	} else {							\
-		(BASE).end = ((N)->NAME).prev;\
+		(BASE).end = ((N)->NAME).prev;				\
-	}\
+	}								\
-	if (((N)->NAME).prev != NULL) {\
+	if (((N)->NAME).prev != NULL) {					\
-		((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;\
+		((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;	\
-	} else {\
+	} else {							\
-		(BASE).start = ((N)->NAME).next;\
+		(BASE).start = ((N)->NAME).next;			\
-	}\
+	}								\
-}\
+	UT_LIST_REMOVE_CLEAR(NAME, N);					\
+} while (0)
 /************************************************************************
 Gets the next node in a two-way list. NAME is the name of the list

--- a/storage/innobase/row/row0mysql.c
+++ b/storage/innobase/row/row0mysql.c
@@ -2820,12 +2820,10 @@ row_truncate_table_for_mysql(
 			goto next_rec;
 		}
-		btr_pcur_store_position(&pcur, &mtr);
+		/* This call may commit and restart mtr
+		and reposition pcur. */
+		root_page_no = dict_truncate_index_tree(table, &pcur, &mtr);
-		/* This call may commit and restart mtr. */
-		root_page_no = dict_truncate_index_tree(table, rec, &mtr);
-		btr_pcur_restore_position(BTR_MODIFY_LEAF, &pcur, &mtr);
 		rec = btr_pcur_get_rec(&pcur);
 		if (root_page_no != FIL_NULL) {