Updated with XtraDB from Percona Server 5.5.17-rel22.1

Files copied from Percona-Server-5.5.17-rel22.1.tar.gz source tarball.

Updated with XtraDB from Percona Server 5.5.17-rel22.1
Files copied from Percona-Server-5.5.17-rel22.1.tar.gz source tarball.
6afbf295 · unknown · d4d7a8fa · 6afbf295 · 6afbf295 · 6afbf295
Commit 6afbf295 authored Dec 14, 2011 by unknown
103 changed files
--- a/btr/btr0btr.c
+++ b/btr/btr0btr.c
@@ -690,7 +690,8 @@ btr_root_block_get(
 	zip_size = dict_table_zip_size(index->table);
 	root_page_no = dict_index_get_page(index);

-	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
+			      index, mtr);

 	if (srv_pass_corrupt_table && !block) {
 		return(0);
@@ -897,7 +898,7 @@ btr_page_alloc_for_ibuf(
 				 dict_table_zip_size(index->table),
 				 node_addr.page, RW_X_LATCH, mtr);
 	new_page = buf_block_get_frame(new_block);
-	buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
+	buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);

 	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
 		    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
@@ -1151,7 +1152,7 @@ btr_node_ptr_get_child(
 	page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

 	return(btr_block_get(space, dict_table_zip_size(index->table),
-			     page_no, RW_X_LATCH, mtr));
+			     page_no, RW_X_LATCH, index, mtr));
 }

 /************************************************************//**
@@ -1324,7 +1325,8 @@ btr_create(
 			space, 0,
 			IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);

-		buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW);
+		buf_block_dbg_add_level(
+			ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);

 		ut_ad(buf_block_get_page_no(ibuf_hdr_block)
 		      == IBUF_HEADER_PAGE_NO);
@@ -1362,10 +1364,9 @@ btr_create(
 	page_no = buf_block_get_page_no(block);
 	frame = buf_block_get_frame(block);

-	buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
-
 	if (type & DICT_IBUF) {
 		/* It is an insert buffer tree: initialize the free list */
+		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);

 		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

@@ -1373,6 +1374,8 @@ btr_create(
 	} else {
 		/* It is a non-ibuf tree: create a file segment for leaf
 		pages */
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+
 		if (!fseg_create(space, page_no,
 				 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
 			/* Not enough space for new segment, free root
@@ -1444,14 +1447,15 @@ btr_free_but_not_root(
 leaf_loop:
 	mtr_start(&mtr);

-	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
+	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
+			    NULL, &mtr);

 	if (srv_pass_corrupt_table && !root) {
 		mtr_commit(&mtr);
 		return;
 	}
 	ut_a(root);
-	
+
 #ifdef UNIV_BTR_DEBUG
 	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
 				    + root, space));
@@ -1473,7 +1477,8 @@ leaf_loop:
 top_loop:
 	mtr_start(&mtr);

-	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
+	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
+			    NULL, &mtr);

 	if (srv_pass_corrupt_table && !root) {
 		mtr_commit(&mtr);
@@ -1505,13 +1510,13 @@ btr_free_root(
 	ulint	zip_size,	/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint	root_page_no,	/*!< in: root page number */
-	mtr_t*	mtr)		/*!< in: a mini-transaction which has already
-				been started */
+	mtr_t*	mtr)		/*!< in/out: mini-transaction */
 {
 	buf_block_t*	block;
 	fseg_header_t*	header;

-	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
+			      NULL, mtr);

 	if (srv_pass_corrupt_table && !block) {
 		return;
@@ -2395,9 +2400,8 @@ btr_attach_half_pages(
 	/* Update page links of the level */

 	if (prev_page_no != FIL_NULL) {
-		buf_block_t*	prev_block = btr_block_get(space, zip_size,
-							   prev_page_no,
-							   RW_X_LATCH, mtr);
+		buf_block_t*	prev_block = btr_block_get(
+			space, zip_size, prev_page_no, RW_X_LATCH, index, mtr);
 #ifdef UNIV_BTR_DEBUG
 		ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
 		ut_a(btr_page_get_next(prev_block->frame, mtr)
@@ -2410,9 +2414,8 @@ btr_attach_half_pages(
 	}

 	if (next_page_no != FIL_NULL) {
-		buf_block_t*	next_block = btr_block_get(space, zip_size,
-							   next_page_no,
-							   RW_X_LATCH, mtr);
+		buf_block_t*	next_block = btr_block_get(
+			space, zip_size, next_page_no, RW_X_LATCH, index, mtr);
 #ifdef UNIV_BTR_DEBUG
 		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
 		ut_a(btr_page_get_prev(next_block->frame, mtr)
@@ -2834,17 +2837,42 @@ func_exit:
 	return(rec);
 }

+#ifdef UNIV_SYNC_DEBUG
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space	in: space where removed
+@param zip_size	in: compressed page size in bytes, or 0 for uncompressed
+@param page	in/out: page to remove
+@param index	in: index tree
+@param mtr	in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr)		\
+	btr_level_list_remove_func(space,zip_size,page,index,mtr)
+#else /* UNIV_SYNC_DEBUG */
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space	in: space where removed
+@param zip_size	in: compressed page size in bytes, or 0 for uncompressed
+@param page	in/out: page to remove
+@param index	in: index tree
+@param mtr	in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr)		\
+	btr_level_list_remove_func(space,zip_size,page,mtr)
+#endif /* UNIV_SYNC_DEBUG */
+
 /*************************************************************//**
 Removes a page from the level list of pages. */
-static
+static __attribute__((nonnull))
 void
-btr_level_list_remove(
-/*==================*/
-	ulint		space,	/*!< in: space where removed */
-	ulint		zip_size,/*!< in: compressed page size in bytes
-				or 0 for uncompressed pages */
-	page_t*		page,	/*!< in: page to remove */
-	mtr_t*		mtr)	/*!< in: mtr */
+btr_level_list_remove_func(
+/*=======================*/
+	ulint			space,	/*!< in: space where removed */
+	ulint			zip_size,/*!< in: compressed page size in bytes
+					or 0 for uncompressed pages */
+	page_t*			page,	/*!< in/out: page to remove */
+#ifdef UNIV_SYNC_DEBUG
+	const dict_index_t*	index,	/*!< in: index tree */
+#endif /* UNIV_SYNC_DEBUG */
+	mtr_t*			mtr)	/*!< in/out: mini-transaction */
 {
 	ulint	prev_page_no;
 	ulint	next_page_no;
@@ -2862,7 +2890,7 @@ btr_level_list_remove(
 	if (prev_page_no != FIL_NULL) {
 		buf_block_t*	prev_block
 			= btr_block_get(space, zip_size, prev_page_no,
-					RW_X_LATCH, mtr);
+					RW_X_LATCH, index, mtr);
 		page_t*		prev_page
 			= buf_block_get_frame(prev_block);
 #ifdef UNIV_BTR_DEBUG
@@ -2879,7 +2907,7 @@ btr_level_list_remove(
 	if (next_page_no != FIL_NULL) {
 		buf_block_t*	next_block
 			= btr_block_get(space, zip_size, next_page_no,
-					RW_X_LATCH, mtr);
+					RW_X_LATCH, index, mtr);
 		page_t*		next_page
 			= buf_block_get_frame(next_block);
 #ifdef UNIV_BTR_DEBUG
@@ -3194,7 +3222,7 @@ btr_compress(
 	if (is_left) {

 		merge_block = btr_block_get(space, zip_size, left_page_no,
-					    RW_X_LATCH, mtr);
+					    RW_X_LATCH, index, mtr);
 		merge_page = buf_block_get_frame(merge_block);
 #ifdef UNIV_BTR_DEBUG
 		ut_a(btr_page_get_next(merge_page, mtr)
@@ -3203,7 +3231,7 @@ btr_compress(
 	} else if (right_page_no != FIL_NULL) {

 		merge_block = btr_block_get(space, zip_size, right_page_no,
-					    RW_X_LATCH, mtr);
+					    RW_X_LATCH, index, mtr);
 		merge_page = buf_block_get_frame(merge_block);
 #ifdef UNIV_BTR_DEBUG
 		ut_a(btr_page_get_prev(merge_page, mtr)
@@ -3292,7 +3320,7 @@ err_exit:
 		btr_search_drop_page_hash_index(block, index);

 		/* Remove the page from the level list */
-		btr_level_list_remove(space, zip_size, page, mtr);
+		btr_level_list_remove(space, zip_size, page, index, mtr);

 		btr_node_ptr_delete(index, block, mtr);
 		lock_update_merge_left(merge_block, orig_pred, block);
@@ -3345,7 +3373,7 @@ err_exit:
 #endif /* UNIV_BTR_DEBUG */

 		/* Remove the page from the level list */
-		btr_level_list_remove(space, zip_size, page, mtr);
+		btr_level_list_remove(space, zip_size, page, index, mtr);

 		/* Replace the address of the old child node (= page) with the
 		address of the merge page to the right */
@@ -3528,7 +3556,7 @@ btr_discard_page(

 	if (left_page_no != FIL_NULL) {
 		merge_block = btr_block_get(space, zip_size, left_page_no,
-					    RW_X_LATCH, mtr);
+					    RW_X_LATCH, index, mtr);
 		merge_page = buf_block_get_frame(merge_block);
 #ifdef UNIV_BTR_DEBUG
 		ut_a(btr_page_get_next(merge_page, mtr)
@@ -3536,7 +3564,7 @@ btr_discard_page(
 #endif /* UNIV_BTR_DEBUG */
 	} else if (right_page_no != FIL_NULL) {
 		merge_block = btr_block_get(space, zip_size, right_page_no,
-					    RW_X_LATCH, mtr);
+					    RW_X_LATCH, index, mtr);
 		merge_page = buf_block_get_frame(merge_block);
 #ifdef UNIV_BTR_DEBUG
 		ut_a(btr_page_get_prev(merge_page, mtr)
@@ -3571,7 +3599,7 @@ btr_discard_page(
 	btr_node_ptr_delete(index, block, mtr);

 	/* Remove the page from the level list */
-	btr_level_list_remove(space, zip_size, page, mtr);
+	btr_level_list_remove(space, zip_size, page, index, mtr);
 #ifdef UNIV_ZIP_DEBUG
 	{
 		page_zip_des_t*	merge_page_zip
@@ -4089,7 +4117,7 @@ loop:
 	if (right_page_no != FIL_NULL) {
 		const rec_t*	right_rec;
 		right_block = btr_block_get(space, zip_size, right_page_no,
-					    RW_X_LATCH, &mtr);
+					    RW_X_LATCH, index, &mtr);
 		right_page = buf_block_get_frame(right_block);
 		if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
 				  != page_get_page_no(page))) {
@@ -4315,7 +4343,7 @@ node_ptr_fails:
 		mtr_start(&mtr);

 		block = btr_block_get(space, zip_size, right_page_no,
-				      RW_X_LATCH, &mtr);
+				      RW_X_LATCH, index, &mtr);
 		page = buf_block_get_frame(block);

 		goto loop;

--- a/btr/btr0cur.c
+++ b/btr/btr0cur.c
--- a/btr/btr0pcur.c
+++ b/btr/btr0pcur.c
 /*****************************************************************************

-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -272,8 +272,10 @@ btr_pcur_restore_position_func(
 					file, line, mtr))) {
 			cursor->pos_state = BTR_PCUR_IS_POSITIONED;

-			buf_block_dbg_add_level(btr_pcur_get_block(cursor),
-						SYNC_TREE_NODE);
+			buf_block_dbg_add_level(
+				btr_pcur_get_block(cursor),
+				dict_index_is_ibuf(index)
+				? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);

 			if (cursor->rel_pos == BTR_PCUR_ON) {
 #ifdef UNIV_DEBUG
@@ -362,33 +364,6 @@ btr_pcur_restore_position_func(
 	return(FALSE);
 }

-/**************************************************************//**
-If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY,
-releases the page latch and bufferfix reserved by the cursor.
-NOTE! In the case of BTR_LEAF_MODIFY, there should not exist changes
-made by the current mini-transaction to the data protected by the
-cursor latch, as then the latch must not be released until mtr_commit. */
-UNIV_INTERN
-void
-btr_pcur_release_leaf(
-/*==================*/
-	btr_pcur_t*	cursor, /*!< in: persistent cursor */
-	mtr_t*		mtr)	/*!< in: mtr */
-{
-	buf_block_t*	block;
-
-	ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
-	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
-
-	block = btr_pcur_get_block(cursor);
-
-	btr_leaf_page_release(block, cursor->latch_mode, mtr);
-
-	cursor->latch_mode = BTR_NO_LATCHES;
-
-	cursor->pos_state = BTR_PCUR_WAS_POSITIONED;
-}
-
 /*********************************************************//**
 Moves the persistent cursor to the first record on the next page. Releases the
 latch on the current page, and bufferunfixes it. Note that there must not be
@@ -423,7 +398,8 @@ btr_pcur_move_to_next_page(
 	ut_ad(next_page_no != FIL_NULL);

 	next_block = btr_block_get(space, zip_size, next_page_no,
-				   cursor->latch_mode, mtr);
+				   cursor->latch_mode,
+				   btr_pcur_get_btr_cur(cursor)->index, mtr);
 	next_page = buf_block_get_frame(next_block);

 	if (srv_pass_corrupt_table && !next_page) {

--- a/btr/btr0sea.c
+++ b/btr/btr0sea.c
 /*****************************************************************************

-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, Google Inc.

 Portions of this file contain modifications contributed and copyrighted by
@@ -878,6 +878,7 @@ btr_search_guess_on_hash(
 	btr_pcur_t	pcur;
 #endif
 	ut_ad(index && info && tuple && cursor && mtr);
+	ut_ad(!dict_index_is_ibuf(index));
 	ut_ad((latch_mode == BTR_SEARCH_LEAF)
 	      || (latch_mode == BTR_MODIFY_LEAF));


--- a/buf/buf0buddy.c
+++ b/buf/buf0buddy.c
--- a/buf/buf0buf.c
+++ b/buf/buf0buf.c
--- a/buf/buf0flu.c
+++ b/buf/buf0flu.c
 /*****************************************************************************

-Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -534,7 +534,9 @@ buf_flush_remove(
 	case BUF_BLOCK_ZIP_DIRTY:
 		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
 		UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 		buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 		break;
 	case BUF_BLOCK_FILE_PAGE:
 		UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);

--- a/buf/buf0lru.c
+++ b/buf/buf0lru.c
--- a/buf/buf0rea.c
+++ b/buf/buf0rea.c
@@ -40,8 +40,10 @@ Created 11/5/1995 Heikki Tuuri
 #include "mysql/plugin.h"
 #include "mysql/service_thd_wait.h"

-/** The linear read-ahead area size */
-#define	BUF_READ_AHEAD_LINEAR_AREA	BUF_READ_AHEAD_AREA
+/** There must be at least this many pages in buf_pool in the area to start
+a random read-ahead */
+#define BUF_READ_AHEAD_RANDOM_THRESHOLD(b)	\
+				(5 + BUF_READ_AHEAD_AREA(b) / 8)

 /** If there are buf_pool->curr_size per the number below pending reads, then
 read-ahead is not done: this is to prevent flooding the buffer pool with
@@ -211,6 +213,172 @@ not_to_recover:
 	return(1);
 }

+/********************************************************************//**
+Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value!
+@return	number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_random(
+/*==================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes,
+				or 0 */
+	ulint	offset,		/*!< in: page number of a page which
+				the current thread wants to access */
+	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf
+				routine */
+	trx_t*	trx)
+{
+	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
+	ib_int64_t	tablespace_version;
+	ulint		recent_blocks	= 0;
+	ulint		ibuf_mode;
+	ulint		count;
+	ulint		low, high;
+	ulint		err;
+	ulint		i;
+	const ulint	buf_read_ahead_random_area
+				= BUF_READ_AHEAD_AREA(buf_pool);
+
+	if (!srv_random_read_ahead) {
+		/* Disabled by user */
+		return(0);
+	}
+
+	if (srv_startup_is_before_trx_rollback_phase) {
+		/* No read-ahead to avoid thread deadlocks */
+		return(0);
+	}
+
+	if (ibuf_bitmap_page(zip_size, offset)
+	    || trx_sys_hdr_page(space, offset)) {
+
+		/* If it is an ibuf bitmap page or trx sys hdr, we do
+		no read-ahead, as that could break the ibuf page access
+		order */
+
+		return(0);
+	}
+
+	/* Remember the tablespace version before we ask te tablespace size
+	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+	do not try to read outside the bounds of the tablespace! */
+
+	tablespace_version = fil_space_get_version(space);
+
+	low  = (offset / buf_read_ahead_random_area)
+		* buf_read_ahead_random_area;
+	high = (offset / buf_read_ahead_random_area + 1)
+		* buf_read_ahead_random_area;
+	if (high > fil_space_get_size(space)) {
+
+		high = fil_space_get_size(space);
+	}
+
+	buf_pool_mutex_enter(buf_pool);
+
+	if (buf_pool->n_pend_reads
+	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+		buf_pool_mutex_exit(buf_pool);
+
+		return(0);
+	}
+
+	/* Count how many blocks in the area have been recently accessed,
+	that is, reside near the start of the LRU list. */
+
+	for (i = low; i < high; i++) {
+		const buf_page_t* bpage =
+			buf_page_hash_get(buf_pool, space, i);
+
+		if (bpage
+		    && buf_page_is_accessed(bpage)
+		    && buf_page_peek_if_young(bpage)) {
+
+			recent_blocks++;
+
+			if (recent_blocks
+			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {
+
+				buf_pool_mutex_exit(buf_pool);
+				goto read_ahead;
+			}
+		}
+	}
+
+	buf_pool_mutex_exit(buf_pool);
+	/* Do nothing */
+	return(0);
+
+read_ahead:
+	/* Read all the suitable blocks within the area */
+
+	if (inside_ibuf) {
+		ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
+	} else {
+		ibuf_mode = BUF_READ_ANY_PAGE;
+	}
+
+	count = 0;
+
+	for (i = low; i < high; i++) {
+		/* It is only sensible to do read-ahead in the non-sync aio
+		mode: hence FALSE as the first parameter */
+
+		if (!ibuf_bitmap_page(zip_size, i)) {
+			count += buf_read_page_low(
+				&err, FALSE,
+				ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
+				space, zip_size, FALSE,
+				tablespace_version, i, trx);
+			if (err == DB_TABLESPACE_DELETED) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Warning: in random"
+					" readahead trying to access\n"
+					"InnoDB: tablespace %lu page %lu,\n"
+					"InnoDB: but the tablespace does not"
+					" exist or is just being dropped.\n",
+					(ulong) space, (ulong) i);
+			}
+		}
+	}
+
+	/* In simulated aio we wake the aio handler threads only after
+	queuing all aio requests, in native aio the following call does
+	nothing: */
+
+	os_aio_simulated_wake_handler_threads();
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints && (count > 0)) {
+		fprintf(stderr,
+			"Random read-ahead space %lu offset %lu pages %lu\n",
+			(ulong) space, (ulong) offset,
+			(ulong) count);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Read ahead is considered one I/O operation for the purpose of
+	LRU policy decision. */
+	buf_LRU_stat_inc_io();
+
+	buf_pool->stat.n_ra_pages_read_rnd += count;
+	srv_buf_pool_reads += count;
+	return(count);
+}
+
 /********************************************************************//**
 High-level function which reads a page asynchronously from a file to the
 buffer buf_pool if it is not already there. Sets the io_fix flag and sets
@@ -309,7 +477,7 @@ buf_read_ahead_linear(
 	ulint		err;
 	ulint		i;
 	const ulint	buf_read_ahead_linear_area
-		= BUF_READ_AHEAD_LINEAR_AREA(buf_pool);
+		= BUF_READ_AHEAD_AREA(buf_pool);
 	ulint		threshold;

 	if (!(srv_read_ahead & 2)) {

--- a/dict/dict0crea.c
+++ b/dict/dict0crea.c
 /*****************************************************************************

-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -898,7 +898,7 @@ dict_truncate_index_tree(
 	appropriate field in the SYS_INDEXES record: this mini-transaction
 	marks the B-tree totally truncated */

-	btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+	btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, NULL, mtr);

 	btr_free_root(space, zip_size, root_page_no, mtr);
 create:

--- a/dict/dict0dict.c
+++ b/dict/dict0dict.c
 /*****************************************************************************

-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -54,6 +54,7 @@ UNIV_INTERN dict_index_t*	dict_ind_compact;
 #include "row0merge.h"
 #include "m_ctype.h" /* my_isspace() */
 #include "ha_prototypes.h" /* innobase_strcasecmp(), innobase_casedn_str()*/
+#include "row0upd.h"
 #include "srv0start.h" /* SRV_LOG_SPACE_FIRST_ID */

 #include <ctype.h>
@@ -612,8 +613,7 @@ dict_table_get_on_id(
 {
 	dict_table_t*	table;

-	if (table_id <= DICT_FIELDS_ID
-	    || trx->dict_operation_lock_mode == RW_X_LATCH) {
+	if (trx->dict_operation_lock_mode == RW_X_LATCH) {

 		/* Note: An X latch implies that the transaction
 		already owns the dictionary mutex. */
@@ -1777,7 +1777,8 @@ undo_size_ok:

 	new_index->page = page_no;
 	rw_lock_create(index_tree_rw_lock_key, &new_index->lock,
-		       SYNC_INDEX_TREE);
+		       dict_index_is_ibuf(index)
+		       ? SYNC_IBUF_INDEX_TREE : SYNC_INDEX_TREE);

 	if (!UNIV_UNLIKELY(new_index->type & DICT_UNIVERSAL)) {

@@ -5522,6 +5523,189 @@ dict_close(void)
 	}
 }

+/**********************************************************************//**
+Find a table in dict_sys->table_LRU list with specified space id
+@return table if found, NULL if not */
+static
+dict_table_t*
+dict_find_table_by_space(
+/*=====================*/
+	ulint	space_id)		/*!< in: space ID */
+{
+	dict_table_t*   table;
+	ulint		num_item;
+	ulint		count = 0;
+
+	ut_ad(space_id > 0);
+
+	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	num_item =  UT_LIST_GET_LEN(dict_sys->table_LRU);
+
+	/* This function intentionally does not acquire mutex as it is used
+	by error handling code in deep call stack as last means to avoid
+	killing the server, so it worth to risk some consequencies for
+	the action. */
+	while (table && count < num_item) {
+		if (table->space == space_id) {
+			return(table);
+		}
+
+		table = UT_LIST_GET_NEXT(table_LRU, table);
+		count++;
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
+Flags a table with specified space_id corrupted in the data dictionary
+cache
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+dict_set_corrupted_by_space(
+/*========================*/
+	ulint	space_id)		/*!< in: space ID */
+{
+	dict_table_t*   table;
+
+	table = dict_find_table_by_space(space_id);
+
+	if (!table) {
+		return(FALSE);
+	}
+
+	/* mark the table->corrupted bit only, since the caller
+	could be too deep in the stack for SYS_INDEXES update */
+	table->corrupted = TRUE;
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Flags an index corrupted both in the data dictionary cache
+and in the SYS_INDEXES */
+UNIV_INTERN
+void
+dict_set_corrupted(
+/*===============*/
+	dict_index_t*	index)		/*!< in/out: index */
+{
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+	dict_index_t*	sys_index;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	byte*		buf;
+	const char*	status;
+	btr_cur_t	cursor;
+
+	ut_ad(index);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
+
+#ifdef UNIV_SYNC_DEBUG
+        ut_ad(sync_thread_levels_empty_except_dict());
+#endif
+
+	/* Mark the table as corrupted only if the clustered index
+	is corrupted */
+	if (dict_index_is_clust(index)) {
+		index->table->corrupted = TRUE;
+	}
+
+	if (UNIV_UNLIKELY(dict_index_is_corrupted(index))) {
+		/* The index was already flagged corrupted. */
+		ut_ad(index->table->corrupted);
+		return;
+	}
+
+	heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
+			       + sizeof(que_fork_t) + sizeof(upd_node_t)
+			       + sizeof(upd_t) + 12));
+	mtr_start(&mtr);
+	index->type |= DICT_CORRUPT;
+
+	sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes);
+
+	/* Find the index row in SYS_INDEXES */
+	tuple = dtuple_create(heap, 2);
+
+	dfield = dtuple_get_nth_field(tuple, 0);
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, index->table->id);
+	dfield_set_data(dfield, buf, 8);
+
+	dfield = dtuple_get_nth_field(tuple, 1);
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, index->id);
+	dfield_set_data(dfield, buf, 8);
+
+	dict_index_copy_types(tuple, sys_index, 2);
+
+	btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_GE,
+				    BTR_MODIFY_LEAF,
+				    &cursor, 0, __FILE__, __LINE__, &mtr);
+
+	if (cursor.up_match == dtuple_get_n_fields(tuple)) {
+		/* UPDATE SYS_INDEXES SET TYPE=index->type
+		WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */
+		ulint	len;
+		byte*	field	= rec_get_nth_field_old(
+			btr_cur_get_rec(&cursor),
+			DICT_SYS_INDEXES_TYPE_FIELD, &len);
+		if (len != 4) {
+			goto fail;
+		}
+		mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr);
+		status = "  InnoDB: Flagged corruption of ";
+	} else {
+fail:
+		status = "  InnoDB: Unable to flag corruption of ";
+	}
+
+	mtr_commit(&mtr);
+	mem_heap_free(heap);
+
+	ut_print_timestamp(stderr);
+	fputs(status, stderr);
+	dict_index_name_print(stderr, NULL, index);
+	putc('\n', stderr);
+}
+
+/**********************************************************************//**
+Flags an index corrupted in the data dictionary cache only. This
+is used mostly to mark a corrupted index when index's own dictionary
+is corrupted, and we force to load such index for repair purpose */
+UNIV_INTERN
+void
+dict_set_corrupted_index_cache_only(
+/*================================*/
+	dict_index_t*	index,		/*!< in/out: index */
+	dict_table_t*	table)		/*!< in/out: table */
+{
+	ut_ad(index);
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
+	ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
+
+	/* Mark the table as corrupted only if the clustered index
+	is corrupted */
+	if (dict_index_is_clust(index)) {
+		dict_table_t*	corrupt_table;
+
+		corrupt_table = table ? table : index->table;
+		ut_ad(!index->table || !table || index->table  == table);
+
+		if (corrupt_table) {
+			corrupt_table->corrupted = TRUE;
+		}
+	}
+
+	index->type |= DICT_CORRUPT;
+}
+
 /*************************************************************************
 set is_corrupt flag by space_id*/


--- a/dict/dict0load.c
+++ b/dict/dict0load.c
@@ -54,6 +54,11 @@ static const char* SYSTEM_TABLE_NAME[] = {
 	"SYS_FOREIGN_COLS",
 	"SYS_STATS"
 };
+
+/* If this flag is TRUE, then we will load the cluster index's (and tables')
+metadata even if it is marked as "corrupted". */
+UNIV_INTERN my_bool     srv_load_corrupted = FALSE;
+
 /****************************************************************//**
 Compare the name of an index column.
 @return	TRUE if the i'th column of index is 'name'. */
@@ -1396,6 +1401,9 @@ err_len:
 		goto err_len;
 	}
 	type = mach_read_from_4(field);
+	if (UNIV_UNLIKELY(type & (~0 << DICT_IT_BITS))) {
+		return("unknown SYS_INDEXES.TYPE bits");
+	}

 	field = rec_get_nth_field_old(rec, 7/*SPACE*/, &len);
 	if (UNIV_UNLIKELY(len != 4)) {
@@ -1495,16 +1503,47 @@ dict_load_indexes(
 			goto next_rec;
 		} else if (err_msg) {
 			fprintf(stderr, "InnoDB: %s\n", err_msg);
+			if (ignore_err & DICT_ERR_IGNORE_CORRUPT) {
+				goto next_rec;
+			}
 			error = DB_CORRUPTION;
 			goto func_exit;
 		}

 		ut_ad(index);

+		/* Check whether the index is corrupted */
+		if (dict_index_is_corrupted(index)) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: ", stderr);
+			dict_index_name_print(stderr, NULL, index);
+			fputs(" is corrupted\n", stderr);
+
+			if (!srv_load_corrupted
+			    && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)
+			    && dict_index_is_clust(index)) {
+				dict_mem_index_free(index);
+
+				error = DB_INDEX_CORRUPT;
+				goto func_exit;
+			} else {
+				/* We will load the index if
+				1) srv_load_corrupted is TRUE
+				2) ignore_err is set with
+				DICT_ERR_IGNORE_CORRUPT
+				3) if the index corrupted is a secondary
+				index */
+				ut_print_timestamp(stderr);
+				fputs("  InnoDB: load corrupted index ", stderr);
+				dict_index_name_print(stderr, NULL, index);
+				putc('\n', stderr);
+			}
+		}
+
 		/* We check for unsupported types first, so that the
 		subsequent checks are relevant for the supported types. */
-		if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE)) {
-
+		if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE
+				    | DICT_CORRUPT)) {
 			fprintf(stderr,
 				"InnoDB: Error: unknown type %lu"
 				" of index %s of table %s\n",
@@ -1525,9 +1564,14 @@ dict_load_indexes(
 				/* If caller can tolerate this error,
 				we will continue to load the index and
 				let caller deal with this error. However
-				mark the index and table corrupted */
-				index->corrupted = TRUE;
-				table->corrupted = TRUE;
+				mark the index and table corrupted. We
+				only need to mark such in the index
+				dictionary cache for such metadata corruption,
+				since we would always be able to set it
+				when loading the dictionary cache */
+				dict_set_corrupted_index_cache_only(
+					index, table);
+
 				fprintf(stderr,
 					"InnoDB: Index is corrupt but forcing"
 					" load into data dictionary\n");
@@ -1567,9 +1611,10 @@ corrupted:
 					index->name, table->name);

 				/* If the force recovery flag is set, and
-				if the failed index is not the primary index, we
-				will continue and open other indexes */
-				if (srv_force_recovery
+				if the failed index is not the clustered index,
+				we will continue and open other indexes */
+				if ((srv_force_recovery
+				     || srv_load_corrupted)
 				    && !dict_index_is_clust(index)) {
 					error = DB_SUCCESS;
 					goto next_rec;
@@ -1884,6 +1929,30 @@ err_exit:

 	err = dict_load_indexes(table, heap, ignore_err);

+	if (err == DB_INDEX_CORRUPT) {
+		/* Refuse to load the table if the table has a corrupted
+		cluster index */
+		if (!srv_load_corrupted) {
+			fprintf(stderr, "InnoDB: Error: Load table ");
+			ut_print_name(stderr, NULL, TRUE, table->name);
+			fprintf(stderr, " failed, the table has corrupted"
+					" clustered indexes. Turn on"
+					" 'innodb_force_load_corrupted'"
+					" to drop it\n");
+
+			dict_table_remove_from_cache(table);
+			table = NULL;
+			goto func_exit;
+		} else {
+			dict_index_t*	clust_index;
+			clust_index = dict_table_get_first_index(table);
+
+			if (dict_index_is_corrupted(clust_index)) {
+				table->corrupted = TRUE;
+			}
+		}
+	}
+
 	/* Initialize table foreign_child value. Its value could be
 	changed when dict_load_foreigns() is called below */
 	table->fk_max_recusive_level = 0;
@@ -1910,9 +1979,15 @@ err_exit:
 		index = dict_table_get_first_index(table);

 		if (!srv_force_recovery || !index
-		     || !dict_index_is_clust(index)) {
+		    || !dict_index_is_clust(index)) {
 			dict_table_remove_from_cache(table);
 			table = NULL;
+		} else if (dict_index_is_corrupted(index)) {
+
+			/* It is possible we force to load a corrupted
+			clustered index if srv_load_corrupted is set.
+			Mark the table as corrupted in this case */
+			table->corrupted = TRUE;
 		}
 	}
 #if 0
@@ -1939,6 +2014,7 @@ err_exit:
 		mutex_exit(&dict_foreign_err_mutex);
 	}
 #endif /* 0 */
+func_exit:
 	mem_heap_free(heap);

 	return(table);

--- a/fil/fil0fil.c
+++ b/fil/fil0fil.c
@@ -3541,7 +3541,7 @@ skip_info:
 										ULINT_UNDEFINED, &heap);
 								n_fields = rec_offs_n_fields(offsets);
 								if (!offset) {
-									offset = row_get_trx_id_offset(rec, index, offsets);
+									offset = row_get_trx_id_offset(index, offsets);
 								}
 								trx_write_trx_id(rec + offset, 1);

@@ -3826,7 +3826,7 @@ convert_err_exit:
 						ULINT_UNDEFINED, &heap);
 				n_fields = rec_offs_n_fields(offsets);
 				if (!offset) {
-					offset = row_get_trx_id_offset(rec, index, offsets);
+					offset = row_get_trx_id_offset(index, offsets);
 				}
 				trx_write_trx_id(rec + offset, 1);


--- a/handler/ha_innodb.cc
+++ b/handler/ha_innodb.cc
--- a/handler/handler0alter.cc
+++ b/handler/handler0alter.cc
@@ -695,6 +695,10 @@ ha_innobase::add_index(
 	possible adaptive hash latch to avoid deadlocks of threads. */
 	trx_search_latch_release_if_reserved(prebuilt->trx);

+	if (prebuilt->trx->fake_changes) {
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
+
 	/* Check if the index name is reserved. */
 	if (innobase_index_name_is_reserved(user_thd, key_info, num_of_keys)) {
 		DBUG_RETURN(-1);
@@ -732,6 +736,13 @@ ha_innobase::add_index(
 	/* Create a background transaction for the operations on
 	the data dictionary tables. */
 	trx = innobase_trx_allocate(user_thd);
+	if (trx->fake_changes) {
+		mem_heap_free(heap);
+		trx_general_rollback_for_mysql(trx, NULL);
+		trx_free_for_mysql(trx);
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
+
 	trx_start_if_not_started(trx);

 	/* Create table containing all indexes to be built in this
@@ -1092,6 +1103,10 @@ ha_innobase::prepare_drop_index(
 	trx_search_latch_release_if_reserved(prebuilt->trx);
 	trx = prebuilt->trx;

+	if (trx->fake_changes) {
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
+
 	/* Test and mark all the indexes to be dropped */

 	row_mysql_lock_data_dictionary(trx);
@@ -1296,6 +1311,12 @@ ha_innobase::final_drop_index(
 	/* Create a background transaction for the operations on
 	the data dictionary tables. */
 	trx = innobase_trx_allocate(user_thd);
+	if (trx->fake_changes) {
+		trx_general_rollback_for_mysql(trx, NULL);
+		trx_free_for_mysql(trx);
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
+
 	trx_start_if_not_started(trx);

 	/* Flag this transaction as a dictionary operation, so that

--- a/handler/i_s.cc
+++ b/handler/i_s.cc
@@ -645,7 +645,11 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_trx =

 	/* reserved for dependency checking */
 	/* void* */
-	STRUCT_FLD(__reserved1, NULL)
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
 };

 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */
@@ -911,7 +915,11 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_locks =

 	/* reserved for dependency checking */
 	/* void* */
-	STRUCT_FLD(__reserved1, NULL)
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
 };

 /* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */
@@ -1094,7 +1102,11 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_lock_waits =

 	/* reserved for dependency checking */
 	/* void* */
-	STRUCT_FLD(__reserved1, NULL)
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
 };

 /*******************************************************************//**
@@ -1427,7 +1439,11 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp =

 	/* reserved for dependency checking */
 	/* void* */
-	STRUCT_FLD(__reserved1, NULL)
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
 };

 UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp_reset =
@@ -1477,7 +1493,11 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp_reset =

 	/* reserved for dependency checking */
 	/* void* */
-	STRUCT_FLD(__reserved1, NULL)
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
 };

 /* Fields of the dynamic table information_schema.innodb_cmpmem. */
@@ -1720,7 +1740,11 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem =

 	/* reserved for dependency checking */
 	/* void* */
-	STRUCT_FLD(__reserved1, NULL)
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
 };

 UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem_reset =
@@ -1770,7 +1794,11 @@ UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem_reset =

 	/* reserved for dependency checking */
 	/* void* */
-	STRUCT_FLD(__reserved1, NULL)
+	STRUCT_FLD(__reserved1, NULL),
+
+	/* Plugin flags */
+	/* unsigned long */
+	STRUCT_FLD(flags, 0UL),
 };

 /*******************************************************************//**

--- a/ibuf/ibuf0ibuf.c
+++ b/ibuf/ibuf0ibuf.c
 /*****************************************************************************

-Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -402,7 +402,7 @@ ibuf_tree_root_get(
 	block = buf_page_get(
 		IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, RW_X_LATCH, mtr);

-	buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+	buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);

 	root = buf_block_get_frame(block);

@@ -590,7 +590,7 @@ ibuf_init_at_db_start(void)
 		block = buf_page_get(
 			IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO,
 			RW_X_LATCH, &mtr);
-		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);

 		root = buf_block_get_frame(block);
 	}
@@ -2251,16 +2251,17 @@ ibuf_add_free_page(void)
 	} else {
 		buf_block_t*	block = buf_page_get(
 			IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);
-		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);

-		page = buf_block_get_frame(block);
-	}
+		ibuf_enter(&mtr);

-	ibuf_enter(&mtr);
+		mutex_enter(&ibuf_mutex);

-	mutex_enter(&ibuf_mutex);
+		root = ibuf_tree_root_get(&mtr);

-	root = ibuf_tree_root_get(&mtr);
+		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+
+		page = buf_block_get_frame(block);
+	}

 	/* Add the page to the free list and update the ibuf size data */

@@ -2374,8 +2375,7 @@ ibuf_remove_free_page(void)
 		block = buf_page_get(
 			IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);

-		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
-
+		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);

 		page = buf_block_get_frame(block);
 	}
@@ -3066,7 +3066,7 @@ ibuf_get_volume_buffered(
 			IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH,
 			mtr);

-		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);


 		prev_page = buf_block_get_frame(block);
@@ -3139,7 +3139,7 @@ count_later:
 			IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH,
 			mtr);

-		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);


 		next_page = buf_block_get_frame(block);
@@ -3377,7 +3377,7 @@ ibuf_set_entry_counter(
 				IBUF_SPACE_ID, 0, prev_page_no,
 				RW_X_LATCH, mtr);

-			buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+			buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);

 			prev_page = buf_block_get_frame(block);

@@ -3496,6 +3496,8 @@ ibuf_insert_low(

 	ut_a(trx_sys_multiple_tablespace_format);

+	ut_ad(!(thr_get_trx(thr)->fake_changes));
+
 	do_merge = FALSE;

 	/* Perform dirty reads of ibuf->size and ibuf->max_size, to
@@ -4465,6 +4467,7 @@ ibuf_merge_or_delete_for_page(
 	ut_ad(!block || buf_block_get_space(block) == space);
 	ut_ad(!block || buf_block_get_page_no(block) == page_no);
 	ut_ad(!block || buf_block_get_zip_size(block) == zip_size);
+	ut_ad(!block || buf_block_get_io_fix(block) == BUF_IO_READ);

 	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE
 	    || trx_sys_hdr_page(space, page_no)) {
@@ -4617,7 +4620,13 @@ loop:

 		ut_a(success);

-		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+		/* This is a user page (secondary index leaf page),
+		but we pretend that it is a change buffer page in
+		order to obey the latching order. This should be OK,
+		because buffered changes are applied immediately while
+		the block is io-fixed. Other threads must not try to
+		latch an io-fixed block. */
+		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
 	}

 	/* Position pcur in the insert buffer at the first entry for this
@@ -4721,7 +4730,12 @@ loop:
 					__FILE__, __LINE__, &mtr);
 				ut_a(success);

-				buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+				/* This is a user page (secondary
+				index leaf page), but it should be OK
+				to use too low latching order for it,
+				as the block is io-fixed. */
+				buf_block_dbg_add_level(
+					block, SYNC_IBUF_TREE_NODE);

 				if (!ibuf_restore_pos(space, page_no,
 						      search_tuple,

--- a/include/btr0btr.h
+++ b/include/btr0btr.h
@@ -199,26 +199,45 @@ btr_block_get_func(
 	ulint		mode,		/*!< in: latch mode */
 	const char*	file,		/*!< in: file name */
 	ulint		line,		/*!< in: line where called */
-	mtr_t*		mtr)		/*!< in/out: mtr */
-	__attribute__((nonnull));
+# ifdef UNIV_SYNC_DEBUG
+	const dict_index_t*	index,	/*!< in: index tree, may be NULL
+					if it is not an insert buffer tree */
+# endif /* UNIV_SYNC_DEBUG */
+	mtr_t*		mtr);		/*!< in/out: mini-transaction */
+# ifdef UNIV_SYNC_DEBUG
+/** Gets a buffer page and declares its latching order level.
+@param space	tablespace identifier
+@param zip_size	compressed page size in bytes or 0 for uncompressed pages
+@param page_no	page number
+@param mode	latch mode
+@param index	index tree, may be NULL if not the insert buffer tree
+@param mtr	mini-transaction handle
+@return the block descriptor */
+#  define btr_block_get(space,zip_size,page_no,mode,index,mtr)	\
+	btr_block_get_func(space,zip_size,page_no,mode,		\
+			   __FILE__,__LINE__,index,mtr)
+# else /* UNIV_SYNC_DEBUG */
 /** Gets a buffer page and declares its latching order level.
 @param space	tablespace identifier
 @param zip_size	compressed page size in bytes or 0 for uncompressed pages
 @param page_no	page number
 @param mode	latch mode
+@param idx	index tree, may be NULL if not the insert buffer tree
 @param mtr	mini-transaction handle
 @return the block descriptor */
-# define btr_block_get(space,zip_size,page_no,mode,mtr) \
+#  define btr_block_get(space,zip_size,page_no,mode,idx,mtr)		\
 	btr_block_get_func(space,zip_size,page_no,mode,__FILE__,__LINE__,mtr)
+# endif /* UNIV_SYNC_DEBUG */
 /** Gets a buffer page and declares its latching order level.
 @param space	tablespace identifier
 @param zip_size	compressed page size in bytes or 0 for uncompressed pages
 @param page_no	page number
 @param mode	latch mode
+@param idx	index tree, may be NULL if not the insert buffer tree
 @param mtr	mini-transaction handle
 @return the uncompressed page frame */
-# define btr_page_get(space,zip_size,page_no,mode,mtr) \
-	buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,mtr))
+# define btr_page_get(space,zip_size,page_no,mode,idx,mtr)		\
+	buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,idx,mtr))
 /**************************************************************//**
 Sets the index id field of a page. */
 UNIV_INLINE
@@ -389,8 +408,7 @@ btr_free_root(
 	ulint	zip_size,	/*!< in: compressed page size in bytes
 				or 0 for uncompressed pages */
 	ulint	root_page_no,	/*!< in: root page number */
-	mtr_t*	mtr);		/*!< in: a mini-transaction which has already
-				been started */
+	mtr_t*	mtr);		/*!< in/out: mini-transaction */
 /*************************************************************//**
 Makes tree one level higher by splitting the root, and inserts
 the tuple. It is assumed that mtr contains an x-latch on the tree.

--- a/include/btr0btr.ic
+++ b/include/btr0btr.ic
 /*****************************************************************************

-Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -48,6 +48,10 @@ btr_block_get_func(
 	ulint		mode,		/*!< in: latch mode */
 	const char*	file,		/*!< in: file name */
 	ulint		line,		/*!< in: line where called */
+#ifdef UNIV_SYNC_DEBUG
+	const dict_index_t*	index,	/*!< in: index tree, may be NULL
+					if it is not an insert buffer tree */
+#endif /* UNIV_SYNC_DEBUG */
 	mtr_t*		mtr)		/*!< in/out: mtr */
 {
 	buf_block_t*	block;
@@ -59,7 +63,9 @@ btr_block_get_func(

 	if (block && mode != RW_NO_LATCH) {

-		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+		buf_block_dbg_add_level(
+			block, index != NULL && dict_index_is_ibuf(index)
+			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
 	}

 	return(block);

--- a/include/btr0pcur.h
+++ b/include/btr0pcur.h
 /*****************************************************************************

-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -244,18 +244,6 @@ btr_pcur_restore_position_func(
 	mtr_t*		mtr);		/*!< in: mtr */
 #define btr_pcur_restore_position(l,cur,mtr)				\
 	btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr)
-/**************************************************************//**
-If the latch mode of the cursor is BTR_LEAF_SEARCH or BTR_LEAF_MODIFY,
-releases the page latch and bufferfix reserved by the cursor.
-NOTE! In the case of BTR_LEAF_MODIFY, there should not exist changes
-made by the current mini-transaction to the data protected by the
-cursor latch, as then the latch must not be released until mtr_commit. */
-UNIV_INTERN
-void
-btr_pcur_release_leaf(
-/*==================*/
-	btr_pcur_t*	cursor, /*!< in: persistent cursor */
-	mtr_t*		mtr);	/*!< in: mtr */
 /*********************************************************//**
 Gets the rel_pos field for a cursor whose position has been stored.
 @return	BTR_PCUR_ON, ... */
@@ -266,10 +254,9 @@ btr_pcur_get_rel_pos(
 	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
 /**************************************************************//**
 Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
-that is, the cursor becomes detached. If there have been modifications
-to the page where pcur is positioned, this can be used instead of
-btr_pcur_release_leaf. Function btr_pcur_store_position should be used
-before calling this, if restoration of cursor is wanted later. */
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
 UNIV_INLINE
 void
 btr_pcur_commit_specify_mtr(

--- a/include/btr0pcur.ic
+++ b/include/btr0pcur.ic
 /*****************************************************************************

-Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -369,10 +369,9 @@ btr_pcur_move_to_next(

 /**************************************************************//**
 Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
-that is, the cursor becomes detached. If there have been modifications
-to the page where pcur is positioned, this can be used instead of
-btr_pcur_release_leaf. Function btr_pcur_store_position should be used
-before calling this, if restoration of cursor is wanted later. */
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of cursor is wanted later. */
 UNIV_INLINE
 void
 btr_pcur_commit_specify_mtr(

--- a/include/buf0buddy.h
+++ b/include/buf0buddy.h
 /*****************************************************************************

-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -37,39 +37,39 @@ Created December 2006 by Marko Makela
 /**********************************************************************//**
 Allocate a block.  The thread calling this function must hold
 buf_pool->mutex and must not hold buf_pool->zip_mutex or any
-block->mutex.  The buf_pool->mutex may only be released and reacquired
-if lru != NULL.  This function should only be used for allocating
-compressed page frames or control blocks (buf_page_t).  Allocated
-control blocks must be properly initialized immediately after
-buf_buddy_alloc() has returned the memory, before releasing
-buf_pool->mutex.
-@return	allocated block, possibly NULL if lru == NULL */
+block->mutex.  The buf_pool->mutex may be released and reacquired.
+This function should only be used for allocating compressed page frames.
+@return	allocated block, never NULL */
 UNIV_INLINE
-void*
+byte*
 buf_buddy_alloc(
 /*============*/
-	buf_pool_t*	buf_pool,
-			/*!< buffer pool in which the block resides */
-	ulint	size,	/*!< in: block size, up to UNIV_PAGE_SIZE */
-	ibool*	lru,	/*!< in: pointer to a variable that will be assigned
-			TRUE if storage was allocated from the LRU list
-			and buf_pool->mutex was temporarily released,
-			or NULL if the LRU list should not be used */
-	ibool	have_page_hash_mutex)
-	__attribute__((malloc));
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
+					the page resides */
+	ulint		size,		/*!< in: compressed page size
+					(between PAGE_ZIP_MIN_SIZE and
+					UNIV_PAGE_SIZE) */
+	ibool*		lru,		/*!< in: pointer to a variable
+					that will be assigned TRUE if
+				       	storage was allocated from the
+				       	LRU list and buf_pool->mutex was
+				       	temporarily released */
+	ibool		have_page_hash_mutex)
+	__attribute__((malloc, nonnull));

 /**********************************************************************//**
-Release a block. */
+Deallocate a block. */
 UNIV_INLINE
 void
 buf_buddy_free(
 /*===========*/
-	buf_pool_t*	buf_pool,
-			/*!< buffer pool in which the block resides */
-	void*	buf,	/*!< in: block to be freed, must not be
-			pointed to by the buffer pool */
-	ulint	size,	/*!< in: block size, up to UNIV_PAGE_SIZE */
-	ibool	have_page_hash_mutex)
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
+					the block resides */
+	void*		buf,		/*!< in: block to be freed, must not
+					be pointed to by the buffer pool */
+	ulint		size,		/*!< in: block size,
+					up to UNIV_PAGE_SIZE */
+	ibool		have_page_hash_mutex)
 	__attribute__((nonnull));

 #ifndef UNIV_NONINL

--- a/include/buf0buddy.ic
+++ b/include/buf0buddy.ic
 /*****************************************************************************

-Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -36,22 +36,22 @@ Created December 2006 by Marko Makela
 /**********************************************************************//**
 Allocate a block.  The thread calling this function must hold
 buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex.
-The buf_pool->mutex may only be released and reacquired if lru != NULL.
-@return	allocated block, possibly NULL if lru==NULL */
+The buf_pool_mutex may be released and reacquired.
+@return	allocated block, never NULL */
 UNIV_INTERN
 void*
 buf_buddy_alloc_low(
 /*================*/
-	buf_pool_t*	buf_pool,
-			/*!< in: buffer pool in which the page resides */
-	ulint	i,	/*!< in: index of buf_pool->zip_free[],
-			or BUF_BUDDY_SIZES */
-	ibool*	lru,	/*!< in: pointer to a variable that will be assigned
-			TRUE if storage was allocated from the LRU list
-			and buf_pool->mutex was temporarily released,
-			or NULL if the LRU list should not be used */
-	ibool	have_page_hash_mutex)
-	__attribute__((malloc));
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
+	ulint		i,		/*!< in: index of buf_pool->zip_free[],
+					or BUF_BUDDY_SIZES */
+	ibool*		lru,		/*!< in: pointer to a variable that
+					will be assigned TRUE if storage was
+					allocated from the LRU list and
+					buf_pool->mutex was temporarily
+					released */
+	ibool		have_page_hash_mutex)
+	__attribute__((malloc, nonnull));

 /**********************************************************************//**
 Deallocate a block. */
@@ -79,6 +79,8 @@ buf_buddy_get_slot(
 	ulint	i;
 	ulint	s;

+	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+
 	for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
 	}

@@ -89,32 +91,32 @@ buf_buddy_get_slot(
 /**********************************************************************//**
 Allocate a block.  The thread calling this function must hold
 buf_pool->mutex and must not hold buf_pool->zip_mutex or any
-block->mutex.  The buf_pool->mutex may only be released and reacquired
-if lru != NULL.  This function should only be used for allocating
-compressed page frames or control blocks (buf_page_t).  Allocated
-control blocks must be properly initialized immediately after
-buf_buddy_alloc() has returned the memory, before releasing
-buf_pool->mutex.
-@return	allocated block, possibly NULL if lru == NULL */
+block->mutex.  The buf_pool->mutex may be released and reacquired.
+This function should only be used for allocating compressed page frames.
+@return	allocated block, never NULL */
 UNIV_INLINE
-void*
+byte*
 buf_buddy_alloc(
 /*============*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool in which
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
 					the page resides */
-	ulint		size,		/*!< in: block size, up to
-					UNIV_PAGE_SIZE */
+	ulint		size,		/*!< in: compressed page size
+					(between PAGE_ZIP_MIN_SIZE and
+					UNIV_PAGE_SIZE) */
 	ibool*		lru,		/*!< in: pointer to a variable
 					that will be assigned TRUE if
 				       	storage was allocated from the
 				       	LRU list and buf_pool->mutex was
-				       	temporarily released, or NULL if
-				       	the LRU list should not be used */
+				       	temporarily released */
 	ibool		have_page_hash_mutex)
 {
 	//ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size <= UNIV_PAGE_SIZE);

-	return(buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), lru, have_page_hash_mutex));
+	return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size),
+					   lru, have_page_hash_mutex));
 }

 /**********************************************************************//**
@@ -123,14 +125,18 @@ UNIV_INLINE
 void
 buf_buddy_free(
 /*===========*/
-	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
-	void*		buf,		/*!< in: block to be freed, must not be
-					pointed to by the buffer pool */
-	ulint		size,		/*!< in: block size, up to
-					UNIV_PAGE_SIZE */
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool in which
+					the block resides */
+	void*		buf,		/*!< in: block to be freed, must not
+					be pointed to by the buffer pool */
+	ulint		size,		/*!< in: block size,
+					up to UNIV_PAGE_SIZE */
 	ibool		have_page_hash_mutex)
 {
 	//ut_ad(buf_pool_mutex_own(buf_pool));
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size <= UNIV_PAGE_SIZE);

 	if (!have_page_hash_mutex) {
 		mutex_enter(&buf_pool->LRU_list_mutex);

--- a/include/buf0buf.h
+++ b/include/buf0buf.h
@@ -148,6 +148,8 @@ struct buf_pool_info_struct{
 	ulint	n_pages_created;	/*!< buf_pool->n_pages_created */
 	ulint	n_pages_written;	/*!< buf_pool->n_pages_written */
 	ulint	n_page_gets;		/*!< buf_pool->n_page_gets */
+	ulint	n_ra_pages_read_rnd;	/*!< buf_pool->n_ra_pages_read_rnd,
+					number of pages readahead */
 	ulint	n_ra_pages_read;	/*!< buf_pool->n_ra_pages_read, number
 					of pages readahead */
 	ulint	n_ra_pages_evicted;	/*!< buf_pool->n_ra_pages_evicted,
@@ -172,6 +174,8 @@ struct buf_pool_info_struct{
 					last printout */

 	/* Statistics about read ahead algorithm.  */
+	double	pages_readahead_rnd_rate;/*!< random readahead rate in pages per
+					second */
 	double	pages_readahead_rate;	/*!< readahead rate in pages per
 					second */
 	double	pages_evicted_rate;	/*!< rate of readahead page evicted
@@ -261,12 +265,6 @@ buf_relocate(
 				BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
 	buf_page_t*	dpage)	/*!< in/out: destination control block */
 	__attribute__((nonnull));
-/********************************************************************//**
-Resizes the buffer pool. */
-UNIV_INTERN
-void
-buf_pool_resize(void);
-/*=================*/
 /*********************************************************************//**
 Gets the current size of buffer buf_pool in bytes.
 @return	size in bytes */
@@ -289,6 +287,23 @@ UNIV_INTERN
 ib_uint64_t
 buf_pool_get_oldest_modification(void);
 /*==================================*/
+/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed. In case
+of failure we assert in this function. */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+	__attribute__((malloc));
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+	buf_page_t*	bpage)	/*!< in: bpage descriptor to free. */
+	__attribute__((nonnull));
+
 /********************************************************************//**
 Allocates a buffer block.
 @return	own: the allocated block, in state BUF_BLOCK_MEMORY */
@@ -546,6 +561,18 @@ buf_block_get_freed_page_clock(
 	__attribute__((pure));

 /********************************************************************//**
+Tells if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve buffer pool
+mutex.
+@return	TRUE if block is close to MRU end of LRU */
+UNIV_INLINE
+ibool
+buf_page_peek_if_young(
+/*===================*/
+	const buf_page_t*	bpage);	/*!< in: block */
+/********************************************************************//**
 Recommends a move of a block to the start of the LRU list if there is danger
 of dropping from the buffer pool. NOTE: does not reserve the buffer pool
 mutex.
@@ -1233,7 +1260,7 @@ ulint
 buf_get_free_list_len(void);
 /*=======================*/

-/********************************************************************
+/********************************************************************//**
 Determine if a block is a sentinel for a buffer pool watch.
 @return	TRUE if a sentinel for a buffer pool watch, FALSE if not */
 UNIV_INTERN
@@ -1622,6 +1649,8 @@ struct buf_pool_stat_struct{
 	ulint	n_pages_written;/*!< number write operations */
 	ulint	n_pages_created;/*!< number of pages created
 				in the pool with no read */
+	ulint	n_ra_pages_read_rnd;/*!< number of pages read in
+				as part of random read ahead */
 	ulint	n_ra_pages_read;/*!< number of pages read in
 				as part of read ahead */
 	ulint	n_ra_pages_evicted;/*!< number of read ahead
@@ -1766,7 +1795,7 @@ struct buf_pool_struct{
 	UT_LIST_BASE_NODE_T(buf_page_t) LRU;
 					/*!< base node of the LRU list */
 	buf_page_t*	LRU_old;	/*!< pointer to the about
-					buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+					LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
 					oldest blocks in the LRU list;
 					NULL if LRU length less than
 					BUF_LRU_OLD_MIN_LEN;
@@ -1790,8 +1819,10 @@ struct buf_pool_struct{
 	frames and buf_page_t descriptors of blocks that exist
 	in the buffer pool only in compressed form. */
 	/* @{ */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 	UT_LIST_BASE_NODE_T(buf_page_t)	zip_clean;
 					/*!< unmodified compressed pages */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
 	UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES_MAX];
 					/*!< buddy free lists */


--- a/include/buf0buf.ic
+++ b/include/buf0buf.ic
@@ -124,6 +124,29 @@ buf_block_get_freed_page_clock(
 	return(buf_page_get_freed_page_clock(&block->page));
 }

+/********************************************************************//**
+Tells if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve buffer pool
+mutex.
+@return	TRUE if block is close to MRU end of LRU */
+UNIV_INLINE
+ibool
+buf_page_peek_if_young(
+/*===================*/
+	const buf_page_t*	bpage)	/*!< in: block */
+{
+	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
+
+	/* FIXME: bpage->freed_page_clock is 31 bits */
+	return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
+	       < ((ulint) bpage->freed_page_clock
+		  + (buf_pool->curr_size
+		     * (BUF_LRU_OLD_RATIO_DIV - buf_pool->LRU_old_ratio)
+		     / (BUF_LRU_OLD_RATIO_DIV * 4))));
+}
+
 /********************************************************************//**
 Recommends a move of a block to the start of the LRU list if there is danger
 of dropping from the buffer pool. NOTE: does not reserve the buffer pool
@@ -154,12 +177,7 @@ buf_page_peek_if_too_old(
 		buf_pool->stat.n_pages_not_made_young++;
 		return(FALSE);
 	} else {
-		/* FIXME: bpage->freed_page_clock is 31 bits */
-		return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
-		       > ((ulint) bpage->freed_page_clock
-			  + (buf_pool->curr_size
-			     * (BUF_LRU_OLD_RATIO_DIV - buf_pool->LRU_old_ratio)
-			     / (BUF_LRU_OLD_RATIO_DIV * 4))));
+		return(!buf_page_peek_if_young(bpage));
 	}
 }

@@ -791,6 +809,35 @@ buf_block_get_lock_hash_val(
 	return(block->lock_hash_val);
 }

+/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed. In case
+of failure we assert in this function.
+@return: the allocated descriptor. */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+{
+	buf_page_t*	bpage;
+
+	bpage = (buf_page_t*) ut_malloc(sizeof *bpage);
+	ut_d(memset(bpage, 0, sizeof *bpage));
+	UNIV_MEM_ALLOC(bpage, sizeof *bpage);
+
+	return(bpage);
+}
+
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+	buf_page_t*	bpage)	/*!< in: bpage descriptor to free. */
+{
+	ut_free(bpage);
+}
+
 /********************************************************************//**
 Frees a buffer block which does not contain a file page. */
 UNIV_INLINE

--- a/include/buf0lru.h
+++ b/include/buf0lru.h
 /*****************************************************************************

-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -30,18 +30,6 @@ Created 11/5/1995 Heikki Tuuri
 #include "ut0byte.h"
 #include "buf0types.h"

-/** The return type of buf_LRU_free_block() */
-enum buf_lru_free_block_status {
-	/** freed */
-	BUF_LRU_FREED = 0,
-	/** not freed because the caller asked to remove the
-	uncompressed frame but the control block cannot be
-	relocated */
-	BUF_LRU_CANNOT_RELOCATE,
-	/** not freed because of some other reason */
-	BUF_LRU_NOT_FREED
-};
-
 /******************************************************************//**
 Tries to remove LRU flushed blocks from the end of the LRU list and put them
 to the free list. This is beneficial for the efficiency of the insert buffer
@@ -85,6 +73,7 @@ void
 buf_LRU_invalidate_tablespace(
 /*==========================*/
 	ulint	id);	/*!< in: space id */
+
 /******************************************************************//**
 */
 UNIV_INTERN
@@ -92,6 +81,7 @@ void
 buf_LRU_mark_space_was_deleted(
 /*===========================*/
 	ulint	id);	/*!< in: space id */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /********************************************************************//**
 Insert a compressed block into buf_pool->zip_clean in the LRU order. */
 UNIV_INTERN
@@ -99,22 +89,22 @@ void
 buf_LRU_insert_zip_clean(
 /*=====================*/
 	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

 /******************************************************************//**
 Try to free a block.  If bpage is a descriptor of a compressed-only
 page, the descriptor object will be freed as well.

-NOTE: If this function returns BUF_LRU_FREED, it will temporarily
+NOTE: If this function returns TRUE, it will temporarily
 release buf_pool->mutex.  Furthermore, the page frame will no longer be
 accessible via bpage.

 The caller must hold buf_pool->mutex and buf_page_get_mutex(bpage) and
 release these two mutexes after the call.  No other
 buf_page_get_mutex() may be held when calling this function.
-@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or
-BUF_LRU_NOT_FREED otherwise. */
+@return TRUE if freed, FALSE otherwise. */
 UNIV_INTERN
-enum buf_lru_free_block_status
+ibool
 buf_LRU_free_block(
 /*===============*/
 	buf_page_t*	bpage,	/*!< in: block to be freed */
@@ -204,7 +194,7 @@ buf_LRU_make_block_old(
 /*===================*/
 	buf_page_t*	bpage);	/*!< in: control block */
 /**********************************************************************//**
-Updates buf_LRU_old_ratio.
+Updates buf_pool->LRU_old_ratio.
 @return	updated old_pct */
 UNIV_INTERN
 ulint
@@ -213,7 +203,7 @@ buf_LRU_old_ratio_update(
 	uint	old_pct,/*!< in: Reserve this percentage of
 			the buffer pool for "old" blocks. */
 	ibool	adjust);/*!< in: TRUE=adjust the LRU list;
-			FALSE=just assign buf_LRU_old_ratio
+			FALSE=just assign buf_pool->LRU_old_ratio
 			during the initialization of InnoDB */
 /********************************************************************//**
 Update the historical stats that we are collecting for LRU eviction
@@ -235,6 +225,17 @@ ibool
 buf_LRU_file_restore(void);
 /*======================*/

+/******************************************************************//**
+Remove one page from LRU list and put it to free list */
+UNIV_INTERN
+void
+buf_LRU_free_one_page(
+/*==================*/
+	buf_page_t*	bpage)	/*!< in/out: block, must contain a file page and
+				be in a state where it can be freed; there
+				may or may not be a hash index to the page */
+	__attribute__((nonnull));
+
 #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
 /**********************************************************************//**
 Validates the LRU list.
@@ -254,18 +255,15 @@ buf_LRU_print(void);
 #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */

 /** @name Heuristics for detecting index scan @{ */
-/** Reserve this much/BUF_LRU_OLD_RATIO_DIV of the buffer pool for
-"old" blocks.  Protected by buf_pool->mutex. */
-extern uint	buf_LRU_old_ratio;
-/** The denominator of buf_LRU_old_ratio. */
+/** The denominator of buf_pool->LRU_old_ratio. */
 #define BUF_LRU_OLD_RATIO_DIV	1024
-/** Maximum value of buf_LRU_old_ratio.
+/** Maximum value of buf_pool->LRU_old_ratio.
 @see buf_LRU_old_adjust_len
-@see buf_LRU_old_ratio_update */
+@see buf_pool->LRU_old_ratio_update */
 #define BUF_LRU_OLD_RATIO_MAX	BUF_LRU_OLD_RATIO_DIV
-/** Minimum value of buf_LRU_old_ratio.
+/** Minimum value of buf_pool->LRU_old_ratio.
 @see buf_LRU_old_adjust_len
-@see buf_LRU_old_ratio_update
+@see buf_pool->LRU_old_ratio_update
 The minimum must exceed
 (BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */
 #define BUF_LRU_OLD_RATIO_MIN	51

--- a/include/buf0rea.h
+++ b/include/buf0rea.h
@@ -76,6 +76,32 @@ buf_read_page(
 	ulint	offset, /*!< in: page number */
 	trx_t*	trx);
 /********************************************************************//**
+Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value!
+@return	number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_random(
+/*==================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes,
+				or 0 */
+	ulint	offset,		/*!< in: page number of a page which
+				the current thread wants to access */
+	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf
+				routine */
+	trx_t*	trx);
+/********************************************************************//**
 Applies linear read-ahead if in the buf_pool the page is a border page of
 a linear read-ahead area and all the pages in the area have been accessed.
 Does not read any page if the read-ahead mechanism is not activated. Note

--- a/include/buf0types.h
+++ b/include/buf0types.h
 /*****************************************************************************

-Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved

 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -26,6 +26,8 @@ Created 11/17/1995 Heikki Tuuri
 #ifndef buf0types_h
 #define buf0types_h

+#include "page0types.h"
+
 /** Buffer page (uncompressed or compressed) */
 typedef	struct buf_page_struct		buf_page_t;
 /** Buffer block for which an uncompressed page exists */
@@ -60,17 +62,10 @@ enum buf_io_fix {

 /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
 /* @{ */
-#if UNIV_WORD_SIZE <= 4 /* 32-bit system */
-/** Base-2 logarithm of the smallest buddy block size */
-# define BUF_BUDDY_LOW_SHIFT	6
-#else /* 64-bit system */
-/** Base-2 logarithm of the smallest buddy block size */
-# define BUF_BUDDY_LOW_SHIFT	7
-#endif
+#define BUF_BUDDY_LOW_SHIFT	PAGE_ZIP_MIN_SIZE_SHIFT
+
 #define BUF_BUDDY_LOW		(1 << BUF_BUDDY_LOW_SHIFT)
-					/*!< minimum block size in the binary
-					buddy system; must be at least
-					sizeof(buf_page_t) */
+
 #define BUF_BUDDY_SIZES		(UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
 #define BUF_BUDDY_SIZES_MAX	(UNIV_PAGE_SIZE_SHIFT_MAX - BUF_BUDDY_LOW_SHIFT)
 					/*!< number of buddy sizes */

--- a/include/db0err.h
+++ b/include/db0err.h
--- a/include/dict0boot.h
+++ b/include/dict0boot.h
--- a/include/dict0dict.h
+++ b/include/dict0dict.h
--- a/include/dict0dict.ic
+++ b/include/dict0dict.ic
--- a/include/dict0mem.h
+++ b/include/dict0mem.h
--- a/include/dict0types.h
+++ b/include/dict0types.h
--- a/include/ha_prototypes.h
+++ b/include/ha_prototypes.h
--- a/include/mtr0mtr.h
+++ b/include/mtr0mtr.h
--- a/include/os0file.h
+++ b/include/os0file.h
--- a/include/page0page.h
+++ b/include/page0page.h
--- a/include/row0row.h
+++ b/include/row0row.h
--- a/include/row0row.ic
+++ b/include/row0row.ic
--- a/include/row0upd.ic
+++ b/include/row0upd.ic
--- a/include/srv0srv.h
+++ b/include/srv0srv.h
--- a/include/sync0sync.h
+++ b/include/sync0sync.h
--- a/include/sync0sync.ic
+++ b/include/sync0sync.ic
--- a/include/trx0sys.h
+++ b/include/trx0sys.h
--- a/include/trx0sys.ic
+++ b/include/trx0sys.ic
--- a/include/trx0trx.h
+++ b/include/trx0trx.h
--- a/include/trx0undo.h
+++ b/include/trx0undo.h
--- a/include/ut0mem.h
+++ b/include/ut0mem.h
--- a/include/ut0mem.ic
+++ b/include/ut0mem.ic
--- a/lock/lock0lock.c
+++ b/lock/lock0lock.c
--- a/mtr/mtr0mtr.c
+++ b/mtr/mtr0mtr.c
--- a/os/os0file.c
+++ b/os/os0file.c
--- a/page/page0zip.c
+++ b/page/page0zip.c
--- a/pars/pars0opt.c
+++ b/pars/pars0opt.c
--- a/percona-suite/percona_innodb_expand_fast_index_creation.result
+++ b/percona-suite/percona_innodb_expand_fast_index_creation.result
--- a/percona-suite/percona_innodb_expand_fast_index_creation.test
+++ b/percona-suite/percona_innodb_expand_fast_index_creation.test
--- a/percona-suite/percona_innodb_fake_changes.result
+++ b/percona-suite/percona_innodb_fake_changes.result
--- a/percona-suite/percona_innodb_fake_changes.test
+++ b/percona-suite/percona_innodb_fake_changes.test
--- a/percona-suite/percona_innodb_fake_changes_locks.result
+++ b/percona-suite/percona_innodb_fake_changes_locks.result
--- a/percona-suite/percona_innodb_fake_changes_locks.test
+++ b/percona-suite/percona_innodb_fake_changes_locks.test
--- a/percona-suite/percona_innodb_kill_idle_trx.result
+++ b/percona-suite/percona_innodb_kill_idle_trx.result
--- a/percona-suite/percona_innodb_kill_idle_trx.test
+++ b/percona-suite/percona_innodb_kill_idle_trx.test
--- a/percona-suite/percona_innodb_kill_idle_trx_locks.result
+++ b/percona-suite/percona_innodb_kill_idle_trx_locks.result
--- a/percona-suite/percona_innodb_kill_idle_trx_locks.test
+++ b/percona-suite/percona_innodb_kill_idle_trx_locks.test
--- a/percona-suite/percona_log_connection_error.result
+++ b/percona-suite/percona_log_connection_error.result
--- a/percona-suite/percona_log_connection_error.test
+++ b/percona-suite/percona_log_connection_error.test
--- a/percona-suite/percona_processlist_row_stats.result
+++ b/percona-suite/percona_processlist_row_stats.result
--- a/percona-suite/percona_processlist_row_stats.test
+++ b/percona-suite/percona_processlist_row_stats.test
--- a/percona-suite/percona_query_response_time-replication.result
+++ b/percona-suite/percona_query_response_time-replication.result
--- a/percona-suite/percona_query_response_time-replication.test
+++ b/percona-suite/percona_query_response_time-replication.test
--- a/percona-suite/percona_query_response_time-stored.result
+++ b/percona-suite/percona_query_response_time-stored.result
--- a/percona-suite/percona_query_response_time-stored.test
+++ b/percona-suite/percona_query_response_time-stored.test
--- a/percona-suite/percona_query_response_time.result
+++ b/percona-suite/percona_query_response_time.result
--- a/percona-suite/percona_query_response_time.test
+++ b/percona-suite/percona_query_response_time.test
--- a/percona-suite/percona_server_variables_debug.result
+++ b/percona-suite/percona_server_variables_debug.result
--- a/percona-suite/percona_server_variables_release.result
+++ b/percona-suite/percona_server_variables_release.result
--- a/percona-suite/percona_show_slave_status_nolock.result
+++ b/percona-suite/percona_show_slave_status_nolock.result
--- a/percona-suite/percona_show_slave_status_nolock.test
+++ b/percona-suite/percona_show_slave_status_nolock.test
--- a/percona-suite/percona_status_wait_query_cache_mutex.result
+++ b/percona-suite/percona_status_wait_query_cache_mutex.result
--- a/percona-suite/percona_status_wait_query_cache_mutex.test
+++ b/percona-suite/percona_status_wait_query_cache_mutex.test
--- a/percona-suite/query_response_time-replication.inc
+++ b/percona-suite/query_response_time-replication.inc
--- a/percona-suite/query_response_time-stored.inc
+++ b/percona-suite/query_response_time-stored.inc
--- a/percona-suite/query_response_time.inc
+++ b/percona-suite/query_response_time.inc
--- a/que/que0que.c
+++ b/que/que0que.c
--- a/rem/rem0rec.c
+++ b/rem/rem0rec.c
--- a/row/row0ins.c
+++ b/row/row0ins.c
--- a/row/row0merge.c
+++ b/row/row0merge.c
--- a/row/row0mysql.c
+++ b/row/row0mysql.c
--- a/row/row0purge.c
+++ b/row/row0purge.c
--- a/row/row0row.c
+++ b/row/row0row.c
--- a/row/row0sel.c
+++ b/row/row0sel.c
--- a/row/row0uins.c
+++ b/row/row0uins.c
--- a/row/row0umod.c
+++ b/row/row0umod.c
--- a/row/row0upd.c
+++ b/row/row0upd.c
--- a/srv/srv0srv.c
+++ b/srv/srv0srv.c
--- a/srv/srv0start.c
+++ b/srv/srv0start.c
--- a/sync/sync0sync.c
+++ b/sync/sync0sync.c
--- a/trx/trx0rec.c
+++ b/trx/trx0rec.c
--- a/trx/trx0trx.c
+++ b/trx/trx0trx.c
--- a/trx/trx0undo.c
+++ b/trx/trx0undo.c
--- a/ut/ut0mem.c
+++ b/ut/ut0mem.c
--- a/ut/ut0ut.c
+++ b/ut/ut0ut.c