Revert most of revno 3560.9.1 (Bug#12704861)

This was an attempt to address problems with the Bug#12612184 fix. Even with this follow-up fix, crash recovery can be broken. Let us fix the bug later.

Revert most of revno 3560.9.1 (Bug#12704861)
This was an attempt to address problems with the Bug#12612184 fix. Even with this follow-up fix, crash recovery can be broken. Let us fix the bug later.
91b5e935 · Marko Mäkelä · 5cd2cb0c · 91b5e935 · 91b5e935 · 91b5e935
Commit 91b5e935 authored Oct 26, 2011 by Marko Mäkelä
29 changed files
--- a/mysql-test/suite/innodb_plugin/r/innodb-index.result
+++ b/mysql-test/suite/innodb_plugin/r/innodb-index.result
@@ -1094,6 +1094,20 @@ COMMIT;
 UPDATE bug12547647 SET c = REPEAT('b',16928);
 ERROR 42000: Row size too large. The maximum row size for the used table type, not counting BLOBs, is 8126. You have to change some columns to TEXT or BLOBs
 DROP TABLE bug12547647;
+SET @r=REPEAT('a',500);
+CREATE TABLE t1(a INT,
+v1 VARCHAR(500), v2 VARCHAR(500), v3 VARCHAR(500),
+v4 VARCHAR(500), v5 VARCHAR(500), v6 VARCHAR(500),
+v7 VARCHAR(500), v8 VARCHAR(500), v9 VARCHAR(500),
+v10 VARCHAR(500), v11 VARCHAR(500), v12 VARCHAR(500),
+v13 VARCHAR(500), v14 VARCHAR(500), v15 VARCHAR(500),
+v16 VARCHAR(500), v17 VARCHAR(500), v18 VARCHAR(500)
+) ENGINE=InnoDB ROW_FORMAT=DYNAMIC;
+CREATE INDEX idx1 ON t1(a,v1);
+INSERT INTO t1 VALUES(9,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r);
+UPDATE t1 SET a=1000;
+DELETE FROM t1;
+DROP TABLE t1;
 set global innodb_file_per_table=0;
 set global innodb_file_format=Antelope;
 set global innodb_file_format_check=Antelope;

--- a/mysql-test/suite/innodb_plugin/t/innodb-index.test
+++ b/mysql-test/suite/innodb_plugin/t/innodb-index.test
@@ -527,13 +527,30 @@ CREATE TABLE bug12547647(
 a INT NOT NULL, b BLOB NOT NULL, c TEXT,
 PRIMARY KEY (b(10), a), INDEX (c(10))
 ) ENGINE=InnoDB ROW_FORMAT=DYNAMIC;
 INSERT INTO bug12547647 VALUES (5,repeat('khdfo5AlOq',1900),repeat('g',7731));
 COMMIT;
 # The following used to cause infinite undo log allocation.
 --error ER_TOO_BIG_ROWSIZE
 UPDATE bug12547647 SET c = REPEAT('b',16928);
 DROP TABLE bug12547647;
+# Bug#12637786
+SET @r=REPEAT('a',500);
+CREATE TABLE t1(a INT,
+ v1 VARCHAR(500), v2 VARCHAR(500), v3 VARCHAR(500),
+ v4 VARCHAR(500), v5 VARCHAR(500), v6 VARCHAR(500),
+ v7 VARCHAR(500), v8 VARCHAR(500), v9 VARCHAR(500),
+ v10 VARCHAR(500), v11 VARCHAR(500), v12 VARCHAR(500),
+ v13 VARCHAR(500), v14 VARCHAR(500), v15 VARCHAR(500),
+ v16 VARCHAR(500), v17 VARCHAR(500), v18 VARCHAR(500)
+) ENGINE=InnoDB ROW_FORMAT=DYNAMIC;
+CREATE INDEX idx1 ON t1(a,v1);
+INSERT INTO t1 VALUES(9,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r,@r);
+UPDATE t1 SET a=1000;
+DELETE FROM t1;
+# Let the purge thread clean up this file.
+-- sleep 10
+DROP TABLE t1;
 eval set global innodb_file_per_table=$per_table;
 eval set global innodb_file_format=$format;

--- a/storage/innobase/btr/btr0btr.c
+++ b/storage/innobase/btr/btr0btr.c
@@ -300,30 +300,29 @@ btr_page_alloc_for_ibuf(
 /******************************************************************
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents! */
-static
-ulint
+page_t*
-btr_page_alloc_low(
+btr_page_alloc(
-/*===============*/
+/*===========*/
-					/* out: allocated page number,
+					/* out: new allocated page, x-latched;
-					FIL_NULL if out of space */
+					NULL if out of space */
 	dict_index_t*	index,		/* in: index */
 	ulint		hint_page_no,	/* in: hint of a good page */
 	byte		file_direction,	/* in: direction where a possible
 					page split is made */
 	ulint		level,		/* in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr,		/* in/out: mini-transaction
+	mtr_t*		mtr)		/* in: mtr */
-					for the allocation */
-	mtr_t*		init_mtr)	/* in/out: mini-transaction
-					in which the page should be
-					initialized (may be the same
-					as mtr), or NULL if it should
-					not be initialized (the page
-					at hint was previously freed
-					in mtr) */
 {
 	fseg_header_t*	seg_header;
 	page_t*		root;
+	page_t*		new_page;
+	ulint		new_page_no;
+	if (index->type & DICT_IBUF) {
+		return(btr_page_alloc_for_ibuf(index, mtr));
+	}
 	root = btr_root_get(index, mtr);
@@ -337,61 +336,19 @@ btr_page_alloc_low(
 	reservation for free extents, and thus we know that a page can
 	be allocated: */
-	return(fseg_alloc_free_page_general(seg_header, hint_page_no,
+	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
-					    file_direction, TRUE,
+						   file_direction, TRUE, mtr);
-					    mtr, init_mtr));
-}
-/**************************************************************//**
-Allocates a new file page to be used in an index tree. NOTE: we assume
-that the caller has made the reservation for free extents! */
-page_t*
-btr_page_alloc(
-/*===========*/
-					/* out:	new allocated block, x-latched;
-					NULL if out of space */
-	dict_index_t*	index,		/* in: index */
-	ulint		hint_page_no,	/* in: hint of a good page */
-	byte		file_direction,	/* in: direction where a possible
-					page split is made */
-	ulint		level,		/* in: level where the page is placed
-					in the tree */
-	mtr_t*		mtr,		/* in/out: mini-transaction
-					for the allocation */
-	mtr_t*		init_mtr)	/* in/out: mini-transaction
-					for x-latching and initializing
-					the page */
-{
-	page_t*		new_page;
-	ulint		new_page_no;
-	if (index->type & DICT_IBUF) {
-		return(btr_page_alloc_for_ibuf(index, mtr));
-	}
-	new_page_no = btr_page_alloc_low(
-		index, hint_page_no, file_direction, level, mtr, init_mtr);
 	if (new_page_no == FIL_NULL) {
 		return(NULL);
 	}
 	new_page = buf_page_get(dict_index_get_space(index), new_page_no,
-				RW_X_LATCH, init_mtr);
+				RW_X_LATCH, mtr);
 #ifdef UNIV_SYNC_DEBUG
 	buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW);
 #endif /* UNIV_SYNC_DEBUG */
-	if (mtr->freed_clust_leaf) {
-		mtr_memo_release(mtr, new_page, MTR_MEMO_FREE_CLUST_LEAF);
-		ut_ad(!mtr_memo_contains(mtr, buf_block_align(new_page),
-					 MTR_MEMO_FREE_CLUST_LEAF));
-	}
-	ut_ad(btr_freed_leaves_validate(mtr));
 	return(new_page);
 }
@@ -538,138 +495,8 @@ btr_page_free(
 	level = btr_page_get_level(page, mtr);
 	btr_page_free_low(index, page, level, mtr);
-	/* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */
-	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
-				MTR_MEMO_PAGE_X_FIX));
-	if (level == 0 && (index->type & DICT_CLUSTERED)) {
-		/* We may have to call btr_mark_freed_leaves() to
-		temporarily mark the block nonfree for invoking
-		btr_store_big_rec_extern_fields() after an
-		update. Remember that the block was freed. */
-		mtr->freed_clust_leaf = TRUE;
-		mtr_memo_push(mtr, buf_block_align(page),
-			      MTR_MEMO_FREE_CLUST_LEAF);
-	}
-	ut_ad(btr_freed_leaves_validate(mtr));
 }
-/**************************************************************//**
-Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
-For invoking btr_store_big_rec_extern_fields() after an update,
-we must temporarily mark freed clustered index pages allocated, so
-that off-page columns will not be allocated from them. Between the
-btr_store_big_rec_extern_fields() and mtr_commit() we have to
-mark the pages free again, so that no pages will be leaked. */
-void
-btr_mark_freed_leaves(
-/*==================*/
-	dict_index_t*	index,	/* in/out: clustered index */
-	mtr_t*		mtr,	/* in/out: mini-transaction */
-	ibool		nonfree)/* in: TRUE=mark nonfree, FALSE=mark freed */
-{
-	/* This is loosely based on mtr_memo_release(). */
-	ulint	offset;
-	ut_ad(index->type & DICT_CLUSTERED);
-	ut_ad(mtr->magic_n == MTR_MAGIC_N);
-	ut_ad(mtr->state == MTR_ACTIVE);
-	if (!mtr->freed_clust_leaf) {
-		return;
-	}
-	offset = dyn_array_get_data_size(&mtr->memo);
-	while (offset > 0) {
-		mtr_memo_slot_t*	slot;
-		buf_block_t*		block;
-		offset -= sizeof *slot;
-		slot = dyn_array_get_element(&mtr->memo, offset);
-		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
-			continue;
-		}
-		/* Because btr_page_alloc() does invoke
-		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
-		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
-		memo must still be clustered index leaf tree pages. */
-		block = slot->object;
-		ut_a(buf_block_get_space(block)
-		     == dict_index_get_space(index));
-		ut_a(fil_page_get_type(buf_block_get_frame(block))
-		     == FIL_PAGE_INDEX);
-		ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0);
-		if (nonfree) {
-			/* Allocate the same page again. */
-			ulint	page_no;
-			page_no = btr_page_alloc_low(
-				index, buf_block_get_page_no(block),
-				FSP_NO_DIR, 0, mtr, NULL);
-			ut_a(page_no == buf_block_get_page_no(block));
-		} else {
-			/* Assert that the page is allocated and free it. */
-			btr_page_free_low(index, buf_block_get_frame(block),
-					  0, mtr);
-		}
-	}
-	ut_ad(btr_freed_leaves_validate(mtr));
-}
-#ifdef UNIV_DEBUG
-/**************************************************************//**
-Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
-See btr_mark_freed_leaves(). */
-ibool
-btr_freed_leaves_validate(
-/*======================*/
-			/* out: TRUE if valid */
-	mtr_t*	mtr)	/* in: mini-transaction */
-{
-	ulint	offset;
-	ut_ad(mtr->magic_n == MTR_MAGIC_N);
-	ut_ad(mtr->state == MTR_ACTIVE);
-	offset = dyn_array_get_data_size(&mtr->memo);
-	while (offset > 0) {
-		mtr_memo_slot_t*	slot;
-		buf_block_t*		block;
-		offset -= sizeof *slot;
-		slot = dyn_array_get_element(&mtr->memo, offset);
-		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
-			continue;
-		}
-		ut_a(mtr->freed_clust_leaf);
-		/* Because btr_page_alloc() does invoke
-		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
-		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
-		memo must still be clustered index leaf tree pages. */
-		block = slot->object;
-		ut_a(fil_page_get_type(buf_block_get_frame(block))
-		     == FIL_PAGE_INDEX);
-		ut_a(btr_page_get_level(buf_block_get_frame(block), mtr) == 0);
-	}
-	return(TRUE);
-}
-#endif /* UNIV_DEBUG */
 /******************************************************************
 Sets the child node file address in a node pointer. */
 UNIV_INLINE
@@ -1199,7 +1026,7 @@ btr_root_raise_and_insert(
 	a node pointer to the new page, and then splitting the new page. */
 	new_page = btr_page_alloc(index, 0, FSP_NO_DIR,
-				  btr_page_get_level(root, mtr), mtr, mtr);
+				  btr_page_get_level(root, mtr), mtr);
 	btr_page_create(new_page, index, mtr);
@@ -1820,7 +1647,7 @@ func_start:
 	/* 2. Allocate a new page to the index */
 	new_page = btr_page_alloc(cursor->index, hint_page_no, direction,
-				  btr_page_get_level(page, mtr), mtr, mtr);
+				  btr_page_get_level(page, mtr), mtr);
 	btr_page_create(new_page, cursor->index, mtr);
 	/* 3. Calculate the first record on the upper half-page, and the

--- a/storage/innobase/btr/btr0cur.c
+++ b/storage/innobase/btr/btr0cur.c
@@ -2051,6 +2051,43 @@ return_after_reservations:
 	return(err);
 }
+/*****************************************************************
+Commits and restarts a mini-transaction so that it will retain an
+x-lock on index->lock and the cursor page.
+On entry, mtr is expected to hold index->lock in X mode and the
+cursor page X-fixed; both are checked with ut_ad(). The function
+requests each of the two x-locks one more time and buffer-fixes the
+block one more time, commits mtr, starts it again, and finally
+registers the index lock and the block in the memo of the restarted
+mtr, so that the next mtr_commit(mtr) releases them. */
+void
+btr_cur_mtr_commit_and_start(
+/*=========================*/
+	btr_cur_t*	cursor,	/* in: cursor */
+	mtr_t*		mtr)	/* in/out: mini-transaction */
+{
+	buf_block_t*	block;
+	block = buf_block_align(btr_cur_get_rec(cursor));
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	/* Keep the locks across the mtr_commit(mtr).
+	NOTE(review): presumably the additional x-lock requests and
+	the additional buffer-fix below are what remains held once
+	mtr_commit() has released the memo entries asserted above;
+	confirm against mtr_memo_slot_release(). The buffer-fix count
+	is changed while holding block->mutex. */
+	rw_lock_x_lock(dict_index_get_lock(cursor->index));
+	rw_lock_x_lock(&block->lock);
+	mutex_enter(&block->mutex);
+#ifdef UNIV_SYNC_DEBUG
+	buf_block_buf_fix_inc_debug(block, __FILE__, __LINE__);
+#else /* UNIV_SYNC_DEBUG */
+	buf_block_buf_fix_inc(block);
+#endif /* UNIV_SYNC_DEBUG */
+	mutex_exit(&block->mutex);
+	/* Write out the redo log. */
+	mtr_commit(mtr);
+	mtr_start(mtr);
+	/* Reassociate the locks with the mini-transaction.
+	They will be released on mtr_commit(mtr). */
+	mtr_memo_push(mtr, dict_index_get_lock(cursor->index),
+		      MTR_MEMO_X_LOCK);
+	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+}
 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
 /********************************************************************
@@ -3449,11 +3486,6 @@ btr_store_big_rec_extern_fields(
 					this function returns */
 	big_rec_t*	big_rec_vec,	/* in: vector containing fields
 					to be stored externally */
-	mtr_t*		alloc_mtr,	/* in/out: in an insert, NULL;
-					in an update, local_mtr for
-					allocating BLOB pages and
-					updating BLOB pointers; alloc_mtr
-					must not have freed any leaf pages */
 	mtr_t*		local_mtr __attribute__((unused))) /* in: mtr
 					containing the latch to rec and to the
 					tree */
@@ -3474,8 +3506,6 @@ btr_store_big_rec_extern_fields(
 	ulint	i;
 	mtr_t	mtr;
-	ut_ad(local_mtr);
-	ut_ad(!alloc_mtr || alloc_mtr == local_mtr);
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
 				MTR_MEMO_X_LOCK));
@@ -3485,25 +3515,6 @@ btr_store_big_rec_extern_fields(
 	space_id = buf_frame_get_space_id(rec);
-	if (alloc_mtr) {
-		/* Because alloc_mtr will be committed after
-		mtr, it is possible that the tablespace has been
-		extended when the B-tree record was updated or
-		inserted, or it will be extended while allocating
-		pages for big_rec.
-		TODO: In mtr (not alloc_mtr), write a redo log record
-		about extending the tablespace to its current size,
-		and remember the current size. Whenever the tablespace
-		grows as pages are allocated, write further redo log
-		records to mtr. (Currently tablespace extension is not
-		covered by the redo log. If it were, the record would
-		only be written to alloc_mtr, which is committed after
-		mtr.) */
-	} else {
-		alloc_mtr = &mtr;
-	}
 	/* We have to create a file segment to the tablespace
 	for each field and put the pointer to the field in rec */
@@ -3530,7 +3541,7 @@ btr_store_big_rec_extern_fields(
 			}
 			page = btr_page_alloc(index, hint_page_no,
-					      FSP_NO_DIR, 0, alloc_mtr, &mtr);
+					      FSP_NO_DIR, 0, &mtr);
 			if (page == NULL) {
 				mtr_commit(&mtr);
@@ -3584,42 +3595,37 @@ btr_store_big_rec_extern_fields(
 			extern_len -= store_len;
-			if (alloc_mtr == &mtr) {
 #ifdef UNIV_SYNC_DEBUG
-				rec_page =
+			rec_page =
 #endif /* UNIV_SYNC_DEBUG */
-					buf_page_get(
+			buf_page_get(space_id,
-						space_id,
+				     buf_frame_get_page_no(data),
-						buf_frame_get_page_no(data),
+				     RW_X_LATCH, &mtr);
-						RW_X_LATCH, &mtr);
 #ifdef UNIV_SYNC_DEBUG
-				buf_page_dbg_add_level(
+			buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK);
-					rec_page, SYNC_NO_ORDER_CHECK);
 #endif /* UNIV_SYNC_DEBUG */
-			}
 			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0,
-					 MLOG_4BYTES, alloc_mtr);
+					 MLOG_4BYTES, &mtr);
 			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4,
 					 big_rec_vec->fields[i].len
 					 - extern_len,
-					 MLOG_4BYTES, alloc_mtr);
+					 MLOG_4BYTES, &mtr);
 			if (prev_page_no == FIL_NULL) {
 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_SPACE_ID,
 						 space_id,
-						 MLOG_4BYTES, alloc_mtr);
+						 MLOG_4BYTES, &mtr);
 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_PAGE_NO,
 						 page_no,
-						 MLOG_4BYTES, alloc_mtr);
+						 MLOG_4BYTES, &mtr);
 				mlog_write_ulint(data + local_len
 						 + BTR_EXTERN_OFFSET,
 						 FIL_PAGE_DATA,
-						 MLOG_4BYTES, alloc_mtr);
+						 MLOG_4BYTES, &mtr);
 				/* Set the bit denoting that this field
 				in rec is stored externally */
@@ -3627,7 +3633,7 @@ btr_store_big_rec_extern_fields(
 				rec_set_nth_field_extern_bit(
 					rec, index,
 					big_rec_vec->fields[i].field_no,
-					TRUE, alloc_mtr);
+					TRUE, &mtr);
 			}
 			prev_page_no = page_no;

--- a/storage/innobase/fsp/fsp0fsp.c
+++ b/storage/innobase/fsp/fsp0fsp.c
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -379,11 +379,7 @@ btr_page_alloc(
 					page split is made */
 	ulint		level,		/* in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr,		/* in/out: mini-transaction
+	mtr_t*		mtr);		/* in: mtr */
-					for the allocation */
-	mtr_t*		init_mtr);	/* in/out: mini-transaction
-					for x-latching and initializing
-					the page */
 /******************************************************************
 Frees a file page used in an index tree. NOTE: cannot free field external
 storage pages because the page must contain info on its level. */
@@ -406,31 +402,6 @@ btr_page_free_low(
 	page_t*		page,	/* in: page to be freed, x-latched */
 	ulint		level,	/* in: page level */
 	mtr_t*		mtr);	/* in: mtr */
-/**************************************************************//**
-Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
-For invoking btr_store_big_rec_extern_fields() after an update,
-we must temporarily mark freed clustered index pages allocated, so
-that off-page columns will not be allocated from them. Between the
-btr_store_big_rec_extern_fields() and mtr_commit() we have to
-mark the pages free again, so that no pages will be leaked. */
-void
-btr_mark_freed_leaves(
-/*==================*/
-	dict_index_t*	index,	/* in/out: clustered index */
-	mtr_t*		mtr,	/* in/out: mini-transaction */
-	ibool		nonfree);/* in: TRUE=mark nonfree, FALSE=mark freed */
-#ifdef UNIV_DEBUG
-/**************************************************************//**
-Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
-See btr_mark_freed_leaves(). */
-ibool
-btr_freed_leaves_validate(
-/*======================*/
-			/* out: TRUE if valid */
-	mtr_t*	mtr);	/* in: mini-transaction */
-#endif /* UNIV_DEBUG */
 #ifdef UNIV_BTR_PRINT
 /*****************************************************************
 Prints size info of a B-tree. */

--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -252,6 +252,15 @@ btr_cur_pessimistic_update(
 				updates */
 	que_thr_t*	thr,	/* in: query thread */
 	mtr_t*		mtr);	/* in: mtr */
+/*****************************************************************
+Commits and restarts a mini-transaction so that it will retain an
+x-lock on index->lock and the cursor page.
+The mini-transaction is committed and then started again; on return
+its memo contains the x-lock on index->lock and the x-fixed cursor
+page, which are released by the next mtr_commit(mtr). The caller
+must already hold both latches in mtr when calling this. */
+void
+btr_cur_mtr_commit_and_start(
+/*=========================*/
+	btr_cur_t*	cursor,	/* in: cursor */
+	mtr_t*		mtr);	/* in/out: mini-transaction */
 /***************************************************************
 Marks a clustered index record deleted. Writes an undo log record to
 undo log on this delete marking. Writes in the trx id field the id
@@ -462,11 +471,6 @@ btr_store_big_rec_extern_fields(
 					this function returns */
 	big_rec_t*	big_rec_vec,	/* in: vector containing fields
 					to be stored externally */
-	mtr_t*		alloc_mtr,	/* in/out: in an insert, NULL;
-					in an update, local_mtr for
-					allocating BLOB pages and
-					updating BLOB pointers; alloc_mtr
-					must not have freed any leaf pages */
 	mtr_t*		local_mtr);	/* in: mtr containing the latch to
 					rec and to the tree */
 /***********************************************************************

--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -179,11 +179,7 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr,	/* in/out: mini-transaction */
+	mtr_t*		mtr);	/* in/out: mini-transaction */
-	mtr_t*		init_mtr);/* in/out: mtr or another mini-transaction
-				in which the page should be initialized,
-				or NULL if this is a "fake allocation" of
-				a page that was previously freed in mtr */
 /**************************************************************************
 Reserves free pages from a tablespace. All mini-transactions which may
 use several pages from the tablespace should call this function beforehand

--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -36,8 +36,6 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
 #define MTR_MEMO_MODIFY		54
 #define	MTR_MEMO_S_LOCK		55
 #define	MTR_MEMO_X_LOCK		56
-/* The mini-transaction freed a clustered index leaf page. */
-#define MTR_MEMO_FREE_CLUST_LEAF	57
 /* Log item types: we have made them to be of the type 'byte'
 for the compiler to warn if val and type parameters are switched
@@ -317,12 +315,9 @@ struct mtr_struct{
 	ulint		state;	/* MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */
 	dyn_array_t	memo;	/* memo stack for locks etc. */
 	dyn_array_t	log;	/* mini-transaction log */
-	unsigned	modifications:1;
+	ibool		modifications;
 				/* TRUE if the mtr made modifications to
 				buffer pool pages */
-	unsigned	freed_clust_leaf:1;
-				/* TRUE if MTR_MEMO_FREE_CLUST_LEAF
-				was logged in the mini-transaction */
 	ulint		n_log_recs;
 				/* count of how many page initial log records
 				have been written to the mtr log */

--- a/storage/innobase/include/mtr0mtr.ic
+++ b/storage/innobase/include/mtr0mtr.ic
@@ -26,7 +26,6 @@ mtr_start(
 	mtr->log_mode = MTR_LOG_ALL;
 	mtr->modifications = FALSE;
-	mtr->freed_clust_leaf = FALSE;
 	mtr->n_log_recs = 0;
 #ifdef UNIV_DEBUG
@@ -51,8 +50,7 @@ mtr_memo_push(
 	ut_ad(object);
 	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
-	ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF);
+	ut_ad(type <= MTR_MEMO_X_LOCK);
-	ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf);
 	ut_ad(mtr);
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);

--- a/storage/innobase/mtr/mtr0mtr.c
+++ b/storage/innobase/mtr/mtr0mtr.c
@@ -53,13 +53,17 @@ mtr_memo_slot_release(
 			buf_page_release((buf_block_t*)object, type, mtr);
 		} else if (type == MTR_MEMO_S_LOCK) {
 			rw_lock_s_unlock((rw_lock_t*)object);
-		} else if (type != MTR_MEMO_X_LOCK) {
+#ifdef UNIV_DEBUG
-			ut_ad(type == MTR_MEMO_MODIFY
+		} else if (type == MTR_MEMO_X_LOCK) {
-			      || type == MTR_MEMO_FREE_CLUST_LEAF);
+			rw_lock_x_unlock((rw_lock_t*)object);
+		} else {
+			ut_ad(type == MTR_MEMO_MODIFY);
 			ut_ad(mtr_memo_contains(mtr, object,
 						MTR_MEMO_PAGE_X_FIX));
+#else
 		} else {
 			rw_lock_x_unlock((rw_lock_t*)object);
+#endif
 		}
 	}

--- a/storage/innobase/row/row0ins.c
+++ b/storage/innobase/row/row0ins.c
@@ -2090,20 +2090,15 @@ row_ins_index_entry_low(
 			if (big_rec) {
 				ut_a(err == DB_SUCCESS);
 				/* Write out the externally stored
-				columns, but allocate the pages and
+				columns while still x-latching
-				write the pointers using the
+				index->lock and block->lock. We have
-				mini-transaction of the record update.
+				to mtr_commit(mtr) first, so that the
-				If any pages were freed in the update,
+				redo log will be written in the
-				temporarily mark them allocated so
+				correct order. Otherwise, we would run
-				that off-page columns will not
+				into trouble on crash recovery if mtr
-				overwrite them. We must do this,
+				freed B-tree pages on which some of
-				because we will write the redo log for
+				the big_rec fields will be written. */
-				the BLOB writes before writing the
+				btr_cur_mtr_commit_and_start(&cursor, &mtr);
-				redo log for the record update. Thus,
-				redo log application at crash recovery
-				will see BLOBs being written to free pages. */
-				btr_mark_freed_leaves(index, &mtr, TRUE);
 				rec = btr_cur_get_rec(&cursor);
 				offsets = rec_get_offsets(rec, index, offsets,
@@ -2111,8 +2106,7 @@ row_ins_index_entry_low(
 							  &heap);
 				err = btr_store_big_rec_extern_fields(
-					index, rec, offsets, big_rec,
+					index, rec, offsets, big_rec, &mtr);
-					&mtr, &mtr);
 				/* If writing big_rec fails (for
 				example, because of DB_OUT_OF_FILE_SPACE),
 				the record will be corrupted. Even if
@@ -2125,9 +2119,6 @@ row_ins_index_entry_low(
 				undo log, and thus the record cannot
 				be rolled back. */
 				ut_a(err == DB_SUCCESS);
-				/* Free the pages again
-				in order to avoid a leak. */
-				btr_mark_freed_leaves(index, &mtr, FALSE);
 				goto stored_big_rec;
 			}
 		} else {
@@ -2175,8 +2166,7 @@ function_exit:
 					  ULINT_UNDEFINED, &heap);
 		err = btr_store_big_rec_extern_fields(index, rec,
-						      offsets, big_rec,
+						      offsets, big_rec, &mtr);
-						      NULL, &mtr);
 stored_big_rec:
 		if (modify) {
 			dtuple_big_rec_free(big_rec);

--- a/storage/innobase/row/row0row.c
+++ b/storage/innobase/row/row0row.c
@@ -212,27 +212,23 @@ row_build(
 	}
 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
-	if (rec_offs_any_null_extern(rec, offsets)) {
+	/* This condition can occur during crash recovery before
-		/* This condition can occur during crash recovery
+	trx_rollback_or_clean_all_without_sess() has completed
-		before trx_rollback_or_clean_all_without_sess() has
+	execution.
-		completed execution.
+	This condition is possible if the server crashed
-		This condition is possible if the server crashed
+	during an insert or update before
-		during an insert or update before
+	btr_store_big_rec_extern_fields() did mtr_commit() all
-		btr_store_big_rec_extern_fields() did mtr_commit() all
+	BLOB pointers to the clustered index record.
-		BLOB pointers to the clustered index record.
+	If the record contains a null BLOB pointer, look up the
-		If the record contains a null BLOB pointer, look up the
+	transaction that holds the implicit lock on this record, and
-		transaction that holds the implicit lock on this record, and
+	assert that it is active. (In this version of InnoDB, we
-		assert that it is active. (In this version of InnoDB, we
+	cannot assert that it was recovered, because there is no
-		cannot assert that it was recovered, because there is no
+	trx->is_recovered field.) */
-		trx->is_recovered field.) */
+	ut_a(!rec_offs_any_null_extern(rec, offsets)
-		ut_a(trx_assert_active(
+	     || trx_assert_active(row_get_rec_trx_id(rec, index, offsets)));
-			     row_get_rec_trx_id(rec, index, offsets)));
-		ut_a(trx_undo_roll_ptr_is_insert(
-			     row_get_rec_roll_ptr(rec, index, offsets)));
-	}
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 	if (type != ROW_COPY_POINTERS) {

--- a/storage/innobase/row/row0upd.c
+++ b/storage/innobase/row/row0upd.c
@@ -1591,22 +1591,21 @@ row_upd_clust_rec(
 		*offsets_ = (sizeof offsets_) / sizeof *offsets_;
 		ut_a(err == DB_SUCCESS);
-		/* Write out the externally stored columns, but
+		/* Write out the externally stored columns while still
-		allocate the pages and write the pointers using the
+		x-latching index->lock and block->lock. We have to
-		mini-transaction of the record update. If any pages
+		mtr_commit(mtr) first, so that the redo log will be
-		were freed in the update, temporarily mark them
+		written in the correct order. Otherwise, we would run
-		allocated so that off-page columns will not overwrite
+		into trouble on crash recovery if mtr freed B-tree
-		them. We must do this, because we write the redo log
+		pages on which some of the big_rec fields will be
-		for the BLOB writes before writing the redo log for
+		written. */
-		the record update. */
+		btr_cur_mtr_commit_and_start(btr_cur, mtr);
-		btr_mark_freed_leaves(index, mtr, TRUE);
 		rec = btr_cur_get_rec(btr_cur);
 		err = btr_store_big_rec_extern_fields(
 			index, rec,
 			rec_get_offsets(rec, index, offsets_,
 					ULINT_UNDEFINED, &heap),
-			big_rec, mtr, mtr);
+			big_rec, mtr);
 		if (UNIV_LIKELY_NULL(heap)) {
 			mem_heap_free(heap);
 		}
@@ -1619,8 +1618,6 @@ row_upd_clust_rec(
 		to the undo log, and thus the record cannot be rolled
 		back. */
 		ut_a(err == DB_SUCCESS);
-		/* Free the pages again in order to avoid a leak. */
-		btr_mark_freed_leaves(index, mtr, FALSE);
 	}
 	mtr_commit(mtr);

--- a/storage/innobase/trx/trx0undo.c
+++ b/storage/innobase/trx/trx0undo.c
@@ -864,7 +864,7 @@ trx_undo_add_page(
 	page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR
 					       + TRX_UNDO_FSEG_HEADER,
 					       undo->top_page_no + 1, FSP_UP,
-					       TRUE, mtr, mtr);
+					       TRUE, mtr);
 	fil_space_release_free_extents(undo->space, n_reserved);

--- a/storage/innodb_plugin/ChangeLog
+++ b/storage/innodb_plugin/ChangeLog
@@ -50,15 +50,6 @@
 	* include/trx0undo.h, trx/trx0rec.c, trx/trx0undo.c:
 	Fix Bug#12547647 UPDATE LOGGING COULD EXCEED LOG PAGE SIZE
-2011-08-29	The InnoDB Team
-	* btr/btr0btr.c, btr/btr0cur.c, fsp/fsp0fsp.c,
-	include/btr0btr.h, include/btr0cur.h, include/fsp0fsp.h,
-	include/mtr0mtr.h, include/mtr0mtr.ic, mtr/mtr0mtr.c,
-	row/row0ins.c, row/row0row.c, row/row0upd.c, trx/trx0undo.c:
-	Fix Bug#12704861 Corruption after a crash during BLOB update
-	and other regressions from the fix of Bug#12612184
 2011-08-15	The InnoDB Team
 	* btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, btr/btr0sea.c,

--- a/storage/innodb_plugin/btr/btr0btr.c
+++ b/storage/innodb_plugin/btr/btr0btr.c
@@ -906,29 +906,28 @@ btr_page_alloc_for_ibuf(
 /**************************************************************//**
 Allocates a new file page to be used in an index tree. NOTE: we assume
 that the caller has made the reservation for free extents!
-@return	allocated page number, FIL_NULL if out of space */
+@return	new allocated block, x-latched; NULL if out of space */
-static __attribute__((nonnull(1,5), warn_unused_result))
+UNIV_INTERN
-ulint
+buf_block_t*
-btr_page_alloc_low(
+btr_page_alloc(
-/*===============*/
+/*===========*/
 	dict_index_t*	index,		/*!< in: index */
 	ulint		hint_page_no,	/*!< in: hint of a good page */
 	byte		file_direction,	/*!< in: direction where a possible
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr,		/*!< in/out: mini-transaction
+	mtr_t*		mtr)		/*!< in: mtr */
-					for the allocation */
-	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
-					in which the page should be
-					initialized (may be the same
-					as mtr), or NULL if it should
-					not be initialized (the page
-					at hint was previously freed
-					in mtr) */
 {
 	fseg_header_t*	seg_header;
 	page_t*		root;
+	buf_block_t*	new_block;
+	ulint		new_page_no;
+	if (dict_index_is_ibuf(index)) {
+		return(btr_page_alloc_for_ibuf(index, mtr));
+	}
 	root = btr_root_get(index, mtr);
@@ -942,42 +941,8 @@ btr_page_alloc_low(
 	reservation for free extents, and thus we know that a page can
 	be allocated: */
-	return(fseg_alloc_free_page_general(
+	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
-		       seg_header, hint_page_no, file_direction,
+						   file_direction, TRUE, mtr);
-		       TRUE, mtr, init_mtr));
-}
-/**************************************************************//**
-Allocates a new file page to be used in an index tree. NOTE: we assume
-that the caller has made the reservation for free extents!
-@return	new allocated block, x-latched; NULL if out of space */
-UNIV_INTERN
-buf_block_t*
-btr_page_alloc(
-/*===========*/
-	dict_index_t*	index,		/*!< in: index */
-	ulint		hint_page_no,	/*!< in: hint of a good page */
-	byte		file_direction,	/*!< in: direction where a possible
-					page split is made */
-	ulint		level,		/*!< in: level where the page is placed
-					in the tree */
-	mtr_t*		mtr,		/*!< in/out: mini-transaction
-					for the allocation */
-	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
-					for x-latching and initializing
-					the page */
-{
-	buf_block_t*	new_block;
-	ulint		new_page_no;
-	if (dict_index_is_ibuf(index)) {
-		return(btr_page_alloc_for_ibuf(index, mtr));
-	}
-	new_page_no = btr_page_alloc_low(
-		index, hint_page_no, file_direction, level, mtr, init_mtr);
 	if (new_page_no == FIL_NULL) {
 		return(NULL);
@@ -985,16 +950,9 @@ btr_page_alloc(
 	new_block = buf_page_get(dict_index_get_space(index),
 				 dict_table_zip_size(index->table),
-				 new_page_no, RW_X_LATCH, init_mtr);
+				 new_page_no, RW_X_LATCH, mtr);
 	buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
-	if (mtr->freed_clust_leaf) {
-		mtr_memo_release(mtr, new_block, MTR_MEMO_FREE_CLUST_LEAF);
-		ut_ad(!mtr_memo_contains(mtr, new_block,
-					 MTR_MEMO_FREE_CLUST_LEAF));
-	}
-	ut_ad(btr_freed_leaves_validate(mtr));
 	return(new_block);
 }
@@ -1129,139 +1087,12 @@ btr_page_free(
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
 	mtr_t*		mtr)	/*!< in: mtr */
 {
-	const page_t*	page	= buf_block_get_frame(block);
+	ulint		level;
-	ulint		level	= btr_page_get_level(page, mtr);
-	ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX);
-	btr_page_free_low(index, block, level, mtr);
-	/* The handling of MTR_MEMO_FREE_CLUST_LEAF assumes this. */
-	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
-	if (level == 0 && dict_index_is_clust(index)) {
-		/* We may have to call btr_mark_freed_leaves() to
-		temporarily mark the block nonfree for invoking
-		btr_store_big_rec_extern_fields_func() after an
-		update. Remember that the block was freed. */
-		mtr->freed_clust_leaf = TRUE;
-		mtr_memo_push(mtr, block, MTR_MEMO_FREE_CLUST_LEAF);
-	}
-	ut_ad(btr_freed_leaves_validate(mtr));
-}
-/**************************************************************//**
-Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
-For invoking btr_store_big_rec_extern_fields() after an update,
-we must temporarily mark freed clustered index pages allocated, so
-that off-page columns will not be allocated from them. Between the
-btr_store_big_rec_extern_fields() and mtr_commit() we have to
-mark the pages free again, so that no pages will be leaked. */
-UNIV_INTERN
-void
-btr_mark_freed_leaves(
-/*==================*/
-	dict_index_t*	index,	/*!< in/out: clustered index */
-	mtr_t*		mtr,	/*!< in/out: mini-transaction */
-	ibool		nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */
-{
-	/* This is loosely based on mtr_memo_release(). */
-	ulint	offset;
-	ut_ad(dict_index_is_clust(index));
-	ut_ad(mtr->magic_n == MTR_MAGIC_N);
-	ut_ad(mtr->state == MTR_ACTIVE);
-	if (!mtr->freed_clust_leaf) {
-		return;
-	}
-	offset = dyn_array_get_data_size(&mtr->memo);
-	while (offset > 0) {
-		mtr_memo_slot_t*	slot;
-		buf_block_t*		block;
-		offset -= sizeof *slot;
-		slot = dyn_array_get_element(&mtr->memo, offset);
-		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
-			continue;
-		}
-		/* Because btr_page_alloc() does invoke
-		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
-		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
-		memo must still be clustered index leaf tree pages. */
-		block = slot->object;
-		ut_a(buf_block_get_space(block)
-		     == dict_index_get_space(index));
-		ut_a(fil_page_get_type(buf_block_get_frame(block))
-		     == FIL_PAGE_INDEX);
-		ut_a(page_is_leaf(buf_block_get_frame(block)));
-		if (nonfree) {
-			/* Allocate the same page again. */
-			ulint	page_no;
-			page_no = btr_page_alloc_low(
-				index, buf_block_get_page_no(block),
-				FSP_NO_DIR, 0, mtr, NULL);
-			ut_a(page_no == buf_block_get_page_no(block));
-		} else {
-			/* Assert that the page is allocated and free it. */
-			btr_page_free_low(index, block, 0, mtr);
-		}
-	}
-	ut_ad(btr_freed_leaves_validate(mtr));
-}
-#ifdef UNIV_DEBUG
-/**************************************************************//**
-Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
-@see btr_mark_freed_leaves()
-@return TRUE */
-UNIV_INTERN
-ibool
-btr_freed_leaves_validate(
-/*======================*/
-	mtr_t*	mtr)	/*!< in: mini-transaction */
-{
-	ulint	offset;
-	ut_ad(mtr->magic_n == MTR_MAGIC_N);
-	ut_ad(mtr->state == MTR_ACTIVE);
-	offset = dyn_array_get_data_size(&mtr->memo);
-	while (offset > 0) {
-		const mtr_memo_slot_t*	slot;
-		const buf_block_t*	block;
-		offset -= sizeof *slot;
-		slot = dyn_array_get_element(&mtr->memo, offset);
-		if (slot->type != MTR_MEMO_FREE_CLUST_LEAF) {
-			continue;
-		}
-		ut_a(mtr->freed_clust_leaf);
+	level = btr_page_get_level(buf_block_get_frame(block), mtr);
-		/* Because btr_page_alloc() does invoke
-		mtr_memo_release on MTR_MEMO_FREE_CLUST_LEAF, all
-		blocks tagged with MTR_MEMO_FREE_CLUST_LEAF in the
-		memo must still be clustered index leaf tree pages. */
-		block = slot->object;
-		ut_a(fil_page_get_type(buf_block_get_frame(block))
-		     == FIL_PAGE_INDEX);
-		ut_a(page_is_leaf(buf_block_get_frame(block)));
-	}
-	return(TRUE);
+	btr_page_free_low(index, block, level, mtr);
 }
-#endif /* UNIV_DEBUG */
 /**************************************************************//**
 Sets the child node file address in a node pointer. */
@@ -1984,7 +1815,7 @@ btr_root_raise_and_insert(
 	level = btr_page_get_level(root, mtr);
-	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
+	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
 	new_page = buf_block_get_frame(new_block);
 	new_page_zip = buf_block_get_page_zip(new_block);
 	ut_a(!new_page_zip == !root_page_zip);
@@ -2720,7 +2551,7 @@ func_start:
 	/* 2. Allocate a new page to the index */
 	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
-				   btr_page_get_level(page, mtr), mtr, mtr);
+				   btr_page_get_level(page, mtr), mtr);
 	new_page = buf_block_get_frame(new_block);
 	new_page_zip = buf_block_get_page_zip(new_block);
 	btr_page_create(new_block, new_page_zip, cursor->index,

--- a/storage/innodb_plugin/btr/btr0cur.c
+++ b/storage/innodb_plugin/btr/btr0cur.c
@@ -2421,6 +2421,39 @@ return_after_reservations:
 	return(err);
 }
+/**************************************************************//**
+Commits and restarts a mini-transaction so that it will retain an
+x-lock on index->lock and the cursor page.
+The callers visible in this change (row_ins_index_entry_low() and
+row_upd_clust_rec()) invoke this right before
+btr_store_big_rec_extern_fields(), so that the redo log of the record
+modification is written before the redo log of the off-page column
+writes.
+On entry, mtr must hold the x-lock on index->lock and an x-fix on the
+cursor block; both are checked with debug assertions. On return, mtr is
+a freshly started mini-transaction whose memo contains exactly these
+two entries. */
+UNIV_INTERN
+void
+btr_cur_mtr_commit_and_start(
+/*=========================*/
+	btr_cur_t*	cursor,	/*!< in: cursor; cursor->index and the
+				block of the cursor identify the latches
+				that are retained */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; committed
+				and then started again */
+{
+	buf_block_t*	block;
+	block = btr_cur_get_block(cursor);
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	/* Keep the locks across the mtr_commit(mtr).
+	The mini-transaction already holds both x-latches (asserted
+	above), so each call below takes one more hold on a latch that
+	is already owned. NOTE(review): this relies on the x-latches
+	being recursive and on mtr_commit() dropping only the holds
+	recorded in the memo; confirm against the rw-lock and
+	mtr_memo_slot_release() code. The block is also buffer-fixed
+	once more, under block->mutex, presumably so that the extra
+	fix survives the release done by mtr_commit(). */
+	rw_lock_x_lock(dict_index_get_lock(cursor->index));
+	rw_lock_x_lock(&block->lock);
+	mutex_enter(&block->mutex);
+	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+	mutex_exit(&block->mutex);
+	/* Write out the redo log.
+	mtr_start() then resets the mini-transaction state, so the memo
+	is empty until the pushes below. */
+	mtr_commit(mtr);
+	mtr_start(mtr);
+	/* Reassociate the locks with the mini-transaction.
+	They will be released on mtr_commit(mtr).
+	The extra latch holds and the extra buffer-fix taken above are
+	handed over to the memo here; nothing is latched anew. */
+	mtr_memo_push(mtr, dict_index_get_lock(cursor->index),
+		      MTR_MEMO_X_LOCK);
+	mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+}
 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
 /****************************************************************//**
@@ -3863,9 +3896,6 @@ btr_store_big_rec_extern_fields_func(
 					the "external storage" flags in offsets
 					will not correspond to rec when
 					this function returns */
-	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
-					to be stored externally */
 #ifdef UNIV_DEBUG
 	mtr_t*		local_mtr,	/*!< in: mtr containing the
 					latch to rec and to the tree */
@@ -3874,11 +3904,9 @@ btr_store_big_rec_extern_fields_func(
 	ibool		update_in_place,/*! in: TRUE if the record is updated
 					in place (not delete+insert) */
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
-	mtr_t*		alloc_mtr)	/*!< in/out: in an insert, NULL;
+	const big_rec_t*big_rec_vec)	/*!< in: vector containing fields
-					in an update, local_mtr for
+					to be stored externally */
-					allocating BLOB pages and
-					updating BLOB pointers; alloc_mtr
-					must not have freed any leaf pages */
 {
 	ulint	rec_page_no;
 	byte*	field_ref;
@@ -3897,9 +3925,6 @@ btr_store_big_rec_extern_fields_func(
 	ut_ad(rec_offs_validate(rec, index, offsets));
 	ut_ad(rec_offs_any_extern(offsets));
-	ut_ad(local_mtr);
-	ut_ad(!alloc_mtr || alloc_mtr == local_mtr);
-	ut_ad(!update_in_place || alloc_mtr);
 	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
 				MTR_MEMO_X_LOCK));
 	ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
@@ -3915,25 +3940,6 @@ btr_store_big_rec_extern_fields_func(
 	rec_page_no = buf_block_get_page_no(rec_block);
 	ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
-	if (alloc_mtr) {
-		/* Because alloc_mtr will be committed after
-		mtr, it is possible that the tablespace has been
-		extended when the B-tree record was updated or
-		inserted, or it will be extended while allocating
-		pages for big_rec.
-		TODO: In mtr (not alloc_mtr), write a redo log record
-		about extending the tablespace to its current size,
-		and remember the current size. Whenever the tablespace
-		grows as pages are allocated, write further redo log
-		records to mtr. (Currently tablespace extension is not
-		covered by the redo log. If it were, the record would
-		only be written to alloc_mtr, which is committed after
-		mtr.) */
-	} else {
-		alloc_mtr = &mtr;
-	}
 	if (UNIV_LIKELY_NULL(page_zip)) {
 		int	err;
@@ -4010,7 +4016,7 @@ btr_store_big_rec_extern_fields_func(
 			}
 			block = btr_page_alloc(index, hint_page_no,
-					       FSP_NO_DIR, 0, alloc_mtr, &mtr);
+					       FSP_NO_DIR, 0, &mtr);
 			if (UNIV_UNLIKELY(block == NULL)) {
 				mtr_commit(&mtr);
@@ -4137,15 +4143,11 @@ btr_store_big_rec_extern_fields_func(
 					goto next_zip_page;
 				}
-				if (alloc_mtr == &mtr) {
+				rec_block = buf_page_get(space_id, zip_size,
-					rec_block = buf_page_get(
+							 rec_page_no,
-						space_id, zip_size,
+							 RW_X_LATCH, &mtr);
-						rec_page_no,
+				buf_block_dbg_add_level(rec_block,
-						RW_X_LATCH, &mtr);
+							SYNC_NO_ORDER_CHECK);
-					buf_block_dbg_add_level(
-						rec_block,
-						SYNC_NO_ORDER_CHECK);
-				}
 				if (err == Z_STREAM_END) {
 					mach_write_to_4(field_ref
@@ -4179,8 +4181,7 @@ btr_store_big_rec_extern_fields_func(
 				page_zip_write_blob_ptr(
 					page_zip, rec, index, offsets,
-					big_rec_vec->fields[i].field_no,
+					big_rec_vec->fields[i].field_no, &mtr);
-					alloc_mtr);
 next_zip_page:
 				prev_page_no = page_no;
@@ -4225,23 +4226,19 @@ next_zip_page:
 				extern_len -= store_len;
-				if (alloc_mtr == &mtr) {
+				rec_block = buf_page_get(space_id, zip_size,
-					rec_block = buf_page_get(
+							 rec_page_no,
-						space_id, zip_size,
+							 RW_X_LATCH, &mtr);
-						rec_page_no,
+				buf_block_dbg_add_level(rec_block,
-						RW_X_LATCH, &mtr);
+							SYNC_NO_ORDER_CHECK);
-					buf_block_dbg_add_level(
-						rec_block,
-						SYNC_NO_ORDER_CHECK);
-				}
 				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
-						 MLOG_4BYTES, alloc_mtr);
+						 MLOG_4BYTES, &mtr);
 				mlog_write_ulint(field_ref
 						 + BTR_EXTERN_LEN + 4,
 						 big_rec_vec->fields[i].len
 						 - extern_len,
-						 MLOG_4BYTES, alloc_mtr);
+						 MLOG_4BYTES, &mtr);
 				if (prev_page_no == FIL_NULL) {
 					btr_blob_dbg_add_blob(
@@ -4251,19 +4248,18 @@ next_zip_page:
 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_SPACE_ID,
-							 space_id, MLOG_4BYTES,
+							 space_id,
-							 alloc_mtr);
+							 MLOG_4BYTES, &mtr);
 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_PAGE_NO,
-							 page_no, MLOG_4BYTES,
+							 page_no,
-							 alloc_mtr);
+							 MLOG_4BYTES, &mtr);
 					mlog_write_ulint(field_ref
 							 + BTR_EXTERN_OFFSET,
 							 FIL_PAGE_DATA,
-							 MLOG_4BYTES,
+							 MLOG_4BYTES, &mtr);
-							 alloc_mtr);
 				}
 				prev_page_no = page_no;

--- a/storage/innodb_plugin/fsp/fsp0fsp.c
+++ b/storage/innodb_plugin/fsp/fsp0fsp.c
--- a/storage/innodb_plugin/include/btr0btr.h
+++ b/storage/innodb_plugin/include/btr0btr.h
@@ -557,12 +557,7 @@ btr_page_alloc(
 					page split is made */
 	ulint		level,		/*!< in: level where the page is placed
 					in the tree */
-	mtr_t*		mtr,		/*!< in/out: mini-transaction
+	mtr_t*		mtr);		/*!< in: mtr */
-					for the allocation */
-	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
-					for x-latching and initializing
-					the page */
-	__attribute__((nonnull, warn_unused_result));
 /**************************************************************//**
 Frees a file page used in an index tree. NOTE: cannot free field external
 storage pages because the page must contain info on its level. */
@@ -585,33 +580,6 @@ btr_page_free_low(
 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
 	ulint		level,	/*!< in: page level */
 	mtr_t*		mtr);	/*!< in: mtr */
-/**************************************************************//**
-Marks all MTR_MEMO_FREE_CLUST_LEAF pages nonfree or free.
-For invoking btr_store_big_rec_extern_fields() after an update,
-we must temporarily mark freed clustered index pages allocated, so
-that off-page columns will not be allocated from them. Between the
-btr_store_big_rec_extern_fields() and mtr_commit() we have to
-mark the pages free again, so that no pages will be leaked. */
-UNIV_INTERN
-void
-btr_mark_freed_leaves(
-/*==================*/
-	dict_index_t*	index,	/*!< in/out: clustered index */
-	mtr_t*		mtr,	/*!< in/out: mini-transaction */
-	ibool		nonfree)/*!< in: TRUE=mark nonfree, FALSE=mark freed */
-	__attribute__((nonnull));
-#ifdef UNIV_DEBUG
-/**************************************************************//**
-Validates all pages marked MTR_MEMO_FREE_CLUST_LEAF.
-@see btr_mark_freed_leaves()
-@return TRUE */
-UNIV_INTERN
-ibool
-btr_freed_leaves_validate(
-/*======================*/
-	mtr_t*	mtr)	/*!< in: mini-transaction */
-	__attribute__((nonnull, warn_unused_result));
-#endif /* UNIV_DEBUG */
 #ifdef UNIV_BTR_PRINT
 /*************************************************************//**
 Prints size info of a B-tree. */

--- a/storage/innodb_plugin/include/btr0cur.h
+++ b/storage/innodb_plugin/include/btr0cur.h
@@ -326,6 +326,16 @@ btr_cur_pessimistic_update(
 	que_thr_t*	thr,	/*!< in: query thread */
 	mtr_t*		mtr);	/*!< in: mtr; must be committed before
 				latching any further pages */
+/**************************************************************//**
+Commits and restarts a mini-transaction so that it will retain an
+x-lock on index->lock and the cursor page.
+The mini-transaction must hold both latches when this is called; the
+definition checks this with debug assertions. */
+UNIV_INTERN
+void
+btr_cur_mtr_commit_and_start(
+/*=========================*/
+	btr_cur_t*	cursor,	/*!< in: cursor; cursor->index and the
+				block of the cursor identify the latches
+				that are retained */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction; committed
+				and then started again */
+	__attribute__((nonnull));
 /***********************************************************//**
 Marks a clustered index record deleted. Writes an undo log record to
 undo log on this delete marking. Writes in the trx id field the id
@@ -530,8 +540,6 @@ btr_store_big_rec_extern_fields_func(
 					the "external storage" flags in offsets
 					will not correspond to rec when
 					this function returns */
-	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
-					to be stored externally */
 #ifdef UNIV_DEBUG
 	mtr_t*		local_mtr,	/*!< in: mtr containing the
 					latch to rec and to the tree */
@@ -540,12 +548,9 @@ btr_store_big_rec_extern_fields_func(
 	ibool		update_in_place,/*! in: TRUE if the record is updated
 					in place (not delete+insert) */
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
-	mtr_t*		alloc_mtr)	/*!< in/out: in an insert, NULL;
+	const big_rec_t*big_rec_vec)	/*!< in: vector containing fields
-					in an update, local_mtr for
+					to be stored externally */
-					allocating BLOB pages and
+	__attribute__((nonnull));
-					updating BLOB pointers; alloc_mtr
-					must not have freed any leaf pages */
-	__attribute__((nonnull(1,2,3,4,5), warn_unused_result));
 /** Stores the fields in big_rec_vec to the tablespace and puts pointers to
 them in rec.  The extern flags in rec will have to be set beforehand.
@@ -554,22 +559,21 @@ file segment of the index tree.
 @param index	in: clustered index; MUST be X-latched by mtr
 @param b	in/out: block containing rec; MUST be X-latched by mtr
 @param rec	in/out: clustered index record
-@param offs	in: rec_get_offsets(rec, index);
+@param offsets	in: rec_get_offsets(rec, index);
 		the "external storage" flags in offsets will not be adjusted
-@param big	in: vector containing fields to be stored externally
 @param mtr	in: mini-transaction that holds x-latch on index and b
 @param upd	in: TRUE if the record is updated in place (not delete+insert)
-@param rmtr	in/out: in updates, the mini-transaction that holds rec
+@param big	in: vector containing fields to be stored externally
 @return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
 #ifdef UNIV_DEBUG
-# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,mtr,upd,rmtr)
+	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,mtr,upd,big)
 #elif defined UNIV_BLOB_LIGHT_DEBUG
-# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,upd,rmtr)
+	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,upd,big)
 #else
-# define btr_store_big_rec_extern_fields(index,b,rec,offs,big,mtr,upd,rmtr) \
+# define btr_store_big_rec_extern_fields(index,b,rec,offsets,mtr,upd,big) \
-	btr_store_big_rec_extern_fields_func(index,b,rec,offs,big,rmtr)
+	btr_store_big_rec_extern_fields_func(index,b,rec,offsets,big)
 #endif
 /*******************************************************************//**

--- a/storage/innodb_plugin/include/fsp0fsp.h
+++ b/storage/innodb_plugin/include/fsp0fsp.h
@@ -176,18 +176,19 @@ fseg_n_reserved_pages(
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize
 file space fragmentation.
-@param[in/out] seg_header	segment header
+@return	the allocated page offset FIL_NULL if no page could be allocated */
-@param[in] hint			hint of which page would be desirable
+UNIV_INTERN
-@param[in] direction		if the new page is needed because
+ulint
+fseg_alloc_free_page(
+/*=================*/
+	fseg_header_t*	seg_header, /*!< in: segment header */
+	ulint		hint,	/*!< in: hint of which page would be desirable */
+	byte		direction, /*!< in: if the new page is needed because
 				of an index page split, and records are
 				inserted there in order, into which
 				direction they go alphabetically: FSP_DOWN,
-				FSP_UP, FSP_NO_DIR
+				FSP_UP, FSP_NO_DIR */
-@param[in/out] mtr		mini-transaction
+	mtr_t*		mtr);	/*!< in: mtr handle */
-@return	the allocated page offset FIL_NULL if no page could be allocated */
-#define fseg_alloc_free_page(seg_header, hint, direction, mtr)		\
-	fseg_alloc_free_page_general(seg_header, hint, direction,	\
-				     FALSE, mtr, mtr)
 /**********************************************************************//**
 Allocates a single free page from a segment. This function implements
 the intelligent allocation strategy which tries to minimize file space
@@ -209,11 +210,7 @@ fseg_alloc_free_page_general(
 				with fsp_reserve_free_extents, then there
 				is no need to do the check for this individual
 				page */
-	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-	mtr_t*		init_mtr)/*!< in/out: mtr or another mini-transaction
-				in which the page should be initialized,
-				or NULL if this is a "fake allocation" of
-				a page that was previously freed in mtr */
 	__attribute__((warn_unused_result, nonnull(1,5)));
 /**********************************************************************//**
 Reserves free pages from a tablespace. All mini-transactions which may

--- a/storage/innodb_plugin/include/mtr0mtr.h
+++ b/storage/innodb_plugin/include/mtr0mtr.h
@@ -53,8 +53,6 @@ first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
 #define MTR_MEMO_MODIFY		54
 #define	MTR_MEMO_S_LOCK		55
 #define	MTR_MEMO_X_LOCK		56
-/** The mini-transaction freed a clustered index leaf page. */
-#define MTR_MEMO_FREE_CLUST_LEAF	57
 /** @name Log item types
 The log items are declared 'byte' so that the compiler can warn if val
@@ -379,12 +377,9 @@ struct mtr_struct{
 #endif
 	dyn_array_t	memo;	/*!< memo stack for locks etc. */
 	dyn_array_t	log;	/*!< mini-transaction log */
-	unsigned	modifications:1;
+	ibool		modifications;
-				/*!< TRUE if the mini-transaction
+				/* TRUE if the mtr made modifications to
-				modified buffer pool pages */
+				buffer pool pages */
-	unsigned	freed_clust_leaf:1;
-				/*!< TRUE if MTR_MEMO_FREE_CLUST_LEAF
-				was logged in the mini-transaction */
 	ulint		n_log_recs;
 				/* count of how many page initial log records
 				have been written to the mtr log */

--- a/storage/innodb_plugin/include/mtr0mtr.ic
+++ b/storage/innodb_plugin/include/mtr0mtr.ic
@@ -44,7 +44,6 @@ mtr_start(
 	mtr->log_mode = MTR_LOG_ALL;
 	mtr->modifications = FALSE;
-	mtr->freed_clust_leaf = FALSE;
 	mtr->n_log_recs = 0;
 	ut_d(mtr->state = MTR_ACTIVE);
@@ -68,8 +67,7 @@ mtr_memo_push(
 	ut_ad(object);
 	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
-	ut_ad(type <= MTR_MEMO_FREE_CLUST_LEAF);
+	ut_ad(type <= MTR_MEMO_X_LOCK);
-	ut_ad(type != MTR_MEMO_FREE_CLUST_LEAF || mtr->freed_clust_leaf);
 	ut_ad(mtr);
 	ut_ad(mtr->magic_n == MTR_MAGIC_N);
 	ut_ad(mtr->state == MTR_ACTIVE);

--- a/storage/innodb_plugin/mtr/mtr0mtr.c
+++ b/storage/innodb_plugin/mtr/mtr0mtr.c
@@ -58,11 +58,12 @@ mtr_memo_slot_release(
 			buf_page_release((buf_block_t*)object, type, mtr);
 		} else if (type == MTR_MEMO_S_LOCK) {
 			rw_lock_s_unlock((rw_lock_t*)object);
+#ifdef UNIV_DEBUG
 		} else if (type != MTR_MEMO_X_LOCK) {
-			ut_ad(type == MTR_MEMO_MODIFY
+			ut_ad(type == MTR_MEMO_MODIFY);
-			      || type == MTR_MEMO_FREE_CLUST_LEAF);
 			ut_ad(mtr_memo_contains(mtr, object,
 						MTR_MEMO_PAGE_X_FIX));
+#endif /* UNIV_DEBUG */
 		} else {
 			rw_lock_x_unlock((rw_lock_t*)object);
 		}

--- a/storage/innodb_plugin/row/row0ins.c
+++ b/storage/innodb_plugin/row/row0ins.c
@@ -2097,20 +2097,15 @@ row_ins_index_entry_low(
 			if (big_rec) {
 				ut_a(err == DB_SUCCESS);
 				/* Write out the externally stored
-				columns, but allocate the pages and
+				columns while still x-latching
-				write the pointers using the
+				index->lock and block->lock. We have
-				mini-transaction of the record update.
+				to mtr_commit(mtr) first, so that the
-				If any pages were freed in the update,
+				redo log will be written in the
-				temporarily mark them allocated so
+				correct order. Otherwise, we would run
-				that off-page columns will not
+				into trouble on crash recovery if mtr
-				overwrite them. We must do this,
+				freed B-tree pages on which some of
-				because we will write the redo log for
+				the big_rec fields will be written. */
-				the BLOB writes before writing the
+				btr_cur_mtr_commit_and_start(&cursor, &mtr);
-				redo log for the record update. Thus,
-				redo log application at crash recovery
-				will see BLOBs being written to free pages. */
-				btr_mark_freed_leaves(index, &mtr, TRUE);
 				rec = btr_cur_get_rec(&cursor);
 				offsets = rec_get_offsets(
@@ -2119,8 +2114,7 @@ row_ins_index_entry_low(
 				err = btr_store_big_rec_extern_fields(
 					index, btr_cur_get_block(&cursor),
-					rec, offsets, big_rec, &mtr,
+					rec, offsets, &mtr, FALSE, big_rec);
-					FALSE, &mtr);
 				/* If writing big_rec fails (for
 				example, because of DB_OUT_OF_FILE_SPACE),
 				the record will be corrupted. Even if
@@ -2133,9 +2127,6 @@ row_ins_index_entry_low(
 				undo log, and thus the record cannot
 				be rolled back. */
 				ut_a(err == DB_SUCCESS);
-				/* Free the pages again
-				in order to avoid a leak. */
-				btr_mark_freed_leaves(index, &mtr, FALSE);
 				goto stored_big_rec;
 			}
 		} else {
@@ -2177,7 +2168,7 @@ function_exit:
 		err = btr_store_big_rec_extern_fields(
 			index, btr_cur_get_block(&cursor),
-			rec, offsets, big_rec, &mtr, FALSE, NULL);
+			rec, offsets, &mtr, FALSE, big_rec);
 stored_big_rec:
 		if (modify) {

--- a/storage/innodb_plugin/row/row0row.c
+++ b/storage/innodb_plugin/row/row0row.c
@@ -243,20 +243,19 @@ row_build(
 	}
 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
-	if (rec_offs_any_null_extern(rec, offsets)) {
+	/* This condition can occur during crash recovery before
-		/* This condition can occur during crash recovery
+	trx_rollback_active() has completed execution.
-		before trx_rollback_active() has completed execution.
+	This condition is possible if the server crashed
-		This condition is possible if the server crashed
+	during an insert or update before
-		during an insert or update-by-delete-and-insert before
+	btr_store_big_rec_extern_fields() did mtr_commit() all
-		btr_store_big_rec_extern_fields() did mtr_commit() all
+	BLOB pointers to the clustered index record.
-		BLOB pointers to the freshly inserted clustered index
-		record. */
+	If the record contains a null BLOB pointer, look up the
-		ut_a(trx_assert_recovered(
+	transaction that holds the implicit lock on this record, and
-			     row_get_rec_trx_id(rec, index, offsets)));
+	assert that it was recovered (and will soon be rolled back). */
-		ut_a(trx_undo_roll_ptr_is_insert(
+	ut_a(!rec_offs_any_null_extern(rec, offsets)
-			     row_get_rec_roll_ptr(rec, index, offsets)));
+	     || trx_assert_recovered(row_get_rec_trx_id(rec, index, offsets)));
-	}
 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
 	if (type != ROW_COPY_POINTERS) {

--- a/storage/innodb_plugin/row/row0upd.c
+++ b/storage/innodb_plugin/row/row0upd.c
@@ -1978,22 +1978,21 @@ row_upd_clust_rec(
 		rec_offs_init(offsets_);
 		ut_a(err == DB_SUCCESS);
-		/* Write out the externally stored columns, but
+		/* Write out the externally stored columns while still
-		allocate the pages and write the pointers using the
+		x-latching index->lock and block->lock. We have to
-		mini-transaction of the record update. If any pages
+		mtr_commit(mtr) first, so that the redo log will be
-		were freed in the update, temporarily mark them
+		written in the correct order. Otherwise, we would run
-		allocated so that off-page columns will not overwrite
+		into trouble on crash recovery if mtr freed B-tree
-		them. We must do this, because we write the redo log
+		pages on which some of the big_rec fields will be
-		for the BLOB writes before writing the redo log for
+		written. */
-		the record update. */
+		btr_cur_mtr_commit_and_start(btr_cur, mtr);
-		btr_mark_freed_leaves(index, mtr, TRUE);
 		rec = btr_cur_get_rec(btr_cur);
 		err = btr_store_big_rec_extern_fields(
 			index, btr_cur_get_block(btr_cur), rec,
 			rec_get_offsets(rec, index, offsets_,
 					ULINT_UNDEFINED, &heap),
-			big_rec, mtr, TRUE, mtr);
+			mtr, TRUE, big_rec);
 		/* If writing big_rec fails (for example, because of
 		DB_OUT_OF_FILE_SPACE), the record will be corrupted.
 		Even if we did not update any externally stored
@@ -2003,8 +2002,6 @@ row_upd_clust_rec(
 		to the undo log, and thus the record cannot be rolled
 		back. */
 		ut_a(err == DB_SUCCESS);
-		/* Free the pages again in order to avoid a leak. */
-		btr_mark_freed_leaves(index, mtr, FALSE);
 	}
 	mtr_commit(mtr);

--- a/storage/innodb_plugin/trx/trx0undo.c
+++ b/storage/innodb_plugin/trx/trx0undo.c
@@ -912,7 +912,7 @@ trx_undo_add_page(
 	page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR
 					       + TRX_UNDO_FSEG_HEADER,
 					       undo->top_page_no + 1, FSP_UP,
-					       TRUE, mtr, mtr);
+					       TRUE, mtr);
 	fil_space_release_free_extents(undo->space, n_reserved);