Many files:

Merge 3.23.52 innobase/btr/btr0btr.c: Merge 3.23.52 innobase/btr/btr0cur.c: Merge 3.23.52 innobase/btr/btr0sea.c: Merge 3.23.52 innobase/include/btr0btr.h: Merge 3.23.52 innobase/include/btr0cur.h: Merge 3.23.52 innobase/include/btr0sea.h: Merge 3.23.52 innobase/include/buf0buf.h: Merge 3.23.52 innobase/include/buf0rea.h: Merge 3.23.52 innobase/include/data0data.h: Merge 3.23.52 innobase/include/data0data.ic: Merge 3.23.52 innobase/include/log0log.h: Merge 3.23.52 innobase/include/log0log.ic: Merge 3.23.52 innobase/include/os0file.h: Merge 3.23.52 innobase/include/page0page.h: Merge 3.23.52 innobase/include/page0page.ic: Merge 3.23.52 innobase/include/row0mysql.h: Merge 3.23.52 innobase/include/trx0roll.h: Merge 3.23.52 innobase/include/trx0sys.h: Merge 3.23.52 innobase/include/trx0trx.h: Merge 3.23.52 innobase/include/ut0ut.h: Merge 3.23.52 innobase/include/univ.i: Merge 3.23.52 innobase/include/ut0ut.ic: Merge 3.23.52 innobase/buf/buf0buf.c: Merge 3.23.52 innobase/buf/buf0rea.c: Merge 3.23.52 innobase/data/data0data.c: Merge 3.23.52 innobase/dict/dict0crea.c: Merge 3.23.52 innobase/dict/dict0dict.c: Merge 3.23.52 innobase/dict/dict0load.c: Merge 3.23.52 innobase/dict/dict0mem.c: Merge 3.23.52 innobase/fsp/fsp0fsp.c: Merge 3.23.52 innobase/ibuf/ibuf0ibuf.c: Merge 3.23.52 innobase/lock/lock0lock.c: Merge 3.23.52 innobase/log/log0log.c: Merge 3.23.52 innobase/log/log0recv.c: Merge 3.23.52 innobase/mtr/mtr0log.c: Merge 3.23.52 innobase/mtr/mtr0mtr.c: Merge 3.23.52 innobase/os/os0file.c: Merge 3.23.52 innobase/page/page0cur.c: Merge 3.23.52 innobase/page/page0page.c: Merge 3.23.52 innobase/rem/rem0cmp.c: Merge 3.23.52 innobase/row/row0ins.c: Merge 3.23.52 innobase/row/row0mysql.c: Merge 3.23.52 innobase/row/row0purge.c: Merge 3.23.52 innobase/row/row0upd.c: Merge 3.23.52 innobase/srv/srv0srv.c: Merge 3.23.52 innobase/srv/srv0start.c: Merge 3.23.52 innobase/trx/trx0roll.c: Merge 3.23.52 innobase/trx/trx0sys.c: Merge 3.23.52 innobase/trx/trx0trx.c: Merge 3.23.52 
innobase/trx/trx0undo.c: Merge 3.23.52 innobase/ut/ut0mem.c: Merge 3.23.52 innobase/ut/ut0ut.c: Merge 3.23.52

Many files:
Merge 3.23.52 innobase/btr/btr0btr.c: Merge 3.23.52 innobase/btr/btr0cur.c: Merge 3.23.52 innobase/btr/btr0sea.c: Merge 3.23.52 innobase/include/btr0btr.h: Merge 3.23.52 innobase/include/btr0cur.h: Merge 3.23.52 innobase/include/btr0sea.h: Merge 3.23.52 innobase/include/buf0buf.h: Merge 3.23.52 innobase/include/buf0rea.h: Merge 3.23.52 innobase/include/data0data.h: Merge 3.23.52 innobase/include/data0data.ic: Merge 3.23.52 innobase/include/log0log.h: Merge 3.23.52 innobase/include/log0log.ic: Merge 3.23.52 innobase/include/os0file.h: Merge 3.23.52 innobase/include/page0page.h: Merge 3.23.52 innobase/include/page0page.ic: Merge 3.23.52 innobase/include/row0mysql.h: Merge 3.23.52 innobase/include/trx0roll.h: Merge 3.23.52 innobase/include/trx0sys.h: Merge 3.23.52 innobase/include/trx0trx.h: Merge 3.23.52 innobase/include/ut0ut.h: Merge 3.23.52 innobase/include/univ.i: Merge 3.23.52 innobase/include/ut0ut.ic: Merge 3.23.52 innobase/buf/buf0buf.c: Merge 3.23.52 innobase/buf/buf0rea.c: Merge 3.23.52 innobase/data/data0data.c: Merge 3.23.52 innobase/dict/dict0crea.c: Merge 3.23.52 innobase/dict/dict0dict.c: Merge 3.23.52 innobase/dict/dict0load.c: Merge 3.23.52 innobase/dict/dict0mem.c: Merge 3.23.52 innobase/fsp/fsp0fsp.c: Merge 3.23.52 innobase/ibuf/ibuf0ibuf.c: Merge 3.23.52 innobase/lock/lock0lock.c: Merge 3.23.52 innobase/log/log0log.c: Merge 3.23.52 innobase/log/log0recv.c: Merge 3.23.52 innobase/mtr/mtr0log.c: Merge 3.23.52 innobase/mtr/mtr0mtr.c: Merge 3.23.52 innobase/os/os0file.c: Merge 3.23.52 innobase/page/page0cur.c: Merge 3.23.52 innobase/page/page0page.c: Merge 3.23.52 innobase/rem/rem0cmp.c: Merge 3.23.52 innobase/row/row0ins.c: Merge 3.23.52 innobase/row/row0mysql.c: Merge 3.23.52 innobase/row/row0purge.c: Merge 3.23.52 innobase/row/row0upd.c: Merge 3.23.52 innobase/srv/srv0srv.c: Merge 3.23.52 innobase/srv/srv0start.c: Merge 3.23.52 innobase/trx/trx0roll.c: Merge 3.23.52 innobase/trx/trx0sys.c: Merge 3.23.52 innobase/trx/trx0trx.c: Merge 3.23.52 
innobase/trx/trx0undo.c: Merge 3.23.52 innobase/ut/ut0mem.c: Merge 3.23.52 innobase/ut/ut0ut.c: Merge 3.23.52
1081513a · unknown · b7b988b3 · 1081513a · 1081513a · 1081513a
Commit 1081513a authored Jun 22, 2002 by unknown
52 changed files
--- a/innobase/btr/btr0btr.c
+++ b/innobase/btr/btr0btr.c
@@ -572,6 +572,13 @@ btr_page_get_father_for_rec(

 	if (btr_node_ptr_get_child_page_no(node_ptr) !=
                                                buf_frame_get_page_no(page)) {
+		fprintf(stderr,
+"InnoDB: Dump of the child page:\n");
+		buf_page_print(buf_frame_align(page));
+		fprintf(stderr,
+"InnoDB: Dump of the parent page:\n");
+		buf_page_print(buf_frame_align(node_ptr));
+
      		fprintf(stderr,
 "InnoDB: Corruption of an index tree: table %s, index %s,\n"
 "InnoDB: father ptr page no %lu, child page no %lu\n",
@@ -581,6 +588,12 @@ btr_page_get_father_for_rec(
                    buf_frame_get_page_no(page));
     		page_rec_print(page_rec_get_next(page_get_infimum_rec(page)));
     		page_rec_print(node_ptr);
+
+      		fprintf(stderr,
+"InnoDB: You should dump + drop + reimport the table to fix the\n"
+"InnoDB: corruption. If the crash happens at the database startup, see\n"
+"InnoDB: section 6.1 of http://www.innodb.com/ibman.html about forcing\n"
+"InnoDB: recovery. Then dump + drop + reimport.\n");
 	}

 	ut_a(btr_node_ptr_get_child_page_no(node_ptr) ==
@@ -780,12 +793,14 @@ top_loop:

 /*****************************************************************
 Reorganizes an index page. */
-
+static
 void
 btr_page_reorganize_low(
 /*====================*/
-	ibool	low,	/* in: TRUE if locks should not be updated, i.e.,
-			there cannot exist locks on the page */
+	ibool	recovery,/* in: TRUE if called in recovery: locks should not
+			be updated, i.e., there cannot exist locks on the
+			page, and a hash index should not be dropped: it
+			cannot exist */
 	page_t*	page,	/* in: page to be reorganized */
 	mtr_t*	mtr)	/* in: mtr */
 {
@@ -805,7 +820,9 @@ btr_page_reorganize_low(
 	/* Copy the old page to temporary space */
 	buf_frame_copy(new_page, page);

-	btr_search_drop_page_hash_index(page);
+	if (!recovery) {
+		btr_search_drop_page_hash_index(page);
+	}

 	/* Recreate the page: note that global data on page (possible
 	segment headers, next page-field, etc.) is preserved intact */
@@ -820,7 +837,7 @@ btr_page_reorganize_low(
 	/* Copy max trx id to recreated page */
 	page_set_max_trx_id(page, page_get_max_trx_id(new_page));
 	
-	if (!low) {
+	if (!recovery) {
 		/* Update the record lock bitmaps */
 		lock_move_reorganize_page(page, new_page);
 	}

--- a/innobase/btr/btr0cur.c
+++ b/innobase/btr/btr0cur.c
@@ -36,9 +36,14 @@ Created 10/16/1994 Heikki Tuuri
 #include "ibuf0ibuf.h"
 #include "lock0lock.h"

+/* If the following is set to TRUE, this module prints a lot of
+trace information of individual record operations */
+ibool	btr_cur_print_record_ops = FALSE;
+
 ulint	btr_cur_rnd	= 0;

 ulint	btr_cur_n_non_sea	= 0;
+ulint	btr_cur_n_sea		= 0;

 /* In the optimistic insert, if the insert does not fit, but this much space
 can be released by page reorganize, then it is reorganized */
@@ -187,11 +192,7 @@ btr_cur_search_to_nth_level(
 				tuple must be set so that it cannot get
 				compared to the node ptr page number field! */
 	ulint		mode,	/* in: PAGE_CUR_L, ...;
-				NOTE that if the search is made using a unique
-				prefix of a record, mode should be
-				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
-				may end up on the previous page relative to the
-				record! Inserts should always be made using
+				Inserts should always be made using
 				PAGE_CUR_LE to search the position! */
 	ulint		latch_mode, /* in: BTR_SEARCH_LEAF, ..., ORed with
 				BTR_INSERT and BTR_ESTIMATE;
@@ -268,7 +269,7 @@ btr_cur_search_to_nth_level(
 #ifdef UNIV_SEARCH_PERF_STAT
 	info->n_searches++;
 #endif	
-	if (btr_search_latch.writer != RW_LOCK_NOT_LOCKED
+	if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
 		&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
 		&& !estimate
 	        && btr_search_guess_on_hash(index, info, tuple, mode,
@@ -283,14 +284,14 @@ btr_cur_search_to_nth_level(
 					|| mode != PAGE_CUR_LE);
 		ut_ad(cursor->low_match != ULINT_UNDEFINED
 					|| mode != PAGE_CUR_LE);
+		btr_cur_n_sea++;
+
 	        return;
 	}
 #endif
 #endif
-
-#ifdef UNIV_SEARCH_PERF_STAT
 	btr_cur_n_non_sea++;
-#endif
+
 	/* If the hash search did not succeed, do binary search down the
 	tree */

@@ -796,15 +797,28 @@ btr_cur_optimistic_insert(
 	ulint		data_size;
 	ulint		extra_size;
 	ulint		type;
-	ulint		err;
-	
-	ut_ad(dtuple_check_typed(entry));
+	ulint		err;	

 	*big_rec = NULL;

 	page = btr_cur_get_page(cursor);
 	index = cursor->index;

+	if (!dtuple_check_typed_no_assert(entry)) {	/* report only, no abort */
+		fprintf(stderr,
+"InnoDB: Error in a tuple to insert into table %s index %s\n",
+					index->table_name, index->name);
+	}
+	
+	if (btr_cur_print_record_ops && thr) {
+		printf(
+	"Trx with id %lu %lu going to insert to table %s index %s\n",
+		ut_dulint_get_high(thr_get_trx(thr)->id),
+		ut_dulint_get_low(thr_get_trx(thr)->id),
+		index->table_name, index->name);
+		dtuple_print(entry);
+	}
+	
 	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
 							MTR_MEMO_PAGE_X_FIX));
 	max_size = page_get_max_insert_size_after_reorganize(page, 1);
@@ -928,7 +942,7 @@ calculate_sizes_again:
 			buf_frame_get_page_no(page), max_size,
 					rec_size + PAGE_DIR_SLOT_SIZE, type);
 */	
-	if (!(type & (DICT_CLUSTERED | DICT_UNIQUE))) {
+	if (!(type & DICT_CLUSTERED)) {
 		/* We have added a record to page: update its free bits */
 		ibuf_update_free_bits_if_full(cursor->index, page, max_size,
 					rec_size + PAGE_DIR_SLOT_SIZE);
@@ -1258,6 +1272,15 @@ btr_cur_update_sec_rec_in_place(

 	rec = btr_cur_get_rec(cursor);
 	
+	if (btr_cur_print_record_ops && thr) {
+		printf(
+	"Trx with id %lu %lu going to update table %s index %s\n",
+		ut_dulint_get_high(thr_get_trx(thr)->id),
+		ut_dulint_get_low(thr_get_trx(thr)->id),
+		index->table_name, index->name);
+		rec_print(rec);
+	}
+
 	err = lock_sec_rec_modify_check_and_lock(0, rec, index, thr);

 	if (err != DB_SUCCESS) {
@@ -1312,6 +1335,15 @@ btr_cur_update_in_place(
 	index = cursor->index;
 	trx = thr_get_trx(thr);
 	
+	if (btr_cur_print_record_ops && thr) {
+		printf(
+	"Trx with id %lu %lu going to update table %s index %s\n",
+		ut_dulint_get_high(thr_get_trx(thr)->id),
+		ut_dulint_get_low(thr_get_trx(thr)->id),
+		index->table_name, index->name);
+		rec_print(rec);
+	}
+
 	/* Do lock checking and undo logging */
 	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
 							thr, &roll_ptr);
@@ -1398,6 +1430,15 @@ btr_cur_optimistic_update(
 	rec = btr_cur_get_rec(cursor);
 	index = cursor->index;
 	
+	if (btr_cur_print_record_ops && thr) {
+		printf(
+	"Trx with id %lu %lu going to update table %s index %s\n",
+		ut_dulint_get_high(thr_get_trx(thr)->id),
+		ut_dulint_get_low(thr_get_trx(thr)->id),
+		index->table_name, index->name);
+		rec_print(rec);
+	}
+
 	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
 							MTR_MEMO_PAGE_X_FIX));
 	if (!row_upd_changes_field_size(rec, index, update)) {
@@ -1973,6 +2014,15 @@ btr_cur_del_mark_set_clust_rec(
 	rec = btr_cur_get_rec(cursor);
 	index = cursor->index;
 	
+	if (btr_cur_print_record_ops && thr) {
+		printf(
+	"Trx with id %lu %lu going to del mark table %s index %s\n",
+		ut_dulint_get_high(thr_get_trx(thr)->id),
+		ut_dulint_get_low(thr_get_trx(thr)->id),
+		index->table_name, index->name);
+		rec_print(rec);
+	}
+
 	ut_ad(index->type & DICT_CLUSTERED);
 	ut_ad(rec_get_deleted_flag(rec) == FALSE);

@@ -2102,6 +2152,15 @@ btr_cur_del_mark_set_sec_rec(

 	rec = btr_cur_get_rec(cursor);

+	if (btr_cur_print_record_ops && thr) {
+		printf(
+	"Trx with id %lu %lu going to del mark table %s index %s\n",
+		ut_dulint_get_high(thr_get_trx(thr)->id),
+		ut_dulint_get_low(thr_get_trx(thr)->id),
+		cursor->index->table_name, cursor->index->name);
+		rec_print(rec);
+	}
+
 	err = lock_sec_rec_modify_check_and_lock(flags, rec, cursor->index,
 									thr);
 	if (err != DB_SUCCESS) {

--- a/innobase/btr/btr0sea.c
+++ b/innobase/btr/btr0sea.c
@@ -15,6 +15,7 @@ Created 2/17/1996 Heikki Tuuri
 #include "page0page.h"
 #include "page0cur.h"
 #include "btr0cur.h"
+#include "btr0pcur.h"
 #include "btr0btr.h"

 ulint	btr_search_n_succ	= 0;
@@ -145,6 +146,8 @@ btr_search_info_create(

 	info = mem_heap_alloc(heap, sizeof(btr_search_t));

+	info->magic_n = BTR_SEARCH_MAGIC_N;
+
 	info->last_search = NULL;
 	info->n_direction = 0;
 	info->root_guess = NULL;
@@ -159,6 +162,12 @@ btr_search_info_create(
 	info->n_patt_succ = 0;	
 	info->n_searches = 0;	

+	/* Set some sensible values */
+	info->n_fields = 1;
+	info->n_bytes = 0;
+
+	info->side = BTR_SEARCH_LEFT_SIDE;
+
 	return(info);
 }

@@ -197,7 +206,7 @@ btr_search_info_update_hash(
 	/* Test if the search would have succeeded using the recommended
 	hash prefix */

-	if ((info->n_fields >= n_unique) && (cursor->up_match >= n_unique)) {
+	if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
 			
 		info->n_hash_potential++;

@@ -207,8 +216,8 @@ btr_search_info_update_hash(
 	cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
 					cursor->low_match, cursor->low_bytes);

-	if (((info->side == BTR_SEARCH_LEFT_SIDE) && (cmp <= 0))
-		|| ((info->side == BTR_SEARCH_RIGHT_SIDE) && (cmp > 0))) {
+	if ((info->side == BTR_SEARCH_LEFT_SIDE && cmp <= 0)
+		|| (info->side == BTR_SEARCH_RIGHT_SIDE && cmp > 0)) {

 		goto set_new_recomm;
 	}
@@ -216,8 +225,8 @@ btr_search_info_update_hash(
 	cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
 					cursor->up_match, cursor->up_bytes);

-	if (((info->side == BTR_SEARCH_LEFT_SIDE) && (cmp > 0))
-		|| ((info->side == BTR_SEARCH_RIGHT_SIDE) && (cmp <= 0))) {
+	if ((info->side == BTR_SEARCH_LEFT_SIDE && cmp > 0)
+		|| (info->side == BTR_SEARCH_RIGHT_SIDE && cmp <= 0)) {

 	    	goto set_new_recomm;
 	}
@@ -233,19 +242,18 @@ set_new_recomm:
 	
 	info->hash_analysis = 0;
 	
-	if ((cursor->up_match >= n_unique)
-					|| (cursor->low_match >= n_unique)) {
-		info->n_fields = n_unique;
-		info->n_bytes = 0;
-
-		info->side = BTR_SEARCH_LEFT_SIDE;
-	}
-
 	cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
 					cursor->low_match, cursor->low_bytes);
 	if (cmp == 0) {
 		info->n_hash_potential = 0;

+		/* For extra safety, we set some sensible values here */
+
+		info->n_fields = 1;
+		info->n_bytes = 0;
+
+		info->side = BTR_SEARCH_LEFT_SIDE;
+
 	} else if (cmp > 0) {
 		info->n_hash_potential = 1;

@@ -305,6 +313,9 @@ btr_search_update_block_hash_info(

 	info->last_hash_succ = FALSE;

+	ut_a(block->magic_n == BUF_BLOCK_MAGIC_N);
+	ut_a(info->magic_n == BTR_SEARCH_MAGIC_N);
+
 	if ((block->n_hash_helps > 0)
 	    && (info->n_hash_potential > 0)
 	    && (block->n_fields == info->n_fields)
@@ -622,6 +633,7 @@ btr_search_guess_on_hash(
 	dulint		tree_id;
 #ifdef notdefined
 	btr_cur_t	cursor2;
+	btr_pcur_t	pcur;
 #endif
 	ut_ad(index && info && tuple && cursor && mtr);
 	ut_ad((latch_mode == BTR_SEARCH_LEAF)
@@ -754,7 +766,26 @@ btr_search_guess_on_hash(

 	btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
 							&cursor2, 0, mtr);
-	ut_a(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
+	if (mode == PAGE_CUR_GE
+		&& btr_cur_get_rec(&cursor2) == page_get_supremum_rec(
+			buf_frame_align(btr_cur_get_rec(&cursor2)))) {
+
+		/* If mode is PAGE_CUR_GE, then the binary search
+		in the index tree may actually take us to the supremum
+		of the previous page */
+					
+		info->last_hash_succ = FALSE;
+
+		btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode,
+				&pcur, mtr);
+		ut_a(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor));
+	} else {
+		ut_a(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
+	}
+
+	/* NOTE that it is theoretically possible that the above assertions
+	fail if the page of the cursor gets removed from the buffer pool
+	meanwhile! Thus it might not be a bug. */

 	info->last_hash_succ = TRUE;
 #endif
@@ -835,6 +866,8 @@ btr_search_drop_page_hash_index(
 	n_fields = block->curr_n_fields;
 	n_bytes = block->curr_n_bytes;

+	ut_a(n_fields + n_bytes > 0);
+
 	rw_lock_s_unlock(&btr_search_latch);
 	
 	n_recs = page_get_n_recs(page);
@@ -851,6 +884,14 @@ btr_search_drop_page_hash_index(
 	rec = page_get_infimum_rec(page);
 	rec = page_rec_get_next(rec);

+	if (rec != sup) {
+		ut_a(n_fields <= rec_get_n_fields(rec));
+
+		if (n_bytes > 0) {
+			ut_a(n_fields < rec_get_n_fields(rec));
+		}
+	}
+
 	tree_id = btr_page_get_index_id(page);
 	
 	prev_fold = 0;
@@ -980,6 +1021,8 @@ btr_search_build_page_hash_index(
 		return;
 	}

+	ut_a(n_fields + n_bytes > 0);
+
 	/* Calculate and cache fold values and corresponding records into
 	an array for fast insertion to the hash index */

@@ -995,6 +1038,14 @@ btr_search_build_page_hash_index(
 	rec = page_get_infimum_rec(page);
 	rec = page_rec_get_next(rec);

+	if (rec != sup) {
+		ut_a(n_fields <= rec_get_n_fields(rec));
+
+		if (n_bytes > 0) {
+			ut_a(n_fields < rec_get_n_fields(rec));
+		}
+	}
+
 	/* FIXME: in a mixed tree, all records may not have enough ordering
 	fields: */
 	

--- a/innobase/buf/buf0buf.c
+++ b/innobase/buf/buf0buf.c
@@ -1125,13 +1125,51 @@ buf_page_get_known_nowait(
 	return(TRUE);
 }

+/************************************************************************
+Initializes a buffer control block as a file page; for ibbackup --restore. */
+
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+	ulint		space,	/* in: space id */
+	ulint		offset,	/* in: offset of the page within space
+				in units of a page */
+	buf_block_t*	block)	/* in/out: control block to initialize */
+{
+	/* Set the state of the block */
+	block->magic_n		= BUF_BLOCK_MAGIC_N;	/* for sanity checks */
+
+	block->state 		= BUF_BLOCK_FILE_PAGE;
+	block->space 		= space;
+	block->offset 		= offset;
+
+	block->lock_hash_val	= 0;
+	block->lock_mutex	= NULL;
+	
+	block->freed_page_clock = 0;
+
+	block->newest_modification = ut_dulint_zero;
+	block->oldest_modification = ut_dulint_zero;
+	
+	block->accessed		= FALSE;
+	block->buf_fix_count 	= 0;
+	block->io_fix		= 0;
+	/* Hash-related fields: the page is not hashed initially */
+	block->n_hash_helps	= 0;
+	block->is_hashed	= FALSE;
+	block->n_fields         = 1;
+	block->n_bytes          = 0;
+	block->side             = BTR_SEARCH_LEFT_SIDE;
+
+	block->file_page_was_freed = FALSE;
+}
+
 /************************************************************************
 Inits a page to the buffer buf_pool. */
 static
 void
 buf_page_init(
 /*==========*/
-				/* out: pointer to the block */
 	ulint		space,	/* in: space id */
 	ulint		offset,	/* in: offset of the page within space
 				in units of a page */
@@ -1141,6 +1179,8 @@ buf_page_init(
 	ut_ad(block->state == BUF_BLOCK_READY_FOR_USE);

 	/* Set the state of the block */
+	block->magic_n		= BUF_BLOCK_MAGIC_N;
+
 	block->state 		= BUF_BLOCK_FILE_PAGE;
 	block->space 		= space;
 	block->offset 		= offset;

--- a/innobase/buf/buf0rea.c
+++ b/innobase/buf/buf0rea.c
@@ -100,6 +100,11 @@ buf_read_page_low(
 	block = buf_page_init_for_read(mode, space, offset);

 	if (block != NULL) {
+		if (buf_debug_prints) {
+			printf("Posting read request for page %lu, sync %lu\n",
+				offset, sync);
+		}
+
 		fil_io(OS_FILE_READ | wake_later,
 			sync, space, offset, 0, UNIV_PAGE_SIZE,
 					(void*)block->frame, (void*)block);
@@ -467,6 +472,12 @@ buf_read_ahead_linear(

 	count = 0;

+	/* Since Windows XP seems to schedule the i/o handler thread
+	very eagerly, and consequently it does not wait for the
+	full read batch to be posted, we use special heuristics here */
+
+	os_aio_simulated_put_read_threads_to_sleep();
+	
 	for (i = low; i < high; i++) {
 		/* It is only sensible to do read-ahead in the non-sync
 		aio mode: hence FALSE as the first parameter */
@@ -556,16 +567,34 @@ buf_read_recv_pages(
 				highest page number the last in the array */
 	ulint	n_stored)	/* in: number of page numbers in the array */
 {
+	ulint	count;
 	ulint	i;

 	for (i = 0; i < n_stored; i++) {

+		count = 0;
+
+		os_aio_print_debug = FALSE;
+
 		while (buf_pool->n_pend_reads >= RECV_POOL_N_FREE_BLOCKS / 2) {

 			os_aio_simulated_wake_handler_threads();
 			os_thread_sleep(500000);
+
+			count++;
+
+			if (count > 100) {
+				fprintf(stderr,
+"InnoDB: Error: InnoDB has waited for 50 seconds for pending\n"
+"InnoDB: reads to the buffer pool to be finished.\n"
+"InnoDB: Number of pending reads %lu\n", buf_pool->n_pend_reads);
+
+				os_aio_print_debug = TRUE;
+			}
 		}

+		os_aio_print_debug = FALSE;
+
 		if ((i + 1 == n_stored) && sync) {
 			buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space,
 								page_nos[i]);

--- a/innobase/data/data0data.c
+++ b/innobase/data/data0data.c
@@ -64,6 +64,35 @@ dtuple_get_nth_field_noninline(
 	return(dtuple_get_nth_field(tuple, n));
 }

+/*************************************************************************
+Tests if dfield data length and content are binary equal to the given ones. */
+
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+				/* out: TRUE if equal */
+	dfield_t*	field,	/* in: field */
+	ulint		len,	/* in: data length or UNIV_SQL_NULL */
+	byte*		data)	/* in: data */
+{
+	if (len != field->len) {
+		/* lengths differ, or only one of them is UNIV_SQL_NULL */
+		return(FALSE);
+	}
+
+	if (len == UNIV_SQL_NULL) {
+		/* both lengths are UNIV_SQL_NULL: data is not compared */
+		return(TRUE);
+	}
+
+	if (0 != ut_memcmp(field->data, data, len)) {
+		/* ut_memcmp found a difference within the first len bytes */
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
 /****************************************************************
 Returns TRUE if lengths of two dtuples are equal and respective data fields
 in them are equal when compared with collation in char fields (not as binary
@@ -153,6 +182,69 @@ dtuple_set_n_fields(
 	tuple->n_fields_cmp = n_fields;
 }

+/**************************************************************
+Checks that a data field is typed; prints an error instead of asserting. */
+static
+ibool
+dfield_check_typed_no_assert(
+/*=========================*/
+				/* out: TRUE if ok, FALSE if mtype is bad */
+	dfield_t*	field)	/* in: data field */
+{
+	if (dfield_get_type(field)->mtype > DATA_MYSQL
+	    || dfield_get_type(field)->mtype < DATA_VARCHAR) {
+		/* mtype is outside the range DATA_VARCHAR ... DATA_MYSQL */
+		fprintf(stderr,
+"InnoDB: Error: data field type %lu, len %lu\n",
+			dfield_get_type(field)->mtype, dfield_get_len(field));
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/**************************************************************
+Checks that a data tuple is typed; prints the tuple to stderr if it is not. */
+
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+				/* out: TRUE if ok */
+	dtuple_t*	tuple)	/* in: tuple */
+{
+	dfield_t*	field;
+	ulint	 	i;
+	char		err_buf[1000];	/* receives dtuple_sprintf() output */
+	
+	if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) {
+		fprintf(stderr,
+"InnoDB: Error: index entry has %lu fields\n",
+			dtuple_get_n_fields(tuple));
+
+		dtuple_sprintf(err_buf, 900, tuple);
+		fprintf(stderr,
+"InnoDB: Tuple contents: %s\n", err_buf);	
+
+		return(FALSE);
+	}
+
+	for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+		/* Stop at the first field that fails the type check */
+		field = dtuple_get_nth_field(tuple, i);
+
+		if (!dfield_check_typed_no_assert(field)) {
+			/* Print the whole tuple, not just the bad field */
+			dtuple_sprintf(err_buf, 900, tuple);
+			fprintf(stderr,
+"InnoDB: Tuple contents: %s\n", err_buf);	
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
 /**************************************************************
 Checks that a data field is typed. Asserts an error if not. */

@@ -162,8 +254,15 @@ dfield_check_typed(
 				/* out: TRUE if ok */
 	dfield_t*	field)	/* in: data field */
 {
-	ut_a(dfield_get_type(field)->mtype <= DATA_MYSQL);
-	ut_a(dfield_get_type(field)->mtype >= DATA_VARCHAR);
+	if (dfield_get_type(field)->mtype > DATA_MYSQL
+	    || dfield_get_type(field)->mtype < DATA_VARCHAR) {
+
+		fprintf(stderr,
+"InnoDB: Error: data field type %lu, len %lu\n",
+			dfield_get_type(field)->mtype, dfield_get_len(field));
+
+		ut_a(0);
+	}

 	return(TRUE);
 }
@@ -460,9 +559,21 @@ dtuple_convert_big_rec(
 	ibool		is_externally_stored;
 	ulint		i;
 	ulint		j;
+	char		err_buf[1000];
 	
+	ut_a(dtuple_check_typed_no_assert(entry));
+
 	size = rec_get_converted_size(entry);

+	if (size > 1000000000) {
+		fprintf(stderr,
+"InnoDB: Warning: tuple size very big: %lu\n", size);
+		
+		dtuple_sprintf(err_buf, 900, entry);
+		fprintf(stderr,
+"InnoDB: Tuple contents: %s\n", err_buf);
+	}
+
 	heap = mem_heap_create(size + dtuple_get_n_fields(entry)
 					* sizeof(big_rec_field_t) + 1000);


--- a/innobase/dict/dict0crea.c
+++ b/innobase/dict/dict0crea.c
@@ -153,6 +153,7 @@ dict_create_sys_tables_tuple(
 	if (table->type == DICT_TABLE_CLUSTER_MEMBER) {
 		dfield_set_data(dfield, table->cluster_name,
 				ut_strlen(table->cluster_name));
+		ut_a(0); /* Oracle-style clusters are not supported yet */
 	} else {
 		dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
 	}

--- a/innobase/dict/dict0dict.c
+++ b/innobase/dict/dict0dict.c
@@ -2805,6 +2805,12 @@ dict_update_statistics_low(

 	index = dict_table_get_first_index(table);	

+	if (index == NULL) {
+		/* Table definition is corrupt */
+	
+		return;
+	}
+
 	while (index) {
 		size = btr_get_size(index, BTR_TOTAL_SIZE);

@@ -3196,6 +3202,14 @@ dict_print_info_on_foreign_keys(

 		buf2 += sprintf(buf2, ")");

+		if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) {
+			buf2 += sprintf(buf2, " ON DELETE CASCADE");
+		}
+	
+		if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) {
+			buf2 += sprintf(buf2, " ON DELETE SET NULL");
+		}
+
 		foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
 	}


--- a/innobase/dict/dict0load.c
+++ b/innobase/dict/dict0load.c
--- a/innobase/dict/dict0mem.c
+++ b/innobase/dict/dict0mem.c
@@ -65,6 +65,9 @@ dict_mem_table_create(
 		
 	table->cached = FALSE;
 	
+	table->mix_id = ut_dulint_zero;
+	table->mix_len = 0;
+	
 	table->cols = mem_heap_alloc(heap, (n_cols + DATA_N_SYS_COLS)
 							* sizeof(dict_col_t));
 	UT_LIST_INIT(table->indexes);

--- a/innobase/fsp/fsp0fsp.c
+++ b/innobase/fsp/fsp0fsp.c
@@ -2608,6 +2608,7 @@ fseg_free_page_low(
 	ulint	not_full_n_used;
 	ulint	state;
 	ulint	i;
+	char	errbuf[200];
 	
 	ut_ad(seg_inode && mtr);
 	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) ==
@@ -2621,8 +2622,25 @@ fseg_free_page_low(
 	descr = xdes_get_descriptor(space, page, mtr);

 	ut_a(descr);
-	ut_a(xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)
-								== FALSE);
+	if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)
+							!= FALSE) {	/* already free */
+		ut_sprintf_buf(errbuf, descr, 40);
+		fprintf(stderr,
+"InnoDB: Dump of the tablespace extent descriptor: %s\n", errbuf);
+
+		fprintf(stderr,
+"InnoDB: Serious error! InnoDB is trying to free page %lu\n"
+"InnoDB: though it is already marked as free in the tablespace!\n"
+"InnoDB: The tablespace free space info is corrupt.\n"
+"InnoDB: You may need to dump your InnoDB tables and recreate the whole\n"
+"InnoDB: database!\n", page);
+
+		fprintf(stderr,
+"InnoDB: If the InnoDB recovery crashes here, see section 6.1\n"
+"InnoDB: of http://www.innodb.com/ibman.html about forcing recovery.\n");
+		ut_a(0);	/* always-failing assertion: stop after reporting */
+	}
+		
 	state = xdes_get_state(descr, mtr);

 	if (state != XDES_FSEG) {

--- a/innobase/ibuf/ibuf0ibuf.c
+++ b/innobase/ibuf/ibuf0ibuf.c
@@ -685,21 +685,21 @@ ibuf_bitmap_get_map_page(
 /****************************************************************************
 Sets the free bits of the page in the ibuf bitmap. This is done in a separate
 mini-transaction, hence this operation does not restrict further work to only
-ibuf bitmap operations, which would result if the latch to the bitmap pag
+ibuf bitmap operations, which would result if the latch to the bitmap page
 were kept. */
 UNIV_INLINE
 void
 ibuf_set_free_bits_low(
 /*===================*/
 	ulint	type,	/* in: index type */
-	page_t*	page,	/* in: index page; free bit is reset if the index is
-			a non-clustered non-unique, and page level is 0 */
+	page_t*	page,	/* in: index page; free bit is set if the index is
+			non-clustered and page level is 0 */
 	ulint	val,	/* in: value to set: < 4 */
 	mtr_t*	mtr)	/* in: mtr */
 {
 	page_t*	bitmap_page;

-	if (type & (DICT_CLUSTERED | DICT_UNIQUE)) {
+	if (type & DICT_CLUSTERED) {

 		return;
 	}
@@ -733,8 +733,8 @@ void
 ibuf_set_free_bits(
 /*===============*/
 	ulint	type,	/* in: index type */
-	page_t*	page,	/* in: index page; free bit is reset if the index is
-			a non-clustered non-unique, and page level is 0 */
+	page_t*	page,	/* in: index page; free bit is set if the index is
+			non-clustered and page level is 0 */
 	ulint	val,	/* in: value to set: < 4 */
 	ulint	max_val)/* in: ULINT_UNDEFINED or a maximum value which
 			the bits must have before setting; this is for
@@ -743,7 +743,7 @@ ibuf_set_free_bits(
 	mtr_t	mtr;
 	page_t*	bitmap_page;

-	if (type & (DICT_CLUSTERED | DICT_UNIQUE)) {
+	if (type & DICT_CLUSTERED) {

 		return;
 	}
@@ -2024,7 +2024,7 @@ ibuf_insert_low(
 	ulint		n_stored;
 	ulint		bits;
 	
-	ut_a(!(index->type & (DICT_UNIQUE | DICT_CLUSTERED)));
+	ut_a(!(index->type & DICT_CLUSTERED));
 	ut_ad(dtuple_check_typed(entry));

 	do_merge = FALSE;
@@ -2254,10 +2254,7 @@ ibuf_insert(

 	ut_ad(dtuple_check_typed(entry));

-	if (index->type & DICT_CLUSTERED || index->type & DICT_UNIQUE) {
-
-		return(FALSE);
-	}
+	ut_a(!(index->type & DICT_CLUSTERED));
 	
 	if (rec_get_converted_size(entry)
 				>= page_get_free_space_of_empty() / 2) {
@@ -2302,6 +2299,7 @@ ibuf_insert_to_index_page(
 	rec_t*		rec;
 	page_t*		bitmap_page;
 	ulint		old_bits;
+	char		errbuf[1000];

 	ut_ad(ibuf_inside());
 	ut_ad(dtuple_check_typed(entry));
@@ -2324,11 +2322,24 @@ ibuf_insert_to_index_page(

 			/* This time the record must fit */
 			if (!page_cur_tuple_insert(&page_cur, entry, mtr)) {
-				printf(
-			"Ibuf insert fails; page free %lu, dtuple size %lu\n",
+
+				ut_print_timestamp(stderr);
+
+				fprintf(stderr,
+"InnoDB: Error: Insert buffer insert fails; page free %lu, dtuple size %lu\n",
 				page_get_max_insert_size(page, 1),
 				rec_get_converted_size(entry));

+				dtuple_sprintf(errbuf, 900, entry);
+				
+				fprintf(stderr,
+"InnoDB: Cannot insert index record %s\n", errbuf);
+
+				fprintf(stderr,	/* advise the user what to do */
+"InnoDB: The table where this index record belongs\n"
+"InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
+"InnoDB: that table.\n");
+				
 				bitmap_page = ibuf_bitmap_get_map_page(
 						buf_frame_get_space_id(page),
 						buf_frame_get_page_no(page),
@@ -2339,9 +2350,11 @@ ibuf_insert_to_index_page(
 						buf_frame_get_page_no(page),
 						IBUF_BITMAP_FREE, mtr);

-				printf("Bitmap bits %lu\n", old_bits);
-						
-				ut_error;
+				fprintf(stderr, "Bitmap bits %lu\n", old_bits);
+
+				fprintf(stderr,
+"InnoDB: Send a detailed bug report to mysql@lists.mysql.com!\n");
+				
 			}	
 		}
 	}

--- a/innobase/include/btr0btr.h
+++ b/innobase/include/btr0btr.h
@@ -204,16 +204,6 @@ btr_page_reorganize(
 	page_t*	page,	/* in: page to be reorganized */
 	mtr_t*	mtr);	/* in: mtr */
 /*****************************************************************
-Reorganizes an index page. */
-
-void
-btr_page_reorganize_low(
-/*====================*/
-	ibool	low,	/* in: TRUE if locks should not be updated, i.e.,
-			there cannot exist locks on the page */
-	page_t*	page,	/* in: page to be reorganized */
-	mtr_t*	mtr);	/* in: mtr */
-/*****************************************************************
 Decides if the page should be split at the convergence point of
 inserts converging to left. */


--- a/innobase/include/btr0cur.h
+++ b/innobase/include/btr0cur.h
@@ -709,6 +709,7 @@ allowed to free an inherited external field. */
 #define BTR_EXTERN_INHERITED_FLAG	64

 extern ulint	btr_cur_n_non_sea;
+extern ulint	btr_cur_n_sea;

 #ifndef UNIV_NONINL
 #include "btr0cur.ic"

--- a/innobase/include/btr0sea.h
+++ b/innobase/include/btr0sea.h
@@ -176,6 +176,7 @@ btr_search_validate(void);
 /* The search info struct in an index */

 struct btr_search_struct{
+	ulint	magic_n;	/* magic number */
 	/* The following 4 fields are currently not used: */
 	rec_t*	last_search;	/* pointer to the lower limit record of the
 				previous search; NULL if not known */
@@ -220,6 +221,8 @@ struct btr_search_struct{
 	ulint	n_searches;	/* number of searches */
 };

+#define BTR_SEARCH_MAGIC_N	1112765
+
 /* The hash index system */

 typedef struct btr_search_sys_struct	btr_search_sys_t;

--- a/innobase/include/buf0buf.h
+++ b/innobase/include/buf0buf.h
@@ -219,6 +219,16 @@ buf_page_create(
 			a page */
 	mtr_t*	mtr);	/* in: mini-transaction handle */
 /************************************************************************
+Inits a page to the buffer buf_pool, for use in ibbackup --restore. */
+
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+	ulint		space,	/* in: space id */
+	ulint		offset,	/* in: offset of the page within space
+				in units of a page */
+	buf_block_t*	block);	/* in: block to init */
+/************************************************************************
 Decrements the bufferfix count of a buffer control block and releases
 a latch, if specified. */
 UNIV_INLINE
@@ -605,6 +615,7 @@ struct buf_block_struct{

 	/* 1. General fields */

+	ulint		magic_n;	/* magic number to check */
 	ulint		state;		/* state of the control block:
 					BUF_BLOCK_NOT_USED, ... */
 	byte*		frame;		/* pointer to buffer frame which
@@ -729,6 +740,8 @@ struct buf_block_struct{
                                        frees a page in buffer pool */
 };

+#define BUF_BLOCK_MAGIC_N	41526563
+
 /* The buffer pool structure. NOTE! The definition appears here only for
 other modules of this directory (buf) to see it. Do not use from outside! */


--- a/innobase/include/buf0rea.h
+++ b/innobase/include/buf0rea.h
@@ -89,7 +89,7 @@ buf_read_recv_pages(
 /* The size in pages of the area which the read-ahead algorithms read if
 invoked */

-#define	BUF_READ_AHEAD_AREA	ut_min(32, buf_pool->curr_size / 16)
+#define	BUF_READ_AHEAD_AREA	ut_min(64, ut_2_power_up(buf_pool->curr_size / 32))

 /* Modes used in read-ahead */
 #define BUF_READ_IBUF_PAGES_ONLY	131

--- a/innobase/include/data0data.h
+++ b/innobase/include/data0data.h
@@ -123,7 +123,7 @@ dfield_datas_are_binary_equal(
 	dfield_t*	field2);/* in: field */
 /*************************************************************************
 Tests if dfield data length and content is equal to the given. */
-UNIV_INLINE
+
 ibool
 dfield_data_is_binary_equal(
 /*========================*/
@@ -279,6 +279,14 @@ dtuple_check_typed(
 				/* out: TRUE if ok */
 	dtuple_t*	tuple);	/* in: tuple */
 /**************************************************************
+Checks that a data tuple is typed. */
+
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+				/* out: TRUE if ok */
+	dtuple_t*	tuple);	/* in: tuple */
+/**************************************************************
 Validates the consistency of a tuple which must be complete, i.e,
 all fields must have been set. */


--- a/innobase/include/data0data.ic
+++ b/innobase/include/data0data.ic
@@ -153,30 +153,6 @@ dfield_datas_are_binary_equal(
 	return(TRUE);
 }

-/*************************************************************************
-Tests if dfield data length and content is equal to the given. */
-UNIV_INLINE
-ibool
-dfield_data_is_binary_equal(
-/*========================*/
-				/* out: TRUE if equal */
-	dfield_t*	field,	/* in: field */
-	ulint		len,	/* in: data length or UNIV_SQL_NULL */
-	byte*		data)	/* in: data */
-{
-	if (len != field->len) {
-
-		return(FALSE);
-	}
-
-	if (len != UNIV_SQL_NULL && 0 != ut_memcmp(field->data, data, len)) {
-	    	
-		return(FALSE);
-	}
-
-	return(TRUE);
-}
-
 /*************************************************************************
 Gets info bits in a data tuple. */
 UNIV_INLINE

--- a/innobase/include/log0log.h
+++ b/innobase/include/log0log.h
@@ -157,6 +157,14 @@ log_io_complete(
 /*============*/
 	log_group_t*	group);	/* in: log group */
 /**********************************************************
+Flushes the log files to the disk, using, for example, the Unix fsync.
+This function does the flush even if the user has set
+srv_flush_log_at_trx_commit = FALSE. */
+
+void
+log_flush_to_disk(void);
+/*===================*/
+/**********************************************************
 This function is called, e.g., when a transaction wants to commit. It checks
 that the log has been flushed to disk up to the last log entry written by the
 transaction. If there is a flush running, it waits and checks if the flush
@@ -260,7 +268,9 @@ log_reset_first_header_and_checkpoint(
 /*==================================*/
 	byte*	hdr_buf,/* in: buffer which will be written to the start
 			of the first log file */
-	dulint	lsn);	/* in: lsn of the start of the first log file */
+	dulint	start);	/* in: lsn of the start of the first log file;
+			we pretend that there is a checkpoint at
+			start + LOG_BLOCK_HDR_SIZE */
 /************************************************************************
 Starts an archiving operation. */

@@ -463,6 +473,15 @@ log_block_init(
 	byte*	log_block,	/* in: pointer to the log buffer */
 	dulint	lsn);		/* in: lsn within the log block */
 /****************************************************************
+Initializes a log block in the log buffer in the old, < 3.23.52 format, where
+there was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+	byte*	log_block,	/* in: pointer to the log buffer */
+	dulint	lsn);		/* in: lsn within the log block */
+/****************************************************************
 Converts a lsn to a log block number. */
 UNIV_INLINE
 ulint
@@ -523,7 +542,10 @@ extern log_t*	log_sys;
 					bytes */

 /* Offsets of a log block trailer from the end of the block */
-#define	LOG_BLOCK_TRL_NO	4	/* log block number */
+#define	LOG_BLOCK_TRL_CHECKSUM	4	/* 1 byte checksum of the log block
+					contents */
+#define	LOG_BLOCK_TRL_NO	3	/* 3 lowest bytes of the log block
+					number */
 #define	LOG_BLOCK_TRL_SIZE	4	/* trailer size in bytes */

 /* Offsets for a checkpoint field */
@@ -558,11 +580,22 @@ extern log_t*	log_sys;
 #define LOG_GROUP_ID		0	/* log group number */
 #define LOG_FILE_START_LSN	4	/* lsn of the start of data in this
 					log file */
-#define LOG_FILE_NO		12	/* 4-byte archived log file number */
+#define LOG_FILE_NO		12	/* 4-byte archived log file number;
+					this field is only defined in an
+					archived log file */
+#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16
+					/* a 32-byte field which contains
+					the string 'ibbackup' and the
+					creation time if the log file was
+					created by ibbackup --restore;
+					when mysqld is started for the first
+					time on the restored database, it can
+					print helpful info for the user */
 #define	LOG_FILE_ARCH_COMPLETED	OS_FILE_LOG_BLOCK_SIZE
 					/* this 4-byte field is TRUE when
 					the writing of an archived log file
-					has been completed */
+					has been completed; this field is
+					only defined in an archived log file */
 #define LOG_FILE_END_LSN	(OS_FILE_LOG_BLOCK_SIZE + 4)
 					/* lsn where the archived log file
 					at least extends: actually the
@@ -572,7 +605,14 @@ extern log_t*	log_sys;
 					is defined only when an archived log
 					file has been completely written */
 #define LOG_CHECKPOINT_1	OS_FILE_LOG_BLOCK_SIZE
+					/* first checkpoint field in the log
+					header; we write alternately to the
+					checkpoint fields when we make new
+					checkpoints; this field is only defined
+					in the first log file of a log group */
 #define LOG_CHECKPOINT_2	(3 * OS_FILE_LOG_BLOCK_SIZE)
+					/* second checkpoint field in the log
+					header */
 #define LOG_FILE_HDR_SIZE	(4 * OS_FILE_LOG_BLOCK_SIZE)

 #define LOG_GROUP_OK		301
@@ -678,7 +718,7 @@ struct log_struct{
 					write i/o has been completed for all
 					log groups */
 	dulint		flush_lsn;	/* end lsn for the current flush */
-	ulint		flush_end_offset;/* the data in buffer ha been flushed
+	ulint		flush_end_offset;/* the data in buffer has been flushed
 					up to this offset when the current
 					flush ends: this field will then
 					be copied to buf_next_to_write */

--- a/innobase/include/log0log.ic
+++ b/innobase/include/log0log.ic
@@ -179,7 +179,7 @@ log_block_get_trl_no(
 				trailer */
 	byte*	log_block)	/* in: log block */
 {
-	return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+	return(mach_read_from_3(log_block + OS_FILE_LOG_BLOCK_SIZE
 							- LOG_BLOCK_TRL_NO));
 }

@@ -192,8 +192,8 @@ log_block_set_trl_no(
 	byte*	log_block,	/* in: log block */
 	ulint	n)		/* in: log block number */
 {
-	mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_NO,
- 									n);
+	mach_write_to_3(log_block + OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_NO,
+ 								n & 0xFFFFFF);
 }

 /****************************************************************
@@ -237,6 +237,29 @@ log_block_init(
 	log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
 	log_block_set_first_rec_group(log_block, 0);
 }
+
+/****************************************************************
+Initializes a log block in the log buffer in the old, < 3.23.52 format, where
+the 4-byte trailer held only the block number and there was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+	byte*	log_block,	/* in: pointer to the log buffer */
+	dulint	lsn)		/* in: lsn within the log block */
+{
+	ulint	no;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	no = log_block_convert_lsn_to_no(lsn);
+	/* Store the block number in the header and, as 4 bytes, in the trailer */
+	log_block_set_hdr_no(log_block, no);
+	mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+						- LOG_BLOCK_TRL_NO - 1, no);
+	log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+	log_block_set_first_rec_group(log_block, 0);
+}
 	
 /****************************************************************
 Writes to the log the string given. The log must be released with

--- a/innobase/include/os0file.h
+++ b/innobase/include/os0file.h
@@ -16,6 +16,7 @@ Created 10/21/1995 Heikki Tuuri
 os_file_write */
 extern ibool	os_do_not_call_flush_at_each_write;
 extern ibool	os_has_said_disk_full;
+extern ibool	os_aio_print_debug;

 #ifdef __WIN__

@@ -33,6 +34,8 @@ extern ibool	os_has_said_disk_full;
 typedef int	os_file_t;
 #endif

+extern ulint	os_innodb_umask;
+
 /* If this flag is TRUE, then we will use the native aio of the
 OS (provided we compiled Innobase with it in), otherwise we will
 use simulated aio we build below with threads */
@@ -309,6 +312,15 @@ Wakes up simulated aio i/o-handler threads if they have something to do. */
 void
 os_aio_simulated_wake_handler_threads(void);
 /*=======================================*/
+/**************************************************************************
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+
+void
+os_aio_simulated_put_read_threads_to_sleep(void);
+/*============================================*/

 #ifdef WIN_ASYNC_IO
 /**************************************************************************

--- a/innobase/include/page0page.h
+++ b/innobase/include/page0page.h
@@ -328,7 +328,7 @@ page_dir_calc_reserved_space(
 	ulint	n_recs);	/* in: number of records */
 /*******************************************************************
 Looks for the directory slot which owns the given record. */
-UNIV_INLINE
+
 ulint
 page_dir_find_owner_slot(
 /*=====================*/

--- a/innobase/include/page0page.ic
+++ b/innobase/include/page0page.ic
@@ -479,6 +479,8 @@ page_rec_get_next(

 	offs = rec_get_next_offs(rec);

+	ut_a(offs < UNIV_PAGE_SIZE);
+
 	if (offs == 0) {
 		
 		return(NULL);
@@ -487,40 +489,6 @@ page_rec_get_next(
 	return(page + offs);
 }

-/*******************************************************************
-Looks for the directory slot which owns the given record. */
-UNIV_INLINE
-ulint
-page_dir_find_owner_slot(
-/*=====================*/
-			/* out: the directory slot number */
-	rec_t*	rec)	/* in: the physical record */
-{
-	ulint			i;
-	page_t*			page;	
-	page_dir_slot_t*	slot;
-
-	ut_ad(page_rec_check(rec));
-
-	while (rec_get_n_owned(rec) == 0) {
-		rec = page_rec_get_next(rec);
-	}
-	
-	page = buf_frame_align(rec);
-
-	i = page_dir_get_n_slots(page) - 1;
-	slot = page_dir_get_nth_slot(page, i); 
-
-	while (page_dir_slot_get_rec(slot) != rec) {
-		ut_a(i > 0);
-
-		i--;
-		slot = page_dir_get_nth_slot(page, i); 
-	}
-
-	return(i);
-}
-
 /****************************************************************
 Sets the pointer to the next record on the page. */ 
 UNIV_INLINE
@@ -534,7 +502,7 @@ page_rec_set_next(
 	page_t*	page;

 	ut_ad(page_rec_check(rec));	
-	ut_ad((next == NULL)
+	ut_a((next == NULL)
 	      || (buf_frame_align(rec) == buf_frame_align(next)));

 	page = buf_frame_align(rec);
@@ -573,7 +541,7 @@ page_rec_get_prev(

 	slot_no = page_dir_find_owner_slot(rec);

-	ut_ad(slot_no != 0);
+	ut_a(slot_no != 0);
 	
 	slot = page_dir_get_nth_slot(page, slot_no - 1);
 	
@@ -584,7 +552,7 @@ page_rec_get_prev(
 		rec2 = page_rec_get_next(rec2);
 	}
 	
-	ut_ad(prev_rec);
+	ut_a(prev_rec);

 	return(prev_rec);
 }

--- a/innobase/include/row0mysql.h
+++ b/innobase/include/row0mysql.h
@@ -230,6 +230,19 @@ row_update_cascade_for_mysql(
 				or set null operation */
 	dict_table_t*	table);	/* in: table where we do the operation */
 /*************************************************************************
+Locks the data dictionary exclusively for performing a table create
+operation. */
+
+void
+row_mysql_lock_data_dictionary(void);
+/*================================*/
+/*************************************************************************
+Unlocks the data dictionary exclusive lock. */
+
+void
+row_mysql_unlock_data_dictionary(void);
+/*==================================*/
+/*************************************************************************
 Does a table creation operation for MySQL. If the name of the created
 table ends to characters INNODB_MONITOR, then this also starts
 printing of monitor output by the master thread. */

--- a/innobase/include/trx0roll.h
+++ b/innobase/include/trx0roll.h
@@ -102,11 +102,13 @@ trx_rollback(
 				calling function can start running
 				a new query thread */
 /***********************************************************************
-Rollback uncommitted transactions which have no user session. */
+Rollback or clean up transactions which have no user session. If the
+transaction already was committed, then we clean up a possible insert
+undo log. If the transaction was not yet committed, then we roll it back. */

 void
-trx_rollback_all_without_sess(void);
-/*===============================*/
+trx_rollback_or_clean_all_without_sess(void);
+/*========================================*/
 /********************************************************************
 Finishes a transaction rollback. */


--- a/innobase/include/trx0sys.h
+++ b/innobase/include/trx0sys.h
@@ -24,6 +24,14 @@ Created 3/26/1996 Heikki Tuuri
 #include "fsp0fsp.h"
 #include "read0types.h"

+/* In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. We have successfully got the updates to InnoDB
+up to this position. If .._pos is -1, it means no crash recovery was needed,
+or there was no master log position info inside InnoDB. */
+
+extern char 		trx_sys_mysql_master_log_name[];
+extern ib_longlong	trx_sys_mysql_master_log_pos;
+
 /* The transaction system */
 extern trx_sys_t*	trx_sys;

@@ -229,13 +237,18 @@ trx_in_trx_list(
 	trx_t*	in_trx);/* in: trx */
 /*********************************************************************
 Updates the offset information about the end of the MySQL binlog entry
-which corresponds to the transaction just being committed. */
+which corresponds to the transaction just being committed. In a MySQL
+replication slave updates the latest master binlog position up to which
+replication has proceeded. */

 void
 trx_sys_update_mysql_binlog_offset(
 /*===============================*/
-	trx_t*	trx,	/* in: transaction being committed */
-	mtr_t*	mtr);	/* in: mtr */
+	char*		file_name,/* in: MySQL log file name */
+	ib_longlong	offset,	/* in: position in that log file */
+	ulint		field,	/* in: offset of the MySQL log info field in
+				the trx sys header */
+	mtr_t*		mtr);	/* in: mtr */
 /*********************************************************************
 Prints to stderr the MySQL binlog offset info in the trx system header if
 the magic number shows it valid. */
@@ -243,15 +256,17 @@ the magic number shows it valid. */
 void
 trx_sys_print_mysql_binlog_offset(void);
 /*===================================*/
+/*********************************************************************
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+
+void
+trx_sys_print_mysql_master_log_pos(void);
+/*====================================*/

 /* The automatically created system rollback segment has this id */
 #define TRX_SYS_SYSTEM_RSEG_ID	0

-/* Max number of rollback segments: the number of segment specification slots
-in the transaction system array; rollback segment id must fit in one byte,
-therefore 256 */
-#define	TRX_SYS_N_RSEGS		256
-
 /* Space id and page no where the trx system file copy resides */
 #define	TRX_SYS_SPACE	0	/* the SYSTEM tablespace */
 #define	TRX_SYS_PAGE_NO	FSP_TRX_SYS_PAGE_NO
@@ -277,22 +292,29 @@ therefore 256 */
 					segment specification slots */
 /*-------------------------------------------------------------*/

-#define TRX_SYS_MYSQL_LOG_NAME_LEN	32
+/* Max number of rollback segments: the number of segment specification slots
+in the transaction system array; rollback segment id must fit in one byte,
+therefore 256; each slot is currently 8 bytes in size */
+#define	TRX_SYS_N_RSEGS		256
+
+#define TRX_SYS_MYSQL_LOG_NAME_LEN	512
 #define TRX_SYS_MYSQL_LOG_MAGIC_N	873422344

+/* The offset of the MySQL replication info on the trx system header page;
+this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
+#define TRX_SYS_MYSQL_MASTER_LOG_INFO	(UNIV_PAGE_SIZE - 2000)
+
 /* The offset of the MySQL binlog offset info on the trx system header page */
-#define TRX_SYS_MYSQL_LOG_INFO		(UNIV_PAGE_SIZE - 300)
+#define TRX_SYS_MYSQL_LOG_INFO		(UNIV_PAGE_SIZE - 1000)
 #define	TRX_SYS_MYSQL_LOG_MAGIC_N_FLD	0	/* magic number which shows
 						if we have valid data in the
 						MySQL binlog info; the value
 						is ..._MAGIC_N if yes */
-#define TRX_SYS_MYSQL_LOG_NAME		4	/* MySQL log file name */
-#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH	(4 + TRX_SYS_MYSQL_LOG_NAME_LEN)
-						/* high 4 bytes of the offset
+#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH	4	/* high 4 bytes of the offset
 						within that file */
-#define TRX_SYS_MYSQL_LOG_OFFSET_LOW	(8 + TRX_SYS_MYSQL_LOG_NAME_LEN)
-						/* low 4 bytes of the offset
+#define TRX_SYS_MYSQL_LOG_OFFSET_LOW	8	/* low 4 bytes of the offset
 						within that file */
+#define TRX_SYS_MYSQL_LOG_NAME		12	/* MySQL log file name */

 /* The offset of the doublewrite buffer header on the trx system header page */
 #define TRX_SYS_DOUBLEWRITE		(UNIV_PAGE_SIZE - 200)

--- a/innobase/include/trx0trx.h
+++ b/innobase/include/trx0trx.h
@@ -124,6 +124,15 @@ void
 trx_commit_off_kernel(
 /*==================*/
 	trx_t*	trx);	/* in: transaction */
+/********************************************************************
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+	trx_t*	trx);	/* in: transaction */
 /**************************************************************************
 Does the transaction commit for MySQL. */

@@ -322,13 +331,24 @@ struct trx_struct{
        void*           mysql_thd;      /* MySQL thread handle corresponding
                                        to this trx, or NULL */
 	char*		mysql_log_file_name;
-					/* If MySQL binlog is used, this field
+					/* if MySQL binlog is used, this field
 					contains a pointer to the latest file
 					name; this is NULL if binlog is not
 					used */
-	ib_longlong	mysql_log_offset;/* If MySQL binlog is used, this field
+	ib_longlong	mysql_log_offset;/* if MySQL binlog is used, this field
 					contains the end offset of the binlog
 					entry */
+	char*		mysql_master_log_file_name;
+					/* if the database server is a MySQL
+					replication slave, we have here the
+					master binlog name up to which
+					replication has processed; otherwise
+					this is a pointer to a null character */
+	ib_longlong	mysql_master_log_pos;
+					/* if the database server is a MySQL
+					replication slave, this is the
+					position in the log file up to which
+					replication has processed */
 	os_thread_id_t	mysql_thread_id;/* id of the MySQL thread associated
 					with this transaction object */
 	/*------------------------------*/

--- a/innobase/include/univ.i
+++ b/innobase/include/univ.i
@@ -9,40 +9,26 @@ Created 1/20/1994 Heikki Tuuri
 #ifndef univ_i
 #define univ_i

-#if (defined(_WIN32) || defined(_WIN64)) && !defined(MYSQL_SERVER)
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER)
 #define __WIN__

 #include <windows.h>

-/* When compiling for Itanium IA64, undefine the flag below to prevent use
-of 32-bit assembler */
-
-#ifndef WIN64
+#if !defined(WIN64) && !defined(_WIN64)
 #define UNIV_CAN_USE_X86_ASSEMBLER
 #endif

-/* If you want to check for errors with compiler level -W4,
-comment out the above include of windows.h and let the following defines
-be defined:
-#define HANDLE void*
-#define CRITICAL_SECTION	ulint
-*/
-
 #ifdef _NT_
 #define __NT__
 #endif

 #else
-/* The Unix version */
-
-/* Most C compilers other than gcc do not know 'extern inline' */ 
-#if !defined(__GNUC__) && !defined(__WIN__)
-#define UNIV_MUST_NOT_INLINE
-#endif
+/* The defines used with MySQL */

 /* Include two header files from MySQL to make the Unix flavor used
-in compiling more Posix-compatible. We assume that 'innobase' is a
-subdirectory of 'mysql'. */
+in compiling more Posix-compatible. These headers also define __WIN__
+if we are compiling on Windows. */
+
 #include <global.h>
 #include <my_pthread.h>

@@ -59,6 +45,20 @@ subdirectory of 'mysql'. */
 #include <sched.h>
 #endif

+/* When compiling for Itanium IA64, undefine the flag below to prevent use
+of the 32-bit x86 assembler in mutex operations. */
+
+#if defined(__WIN__) && !defined(WIN64) && !defined(_WIN64)
+#define UNIV_CAN_USE_X86_ASSEMBLER
+#endif
+
+/* We only try to do explicit inlining of functions with gcc and
+Microsoft Visual C++ */
+
+#if !defined(__GNUC__) && !defined(__WIN__)
+#define UNIV_MUST_NOT_INLINE
+#endif
+
 #ifdef HAVE_PREAD
 #define HAVE_PWRITE
 #endif

--- a/innobase/include/ut0ut.h
+++ b/innobase/include/ut0ut.h
@@ -114,7 +114,7 @@ ut_2_exp(
 	ulint	n);	/* in: number */
 /*****************************************************************
 Calculates fast the number rounded up to the nearest power of 2. */
-UNIV_INLINE
+
 ulint
 ut_2_power_up(
 /*==========*/
@@ -155,6 +155,13 @@ ut_print_timestamp(
 /*===============*/
 	FILE*  file); /* in: file where to print */
 /**************************************************************
+Sprintfs a timestamp to a buffer. */
+
+void
+ut_sprintf_timestamp(
+/*=================*/
+	char*	buf); /* in: buffer where to sprintf */
+/**************************************************************
 Returns current year, month, day. */

 void

--- a/innobase/include/ut0ut.ic
+++ b/innobase/include/ut0ut.ic
@@ -172,25 +172,3 @@ ut_2_exp(
 {
 	return(1 << n);
 }
-
-/*****************************************************************
-Calculates fast the number rounded up to the nearest power of 2. */
-UNIV_INLINE
-ulint
-ut_2_power_up(
-/*==========*/
-			/* out: first power of 2 which is >= n */
-	ulint	n)	/* in: number != 0 */
-{
-	ulint	res;
-
-	res = 1;
-
-	ut_ad(n > 0);
-
-	while (res < n) {
-		res = res * 2;
-	}
-
-	return(res);
-}
--- a/innobase/lock/lock0lock.c
+++ b/innobase/lock/lock0lock.c
@@ -1541,6 +1541,15 @@ lock_rec_enqueue_waiting(
 		
 	trx = thr_get_trx(thr);

+	if (trx->dict_operation) {
+		ut_print_timestamp(stderr);
+	
+		fprintf(stderr,
+"  InnoDB: Error: a record lock wait happens in a dictionary operation!\n"
+"InnoDB: Table name %s. Send a bug report to mysql@lists.mysql.com\n",
+index->table_name);
+	}
+	
 	/* Enqueue the lock request that will wait to be granted */
 	lock = lock_rec_create(type_mode | LOCK_WAIT, rec, index, trx);

@@ -2914,7 +2923,7 @@ lock_table_enqueue_waiting(
 	trx_t*	trx;
 	
 	ut_ad(mutex_own(&kernel_mutex));
-
+	
 	/* Test if there already is some other reason to suspend thread:
 	we do not enqueue a lock request if the query thread should be
 	stopped anyway */
@@ -2926,6 +2935,15 @@ lock_table_enqueue_waiting(
 	}

 	trx = thr_get_trx(thr);
+
+	if (trx->dict_operation) {
+		ut_print_timestamp(stderr);
+	
+		fprintf(stderr,
+"  InnoDB: Error: a table lock wait happens in a dictionary operation!\n"
+"InnoDB: Table name %s. Send a bug report to mysql@lists.mysql.com\n",
+table->name);
+	}
 	
 	/* Enqueue the lock request that will wait to be granted */


--- a/innobase/log/log0log.c
+++ b/innobase/log/log0log.c
@@ -162,6 +162,8 @@ log_reserve_and_open(
 	ulint	archived_lsn_age;
 	ulint	count			= 0;
 	ulint	dummy;
+
+	ut_a(len < log->buf_size / 2);
 loop:
 	mutex_enter(&(log->mutex));
 	
@@ -663,6 +665,8 @@ log_init(void)
 	
 	log_sys->buf_next_to_write = 0;

+	log_sys->flush_lsn = ut_dulint_zero;
+
 	log_sys->written_to_some_lsn = log_sys->lsn;
 	log_sys->written_to_all_lsn = log_sys->lsn;
 	
@@ -777,9 +781,15 @@ log_group_init(
 		*(group->file_header_bufs + i) = ut_align(
 			mem_alloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE),
 						OS_FILE_LOG_BLOCK_SIZE);
+
+		memset(*(group->file_header_bufs + i), '\0',
+							LOG_FILE_HDR_SIZE);
+
 		*(group->archive_file_header_bufs + i) = ut_align(
 			mem_alloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE),
 						OS_FILE_LOG_BLOCK_SIZE);
+		memset(*(group->archive_file_header_bufs + i), '\0',
+							LOG_FILE_HDR_SIZE);
 	}
 	
 	group->archive_space_id = archive_space_id;
@@ -791,6 +801,8 @@ log_group_init(
 				mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE),
 						OS_FILE_LOG_BLOCK_SIZE);
 	
+	memset(group->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+
 	UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, group);

 	ut_a(log_calc_max_ages());
@@ -839,7 +851,7 @@ log_group_check_flush_completion(
 {
 	ut_ad(mutex_own(&(log_sys->mutex)));	

-	if (!log_sys->one_flushed && (group->n_pending_writes == 0)) {
+	if (!log_sys->one_flushed && group->n_pending_writes == 0) {

 		if (log_debug_writes) {
 			printf("Log flushed first to group %lu\n", group->id);
@@ -933,16 +945,20 @@ log_io_complete(
 		return;
 	}

+	ut_a(0);	/* We currently use synchronous writing of the
+			logs and cannot end up here! */
+
 	if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
-	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+	    && srv_flush_log_at_trx_commit != 2) {

 	        fil_flush(group->space_id);
 	}

 	mutex_enter(&(log_sys->mutex));

-	ut_ad(group->n_pending_writes > 0);
-	ut_ad(log_sys->n_pending_writes > 0);
+	ut_a(group->n_pending_writes > 0);
+	ut_a(log_sys->n_pending_writes > 0);
 	
 	group->n_pending_writes--;
 	log_sys->n_pending_writes--;
@@ -955,6 +971,57 @@ log_io_complete(
 	mutex_exit(&(log_sys->mutex));
 }

+/**********************************************************
+Flushes the log files to the disk, using, for example, the Unix fsync.
+This function does the flush even if the user has set
+srv_flush_log_at_trx_commit = FALSE. */
+
+void
+log_flush_to_disk(void)
+/*===================*/
+{
+	log_group_t*	group;
+loop:
+	mutex_enter(&(log_sys->mutex));
+
+	if (log_sys->n_pending_writes > 0) {
+		/* A log file write is running */
+		
+		mutex_exit(&(log_sys->mutex));
+
+		/* Wait for the log file write to complete and try again */
+
+		os_event_wait(log_sys->no_flush_event);
+
+		goto loop;
+	}
+	/* Only the first log group is flushed */
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	/* Count the flush as a pending write: concurrent callers wait above */
+	log_sys->n_pending_writes++;
+	group->n_pending_writes++;
+
+	os_event_reset(log_sys->no_flush_event);
+	os_event_reset(log_sys->one_flushed_event);
+
+	mutex_exit(&(log_sys->mutex));
+	/* The log mutex is not held during the flush itself */
+	fil_flush(group->space_id);
+
+	mutex_enter(&(log_sys->mutex));
+	/* Only the write registered above may be pending at this point */
+	ut_a(group->n_pending_writes == 1);
+	ut_a(log_sys->n_pending_writes == 1);
+	
+	group->n_pending_writes--;
+	log_sys->n_pending_writes--;
+
+	os_event_set(log_sys->no_flush_event);
+	os_event_set(log_sys->one_flushed_event);
+	
+	mutex_exit(&(log_sys->mutex));
+}
+
 /**********************************************************
 Writes a log file header to a log file space. */
 static
@@ -970,7 +1037,6 @@ log_group_file_header_flush(
 {
 	byte*	buf;
 	ulint	dest_offset;
-	ibool	sync;

 	ut_ad(mutex_own(&(log_sys->mutex)));

@@ -981,15 +1047,11 @@ log_group_file_header_flush(
 	mach_write_to_4(buf + LOG_GROUP_ID, group->id);
 	mach_write_to_8(buf + LOG_FILE_START_LSN, start_lsn);

-	dest_offset = nth_file * group->file_size;
+	/* Wipe over possible label of ibbackup --restore */
+	memcpy(buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, "    ", 4);

-	sync = FALSE;
-
-	if (type == LOG_RECOVER) {
+	dest_offset = nth_file * group->file_size;

-		sync = TRUE;
-	}
-	
 	if (log_debug_writes) {
 		printf(
 		"Writing log file header to group %lu file %lu\n", group->id,
@@ -997,14 +1059,9 @@ log_group_file_header_flush(
 	}

 	if (log_do_write) {
-		if (type == LOG_FLUSH) {
-			log_sys->n_pending_writes++;
-			group->n_pending_writes++;
-		}
-
 		log_sys->n_log_ios++;	
 		
-		fil_io(OS_FILE_WRITE | OS_FILE_LOG, sync, group->space_id,
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id,
 				dest_offset / UNIV_PAGE_SIZE,
 				dest_offset % UNIV_PAGE_SIZE,
 				OS_FILE_LOG_BLOCK_SIZE,
@@ -1012,6 +1069,31 @@ log_group_file_header_flush(
 	}
 }

+/**********************************************************
+Stores a 1-byte checksum to the trailer checksum field of a log block
+before writing it to a log file. This checksum is used in recovery to
+check the consistency of a log block. The checksum is simply the 8 low
+bits of 1 + the sum of the bytes in the log block except the trailer bytes. */
+static
+void
+log_block_store_checksum(
+/*=====================*/
+	byte*	block)	/* in/out: pointer to a log block */
+{
+	ulint	i;
+	ulint	sum;
+
+	sum = 1;
+	/* Add up every byte of the block that precedes the trailer */
+	for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) {
+		sum += (ulint)(*(block + i));
+	}
+	/* Store the low 8 bits of the sum in the trailer checksum byte */
+	mach_write_to_1(block + OS_FILE_LOG_BLOCK_SIZE
+						- LOG_BLOCK_TRL_CHECKSUM,
+			0xFF & sum);
+}
+	
 /**********************************************************
 Writes a buffer to a log file group. */

@@ -1032,20 +1114,13 @@ log_group_write_buf(
 					header */
 {
 	ulint	write_len;
-	ibool	sync;
 	ibool	write_header;
 	ulint	next_offset;
+	ulint	i;
 	
 	ut_ad(mutex_own(&(log_sys->mutex)));
-	ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
-	ut_ad(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);
-
-	sync = FALSE;
-
-	if (type == LOG_RECOVER) {
-
-		sync = TRUE;
-	}
+	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_a(ut_dulint_get_low(start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);

 	if (new_data_offset == 0) {
 		write_header = TRUE;
@@ -1076,7 +1151,6 @@ loop:
 	}
 	
 	if (log_debug_writes) {
-		ulint	i;

 		printf(
 		"Writing log file segment to group %lu offset %lu len %lu\n"
@@ -1100,15 +1174,17 @@ loop:
 		}
 	}

-	if (log_do_write) {
-		if (type == LOG_FLUSH) {
-			log_sys->n_pending_writes++;
-			group->n_pending_writes++;
-		}
+	/* Calculate the checksums for each log block and write them to
+	the trailer fields of the log blocks */
+
+	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
+		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
+	}

+	if (log_do_write) {
 		log_sys->n_log_ios++;	

-		fil_io(OS_FILE_WRITE | OS_FILE_LOG, sync, group->space_id,
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id,
 			next_offset / UNIV_PAGE_SIZE,
 			next_offset % UNIV_PAGE_SIZE, write_len, buf, group);
 	}
@@ -1126,15 +1202,15 @@ loop:

 /**********************************************************
 This function is called, e.g., when a transaction wants to commit. It checks
-that the log has been flushed to disk up to the last log entry written by the
-transaction. If there is a flush running, it waits and checks if the flush
-flushed enough. If not, starts a new flush. */
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, starts a new flush. */

 void
 log_flush_up_to(
 /*============*/
 	dulint	lsn,	/* in: log sequence number up to which the log should
-			be flushed, ut_dulint_max if not specified */
+			be written, ut_dulint_max if not specified */
 	ulint	wait)	/* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
 			or LOG_WAIT_ALL_GROUPS */
 {
@@ -1144,6 +1220,7 @@ log_flush_up_to(
 	ulint		area_start;
 	ulint		area_end;
 	ulint		loop_count;
+	ulint		unlock;

 	if (recv_no_ibuf_operations) {
 		/* Recovery is running and no operations on the log files are
@@ -1209,6 +1286,12 @@ loop:
 					ut_dulint_get_low(log_sys->lsn));
 	}

+	log_sys->n_pending_writes++;
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	group->n_pending_writes++; 	/* We assume here that we have only
+					one log group! */
+
 	os_event_reset(log_sys->no_flush_event);
 	os_event_reset(log_sys->one_flushed_event);

@@ -1254,6 +1337,36 @@ loop:
 		group = UT_LIST_GET_NEXT(log_groups, group);
 	}

+	mutex_exit(&(log_sys->mutex));
+
+	if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+	    && srv_flush_log_at_trx_commit != 2) {
+
+		group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	        fil_flush(group->space_id);
+	}
+
+	mutex_enter(&(log_sys->mutex));
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	ut_a(group->n_pending_writes == 1);
+	ut_a(log_sys->n_pending_writes == 1);
+	
+	group->n_pending_writes--;
+	log_sys->n_pending_writes--;
+
+	unlock = log_group_check_flush_completion(group);
+	unlock = unlock | log_sys_check_flush_completion();
+	
+	log_flush_do_unlocks(unlock);
+
+	mutex_exit(&(log_sys->mutex));
+
+	return;
+	
 do_waits:
 	mutex_exit(&(log_sys->mutex));

@@ -1539,15 +1652,23 @@ log_reset_first_header_and_checkpoint(
 /*==================================*/
 	byte*	hdr_buf,/* in: buffer which will be written to the start
 			of the first log file */
-	dulint	lsn)	/* in: lsn of the start of the first log file
-			+ LOG_BLOCK_HDR_SIZE */
+	dulint	start)	/* in: lsn of the start of the first log file;
+			we pretend that there is a checkpoint at
+			start + LOG_BLOCK_HDR_SIZE */
 {
 	ulint	fold;
 	byte*	buf;
-
+	dulint	lsn;
+	
 	mach_write_to_4(hdr_buf + LOG_GROUP_ID, 0);
-	mach_write_to_8(hdr_buf + LOG_FILE_START_LSN, lsn);
+	mach_write_to_8(hdr_buf + LOG_FILE_START_LSN, start);
+
+	lsn = ut_dulint_add(start, LOG_BLOCK_HDR_SIZE);

+	/* Write the label of ibbackup --restore */
+	sprintf(hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, "ibbackup ");
+	ut_sprintf_timestamp(hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
+						+ strlen("ibbackup "));
 	buf = hdr_buf + LOG_CHECKPOINT_1;
 	
 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, ut_dulint_zero);

--- a/innobase/log/log0recv.c
+++ b/innobase/log/log0recv.c
@@ -568,6 +568,55 @@ recv_read_cp_info_for_backup(
 	return(TRUE);
 }

+/**********************************************************
+Checks the 1-byte checksum in the trailer checksum field of a log block.
+We also accept a log block in the old format where the checksum field
+contained the highest byte of the log block number. */
+static
+ibool
+log_block_checksum_is_ok_or_old_format(
+/*===================================*/
+			/* out: TRUE if ok, or if the log block may be in the
+			format of InnoDB version < 3.23.52 */
+	byte*	block)	/* in: pointer to a log block */
+{
+	ulint	i;
+	ulint	sum;
+
+	sum = 1;
+
+	for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) {
+		sum += (ulint)(*(block + i));
+	}
+
+/*	printf("Checksum %lu, byte %lu\n", 0xFF & sum,
+		mach_read_from_1(block + OS_FILE_LOG_BLOCK_SIZE
+						- LOG_BLOCK_TRL_CHECKSUM));
+*/
+	if (mach_read_from_1(block + OS_FILE_LOG_BLOCK_SIZE
+						- LOG_BLOCK_TRL_CHECKSUM)
+	    == (0xFF & sum)) {
+
+		return(TRUE);
+	}
+
+	if (((0xFF000000 & log_block_get_hdr_no(block)) >> 24)
+		== mach_read_from_1(block + OS_FILE_LOG_BLOCK_SIZE
+						- LOG_BLOCK_TRL_CHECKSUM)) {
+
+		/* We assume the log block is in the format of
+		InnoDB version < 3.23.52 and the block is ok */
+/*
+		fprintf(stderr,
+"InnoDB: Scanned old format < InnoDB-3.23.52 log block number %lu\n",
+			log_block_get_hdr_no(block));
+*/
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
 /***********************************************************************
 Scans the log segment and n_bytes_scanned is set to the length of valid
 log scanned. */
@@ -598,12 +647,13 @@ recv_scan_log_seg_for_backup(
 	
 		no = log_block_get_hdr_no(log_block);

-		/* fprintf(stderr, "Log block header no %lu\n", no); */
+/*		fprintf(stderr, "Log block header no %lu\n", no); */

-		if (no != log_block_get_trl_no(log_block)
-		    || no != log_block_convert_lsn_to_no(*scanned_lsn)) {
-
-/*			printf(
+		if ((no & 0xFFFFFF) != log_block_get_trl_no(log_block)
+		    || no != log_block_convert_lsn_to_no(*scanned_lsn)
+		    || !log_block_checksum_is_ok_or_old_format(log_block)) {
+/*
+			printf(
 "Log block n:o %lu, trailer n:o %lu, scanned lsn n:o %lu\n",
 			no, log_block_get_trl_no(log_block),
 			log_block_convert_lsn_to_no(*scanned_lsn));
@@ -611,8 +661,8 @@ recv_scan_log_seg_for_backup(
 			/* Garbage or an incompletely written log block */

 			log_block += OS_FILE_LOG_BLOCK_SIZE;
-
-/*			printf(
+/*
+			printf(
 "Next log block n:o %lu, trailer n:o %lu\n",
 			log_block_get_hdr_no(log_block),
 			log_block_get_trl_no(log_block));
@@ -629,11 +679,11 @@ recv_scan_log_seg_for_backup(

 			/* Garbage from a log buffer flush which was made
 			before the most recent database recovery */
-
+/*
 			printf("Scanned cp n:o %lu, block cp n:o %lu\n",
 				*scanned_checkpoint_no,
 				log_block_get_checkpoint_no(log_block));
-
+*/
 			break;
 		}

@@ -1011,7 +1061,7 @@ recv_recover_page(
 			page_lsn = page_newest_lsn;
 		}
 	} else {
-		/* In recovery from a backup we do not use the buffer
+		/* In recovery from a backup we do not really use the buffer
 		pool */

 		page_newest_lsn = ut_dulint_zero;
@@ -1361,6 +1411,14 @@ recv_apply_log_recs_for_backup(
 			  nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT), 
 				UNIV_PAGE_SIZE);

+			/* We simulate a page read made by the buffer pool,
+			to make sure recovery works ok. We must init the
+			block corresponding to buf_pool->frame_zero
+			(== page) */
+
+			buf_page_init_for_backup_restore(0, i,
+						buf_block_align(page));
+
 			recv_recover_page(TRUE, FALSE, page, 0, i);

 			buf_flush_init_for_writing(page,
@@ -2037,8 +2095,33 @@ recv_scan_log_recs(

 		/* fprintf(stderr, "Log block header no %lu\n", no); */

-		if (no != log_block_get_trl_no(log_block)
-		    || no != log_block_convert_lsn_to_no(scanned_lsn)) {
+		if ((no & 0xFFFFFF) != log_block_get_trl_no(log_block)
+		    || no != log_block_convert_lsn_to_no(scanned_lsn)
+		    || !log_block_checksum_is_ok_or_old_format(log_block)) {
+
+			if ((no & 0xFFFFFF) == log_block_get_trl_no(log_block)
+		    	    && no == log_block_convert_lsn_to_no(scanned_lsn)
+			    && !log_block_checksum_is_ok_or_old_format(
+								log_block)) {
+				fprintf(stderr,
+"InnoDB: Log block no %lu at lsn %lu %lu has\n"
+"InnoDB: ok header and trailer, but checksum field contains %lu\n",
+				no, ut_dulint_get_high(scanned_lsn),
+				ut_dulint_get_low(scanned_lsn),
+				mach_read_from_1(log_block
+						+ OS_FILE_LOG_BLOCK_SIZE
+						- LOG_BLOCK_TRL_CHECKSUM));
+			}
+
+			if ((no & 0xFFFFFF)
+					!= log_block_get_trl_no(log_block)) {
+				fprintf(stderr,
+"InnoDB: Log block with header no %lu at lsn %lu %lu has\n"
+"InnoDB: trailer no %lu\n",
+				no, ut_dulint_get_high(scanned_lsn),
+				ut_dulint_get_low(scanned_lsn),
+				log_block_get_trl_no(log_block));
+			}

 			/* Garbage or an incompletely written log block */

@@ -2241,6 +2324,7 @@ recv_recovery_from_checkpoint_start(
 	dulint		archived_lsn;
 	ulint		capacity;
 	byte*		buf;
+	byte		log_hdr_buf[LOG_FILE_HDR_SIZE];
 	ulint		err;

 	ut_ad((type != LOG_CHECKPOINT)
@@ -2288,6 +2372,33 @@ recv_recovery_from_checkpoint_start(
 	checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO);
 	archived_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN);

+	/* Read the first log file header to print a note if this is
+	a recovery from a restored InnoDB Hot Backup */
+	
+	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, max_cp_group->space_id,
+				0, 0, LOG_FILE_HDR_SIZE,
+				log_hdr_buf, max_cp_group);
+
+	if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+				"ibbackup", ut_strlen("ibbackup"))) {
+		/* This log file was created by ibbackup --restore: print
+		a note to the user about it */
+
+		fprintf(stderr,
+	"InnoDB: The log file was created by ibbackup --restore at\n"
+	"InnoDB: %s\n", log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP);
+		
+		/* Wipe over the label now */
+
+		ut_memcpy(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+								"    ", 4);
+		/* Write to the log file to wipe over the label */
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE,
+				max_cp_group->space_id,
+				0, 0, OS_FILE_LOG_BLOCK_SIZE,
+				log_hdr_buf, max_cp_group);
+	}
+				
 	group = UT_LIST_GET_FIRST(log_sys->log_groups);

 	while (group) {
@@ -2471,7 +2582,7 @@ recv_recovery_from_checkpoint_finish(void)
 	/* Rollback the uncommitted transactions which have no user session */

 	if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
-		trx_rollback_all_without_sess();
+		trx_rollback_or_clean_all_without_sess();
 	}

 	/* Apply the hashed log records to the respective file pages */
@@ -2487,6 +2598,7 @@ recv_recovery_from_checkpoint_finish(void)
 	}

 	if (recv_needed_recovery) {
+		trx_sys_print_mysql_master_log_pos();
 		trx_sys_print_mysql_binlog_offset();
 	}

@@ -2614,10 +2726,9 @@ recv_reset_log_files_for_backup(

 	/* We pretend there is a checkpoint at lsn + LOG_BLOCK_HDR_SIZE */
 	
-	log_reset_first_header_and_checkpoint(buf,
-				ut_dulint_add(lsn, LOG_BLOCK_HDR_SIZE));
+	log_reset_first_header_and_checkpoint(buf, lsn);
 	
-	log_block_init(buf + LOG_FILE_HDR_SIZE, lsn);
+	log_block_init_in_old_format(buf + LOG_FILE_HDR_SIZE, lsn);
 	log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE,
 							LOG_BLOCK_HDR_SIZE);
 	sprintf(name, "%sib_logfile%lu", log_dir, 0);
@@ -2754,7 +2865,7 @@ ask_again:
 		if (ut_dulint_cmp(recv_sys->parse_start_lsn, start_lsn) < 0) {
 			fprintf(stderr, 
 	"InnoDB: Archive log file %s starts from too big a lsn\n",
-									name);	    
+								name);	    
 			return(TRUE);
 		}
 	
@@ -2765,7 +2876,7 @@ ask_again:

 		fprintf(stderr,
 	"InnoDB: Archive log file %s starts from a wrong lsn\n",
-									name);
+								name);
 		return(TRUE);
 	}


--- a/innobase/mtr/mtr0log.c
+++ b/innobase/mtr/mtr0log.c
@@ -290,7 +290,7 @@ mlog_write_string(
 		ut_a(0);
 	}
 	ut_ad(ptr && mtr);
-	ut_ad(len < UNIV_PAGE_SIZE);
+	ut_a(len < UNIV_PAGE_SIZE);

 	ut_memcpy(ptr, str, len);

@@ -338,9 +338,13 @@ mlog_parse_string(
 	offset = mach_read_from_2(ptr);
 	ptr += 2;

+	ut_a(offset < UNIV_PAGE_SIZE);
+
 	len = mach_read_from_2(ptr);
 	ptr += 2;

+	ut_a(len + offset < UNIV_PAGE_SIZE);
+
 	if (end_ptr < ptr + len) {

 		return(NULL);

--- a/innobase/mtr/mtr0mtr.c
+++ b/innobase/mtr/mtr0mtr.c
@@ -315,7 +315,7 @@ mtr_log_reserve_and_write(
 	}

 	data_size = dyn_array_get_data_size(mlog);
-	
+
 	/* Open the database log for log_write_low */
 	mtr->start_lsn = log_reserve_and_open(data_size); 


--- a/innobase/os/os0file.c
+++ b/innobase/os/os0file.c
@@ -22,6 +22,16 @@ Created 10/21/1995 Heikki Tuuri

 #endif

+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef __WIN__
+ulint	os_innodb_umask		= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+ulint	os_innodb_umask		= 0;
+#endif
+
 /* If the following is set to TRUE, we do not call os_file_flush in every
 os_file_write. We can set this TRUE if the doublewrite buffer is used. */
 ibool	os_do_not_call_flush_at_each_write	= FALSE;
@@ -32,7 +42,7 @@ OS does not provide an atomic pread or pwrite, or similar */
 os_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];

 /* In simulated aio, merge at most this many consecutive i/os */
-#define OS_AIO_MERGE_N_CONSECUTIVE	32
+#define OS_AIO_MERGE_N_CONSECUTIVE	64

 /* If this flag is TRUE, then we will use the native aio of the
 OS (provided we compiled Innobase with it in), otherwise we will
@@ -40,6 +50,8 @@ use simulated aio we build below with threads */

 ibool	os_aio_use_native_aio	= FALSE;

+ibool	os_aio_print_debug	= FALSE;
+
 /* The aio array slot structure */
 typedef struct os_aio_slot_struct	os_aio_slot_t;

@@ -115,7 +127,12 @@ os_aio_array_t*	os_aio_sync_array	= NULL;

 ulint	os_aio_n_segments	= ULINT_UNDEFINED;

+/* If the following is TRUE, read i/o handler threads try to
+wait until a batch of new read requests has been posted */
+ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
+
 ulint	os_n_file_reads		= 0;
+ulint	os_bytes_read_since_printout = 0;
 ulint	os_n_file_writes	= 0;
 ulint	os_n_fsyncs		= 0;
 ulint	os_n_file_reads_old	= 0;
@@ -412,8 +429,8 @@ try_again:
 	}

 	if (create_mode == OS_FILE_CREATE) {
-	        file = open(name, create_flag, S_IRUSR | S_IWUSR | S_IRGRP
-			                     | S_IWGRP | S_IROTH | S_IWOTH);
+	        file = open(name, create_flag, S_IRUSR | S_IWUSR
+						| S_IRGRP | S_IWGRP);
        } else {
                file = open(name, create_flag);
        }
@@ -548,8 +565,7 @@ try_again:
 	}
 #endif
 	if (create_mode == OS_FILE_CREATE) {
-	        file = open(name, create_flag, S_IRUSR | S_IWUSR | S_IRGRP
-			                     | S_IWGRP | S_IROTH | S_IWOTH);
+	        file = open(name, create_flag, os_innodb_umask);
        } else {
                file = open(name, create_flag);
        }
@@ -735,6 +751,8 @@ os_file_flush(

 	ut_a(file);

+	os_n_fsyncs++;
+
 	ret = FlushFileBuffers(file);

 	if (ret) {
@@ -957,6 +975,7 @@ os_file_read(
 	ut_a((offset & 0xFFFFFFFF) == offset);

 	os_n_file_reads++;
+	os_bytes_read_since_printout += n;

 try_again:	
 	ut_ad(file);
@@ -1626,13 +1645,40 @@ os_aio_simulated_wake_handler_threads(void)
 		/* We do not use simulated aio: do nothing */

 		return;
-	}
+	}	
+
+	os_aio_recommend_sleep_for_read_threads	= FALSE;

 	for (i = 0; i < os_aio_n_segments; i++) {
 		os_aio_simulated_wake_handler_thread(i);
 	}
 }

+/**************************************************************************
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+
+void
+os_aio_simulated_put_read_threads_to_sleep(void)
+/*============================================*/
+{
+	os_aio_array_t*	array;
+	ulint		g;
+
+	os_aio_recommend_sleep_for_read_threads	= TRUE;
+
+	for (g = 0; g < os_aio_n_segments; g++) {
+		os_aio_get_array_and_local_segment(&array, g);
+
+		if (array == os_aio_read_array) {
+		
+			os_event_reset(os_aio_segment_wait_events[g]);
+		}
+	}
+}
+
 /***********************************************************************
 Requests an asynchronous i/o operation. */

@@ -2042,15 +2088,10 @@ os_aio_simulated_handle(
 	ibool		ret;
 	ulint		n;
 	ulint		i;
-
+	
 	segment = os_aio_get_array_and_local_segment(&array, global_segment);
 	
 restart:
-	/* Give other threads chance to add several i/os to the array
-	at once */
-	
-	os_thread_yield();
-
 	/* NOTE! We only access constant fields in os_aio_array. Therefore
 	we do not have to acquire the protecting mutex yet */

@@ -2061,6 +2102,15 @@ restart:

 	/* Look through n slots after the segment * n'th slot */

+	if (array == os_aio_read_array
+	    && os_aio_recommend_sleep_for_read_threads) {
+
+		/* Give other threads a chance to add several i/os to the array
+		at once. */
+
+		goto recommended_sleep;
+	}
+	
 	os_mutex_enter(array->mutex);

 	/* Check if there is a slot for which the i/o has already been
@@ -2071,6 +2121,11 @@ restart:

 		if (slot->reserved && slot->io_already_done) {

+			if (os_aio_print_debug) {
+				fprintf(stderr,
+"InnoDB: i/o for slot %lu already done, returning\n", i);
+			}
+
 			ret = TRUE;
 			
 			goto slot_io_done;
@@ -2177,6 +2232,13 @@ consecutive_loop:

 	srv_io_thread_op_info[global_segment] = (char*) "doing file i/o";

+	if (os_aio_print_debug) {
+		fprintf(stderr,
+"InnoDB: doing i/o of type %lu at offset %lu %lu, length %lu\n",
+			slot->type, slot->offset_high, slot->offset,
+			total_len);
+	}
+
 	/* Do the i/o with ordinary, synchronous i/o functions: */
 	if (slot->type == OS_FILE_WRITE) {
 		ret = os_file_write(slot->name, slot->file, combined_buf,
@@ -2244,10 +2306,18 @@ wait_for_io:

 	os_mutex_exit(array->mutex);

-	srv_io_thread_op_info[global_segment] = (char*) "waiting for i/o request";
+recommended_sleep:
+	srv_io_thread_op_info[global_segment] =
+				(char*)"waiting for i/o request";

 	os_event_wait(os_aio_segment_wait_events[global_segment]);

+	if (os_aio_print_debug) {
+		fprintf(stderr,
+"InnoDB: i/o handler thread for i/o segment %lu wakes up\n",
+			global_segment);
+	}
+	
 	goto restart;
 }

@@ -2316,6 +2386,7 @@ os_aio_print(void)
 	ulint		n_reserved;
 	time_t		current_time;
 	double		time_elapsed;
+	double		avg_bytes_read;
 	ulint		i;

 	for (i = 0; i < srv_n_file_io_threads; i++) {
@@ -2392,9 +2463,19 @@ loop:
 	       fil_n_pending_log_flushes, fil_n_pending_tablespace_flushes);
 	printf("%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
 		os_n_file_reads, os_n_file_writes, os_n_fsyncs);
-	printf("%.2f reads/s, %.2f writes/s, %.2f fsyncs/s\n",
+
+	if (os_n_file_reads == os_n_file_reads_old) {
+		avg_bytes_read = 0.0;
+	} else {
+		avg_bytes_read = os_bytes_read_since_printout /
+				(os_n_file_reads - os_n_file_reads_old);
+	}
+
+	printf(
+"%.2f reads/s, %lu avg bytes/read, %.2f writes/s, %.2f fsyncs/s\n",
 		(os_n_file_reads - os_n_file_reads_old)
 		/ time_elapsed,
+		(ulint)avg_bytes_read,
 		(os_n_file_writes - os_n_file_writes_old)
 		/ time_elapsed,
 		(os_n_fsyncs - os_n_fsyncs_old)
@@ -2403,6 +2484,7 @@ loop:
 	os_n_file_reads_old = os_n_file_reads;
 	os_n_file_writes_old = os_n_file_writes;
 	os_n_fsyncs_old = os_n_fsyncs;
+	os_bytes_read_since_printout = 0;
 	
 	os_last_printout = current_time;
 }

--- a/innobase/page/page0cur.c
+++ b/innobase/page/page0cur.c
@@ -403,6 +403,8 @@ page_cur_insert_rec_write_log(
 	byte*	log_ptr;
 	ulint	i;

+	ut_a(rec_size < UNIV_PAGE_SIZE);
+
 	log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN);

 	if (log_ptr == NULL) {
@@ -491,6 +493,8 @@ page_cur_insert_rec_write_log(

 	mlog_close(mtr, log_ptr);

+	ut_a(rec_size - i < UNIV_PAGE_SIZE);
+
 	if (rec_size - i >= MLOG_BUF_MARGIN) {
 		mlog_catenate_string(mtr, ins_ptr, rec_size - i);
 	}
@@ -602,6 +606,9 @@ page_cur_parse_insert_rec(

 	/* Build the inserted record to buf */
 	
+	ut_a(mismatch_index < UNIV_PAGE_SIZE);
+	ut_a(end_seg_len < UNIV_PAGE_SIZE);
+
 	ut_memcpy(buf, rec_get_start(cursor_rec), mismatch_index);
 	ut_memcpy(buf + mismatch_index, ptr, end_seg_len);

@@ -937,6 +944,8 @@ page_copy_rec_list_end_to_created_page(

 	log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len;

+	ut_a(log_data_len < 100 * UNIV_PAGE_SIZE);
+
 	mach_write_to_4(log_ptr, log_data_len);
 	
 	rec_set_next_offs(insert_rec, PAGE_SUPREMUM);

--- a/innobase/page/page0page.c
+++ b/innobase/page/page0page.c
@@ -17,6 +17,7 @@ Created 2/2/1994 Heikki Tuuri
 #include "lock0lock.h"
 #include "fut0lst.h"
 #include "btr0sea.h"
+#include "buf0buf.h"

 /* A cached template page used in page_create */
 page_t*	page_template	= NULL;
@@ -63,6 +64,65 @@ Assuming a page size of 8 kB, a typical index page of a secondary
 index contains 300 index entries, and the size of the page directory
 is 50 x 4 bytes = 200 bytes. */

+/*******************************************************************
+Looks for the directory slot which owns the given record. */
+
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+			/* out: the directory slot number */
+	rec_t*	rec)	/* in: the physical record */
+{
+	ulint			i;
+	ulint			steps		= 0;
+	page_t*			page;	
+	page_dir_slot_t*	slot;
+	rec_t*			original_rec	= rec;
+	char			err_buf[1000];
+	
+	ut_ad(page_rec_check(rec));
+
+	while (rec_get_n_owned(rec) == 0) {
+		steps++;
+		rec = page_rec_get_next(rec);
+	}
+	
+	page = buf_frame_align(rec);
+
+	i = page_dir_get_n_slots(page) - 1;
+	slot = page_dir_get_nth_slot(page, i); 
+
+	while (page_dir_slot_get_rec(slot) != rec) {
+
+ 		if (i == 0) {
+			fprintf(stderr,
+		"InnoDB: Probable data corruption on page %lu\n",
+			buf_frame_get_page_no(page));
+
+			rec_sprintf(err_buf, 900, original_rec);
+
+	  		fprintf(stderr,
+		"InnoDB: Original record %s\n"
+		"InnoDB: on that page. Steps %lu.\n", err_buf, steps);
+
+			rec_sprintf(err_buf, 900, rec);
+
+	  		fprintf(stderr,
+		"InnoDB: Cannot find the dir slot for record %s\n"
+		"InnoDB: on that page!\n", err_buf);
+
+			buf_page_print(page);
+
+	  		ut_a(0);
+	  	}
+
+		i--;
+		slot = page_dir_get_nth_slot(page, i); 
+	}
+
+	return(i);
+}
+
 /******************************************************************
 Used to check the consistency of a directory slot. */
 static

--- a/innobase/rem/rem0cmp.c
+++ b/innobase/rem/rem0cmp.c
@@ -104,7 +104,9 @@ cmp_types_are_equal(
        if ((type1->mtype == DATA_VARCHAR && type2->mtype == DATA_CHAR)
          || (type1->mtype == DATA_CHAR && type2->mtype == DATA_VARCHAR)
          || (type1->mtype == DATA_FIXBINARY && type2->mtype == DATA_BINARY)
-          || (type1->mtype == DATA_BINARY && type2->mtype == DATA_FIXBINARY)) {
+          || (type1->mtype == DATA_BINARY && type2->mtype == DATA_FIXBINARY)
+          || (type1->mtype == DATA_MYSQL && type2->mtype == DATA_VARMYSQL)
+          || (type1->mtype == DATA_VARMYSQL && type2->mtype == DATA_MYSQL)) {

                return(TRUE);
        }
@@ -124,14 +126,9 @@ cmp_types_are_equal(
 		return(FALSE);
 	}

-	if (type1->mtype == DATA_MYSQL
-	   || type1->mtype == DATA_VARMYSQL) {
+        if (type1->mtype == DATA_INT && type1->len != type2->len) {
 	
-		if ((type1->prtype & ~DATA_NOT_NULL)
-			!= (type2->prtype & ~DATA_NOT_NULL)) {
-
-			return(FALSE);
-		}
+		return(FALSE);
 	}

 	return(TRUE);

--- a/innobase/row/row0ins.c
+++ b/innobase/row/row0ins.c
@@ -609,7 +609,7 @@ the caller must have a shared latch on dict_foreign_key_check_lock. */
 ulint
 row_ins_check_foreign_constraint(
 /*=============================*/
-				/* out: DB_SUCCESS, DB_LOCK_WAIT,
+				/* out: DB_SUCCESS,
 				DB_NO_REFERENCED_ROW,
 				or DB_ROW_IS_REFERENCED */
 	ibool		check_ref,/* in: TRUE if we want to check that
@@ -635,6 +635,7 @@ row_ins_check_foreign_constraint(
 	ulint		i;
 	mtr_t		mtr;

+run_again:
 	ut_ad(rw_lock_own(&dict_foreign_key_check_lock, RW_LOCK_SHARED));

 	if (thr_get_trx(thr)->check_foreigns == FALSE) {
@@ -682,7 +683,7 @@ row_ins_check_foreign_constraint(

 		if (err != DB_SUCCESS) {

-			return(err);
+			goto do_possible_lock_wait;
 		}
 	}

@@ -727,6 +728,11 @@ row_ins_check_foreign_constraint(
 			if (!rec_get_deleted_flag(rec)) {
 				/* Found a matching record */

+/*				printf(
+"FOREIGN: Found matching record from %s %s\n",
+		check_index->table_name, check_index->name);
+				rec_print(rec);
+*/
 				if (check_ref) {			
 					err = DB_SUCCESS;

@@ -779,6 +785,17 @@ next_rec:
 	/* Restore old value */
 	dtuple_set_n_fields_cmp(entry, n_fields_cmp);

+do_possible_lock_wait:
+	if (err == DB_LOCK_WAIT) {
+		thr_get_trx(thr)->error_state = err;
+
+		que_thr_stop_for_mysql(thr);
+	
+		row_mysql_handle_errors(&err, thr_get_trx(thr), thr, NULL);
+
+		goto run_again;
+	}
+
 	return(err);
 }

@@ -792,8 +809,7 @@ static
 ulint
 row_ins_check_foreign_constraints(
 /*==============================*/
-				/* out: DB_SUCCESS, DB_LOCK_WAIT, or error
-				code */
+				/* out: DB_SUCCESS or error code */
 	dict_table_t*	table,	/* in: table */
 	dict_index_t*	index,	/* in: index */
 	dtuple_t*	entry,	/* in: index entry for index */

--- a/innobase/row/row0mysql.c
+++ b/innobase/row/row0mysql.c
@@ -934,6 +934,7 @@ row_update_for_mysql(
 	ut_ad(!prebuilt->sql_stat_start);

 	que_thr_move_to_run_state_for_mysql(thr, trx);
+
 run_again:
 	thr->run_node = node;
 	thr->prev_node = node;
@@ -998,7 +999,6 @@ row_update_cascade_for_mysql(
 	trx_t*		trx;

 	trx = thr_get_trx(thr);
-
 run_again:
 	thr->run_node = node;
 	thr->prev_node = node;
@@ -1130,6 +1130,35 @@ row_mysql_recover_tmp_table(
 	return(row_rename_table_for_mysql(old_name, table->name, trx));
 }

+/*************************************************************************
+Locks the data dictionary exclusively for performing a table create
+operation. */
+
+void
+row_mysql_lock_data_dictionary(void)
+/*================================*/
+{
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks or lock waits can occur then in these operations */
+
+	rw_lock_x_lock(&(dict_foreign_key_check_lock));
+	mutex_enter(&(dict_sys->mutex));
+}
+
+/*************************************************************************
+Unlocks the data dictionary exclusive lock. */
+
+void
+row_mysql_unlock_data_dictionary(void)
+/*==================================*/
+{
+	/* Release the dictionary mutex and the foreign key check latch in
+	the reverse order of row_mysql_lock_data_dictionary */
+
+	mutex_exit(&(dict_sys->mutex));
+	rw_lock_x_unlock(&(dict_foreign_key_check_lock));
+}
+
 /*************************************************************************
 Does a table creation operation for MySQL. If the name of the created
 table ends to characters INNODB_MONITOR, then this also starts
@@ -1150,6 +1179,7 @@ row_create_table_for_mysql(
 	ulint		err;

 	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_ad(mutex_own(&(dict_sys->mutex)));
 	
 	if (srv_created_new_raw || srv_force_recovery) {
 		fprintf(stderr,
@@ -1263,19 +1293,13 @@ row_create_table_for_mysql(
 		 "to use this feature you must compile InnoDB with\n"
 		 "UNIV_MEM_DEBUG defined in univ.i and the server must be\n"
 		 "quiet because allocation from a mem heap is not protected\n"
-		       "by any semaphore.\n");
+		 "by any semaphore.\n");

 		ut_a(mem_validate());
 		      
 		printf("Memory validated\n");
 	}

-	/* Serialize data dictionary operations with dictionary mutex:
-	no deadlocks can occur then in these operations */
-
-	rw_lock_x_lock(&(dict_foreign_key_check_lock));
-	mutex_enter(&(dict_sys->mutex));
-
 	heap = mem_heap_create(512);

 	trx->dict_operation = TRUE;
@@ -1325,9 +1349,6 @@ row_create_table_for_mysql(
 		trx->error_state = DB_SUCCESS;
 	}

-	mutex_exit(&(dict_sys->mutex));
-	rw_lock_x_unlock(&(dict_foreign_key_check_lock));
-
 	que_graph_free((que_t*) que_node_get_parent(thr));

 	trx->op_info = "";
@@ -1354,6 +1375,7 @@ row_create_index_for_mysql(
 	ulint		keywordlen;
 	ulint		err;
 	
+	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
 	
 	trx->op_info = "creating index";
@@ -1372,12 +1394,6 @@ row_create_index_for_mysql(
 		return(DB_SUCCESS);
 	}

-	/* Serialize data dictionary operations with dictionary mutex:
-	no deadlocks can occur then in these operations */
-
-	rw_lock_x_lock(&(dict_foreign_key_check_lock));
-	mutex_enter(&(dict_sys->mutex));
-
 	heap = mem_heap_create(512);

 	trx->dict_operation = TRUE;
@@ -1405,9 +1421,6 @@ row_create_index_for_mysql(
 		trx->error_state = DB_SUCCESS;
 	}

-	mutex_exit(&(dict_sys->mutex));
-	rw_lock_x_unlock(&(dict_foreign_key_check_lock));
-
 	que_graph_free((que_t*) que_node_get_parent(thr));
 	
 	trx->op_info = "";
@@ -1441,6 +1454,7 @@ row_table_add_foreign_constraints(
 	ulint	keywordlen;
 	ulint	err;

+	ut_ad(mutex_own(&(dict_sys->mutex)));
 	ut_a(sql_string);
 	
 	trx->op_info = "adding foreign keys";
@@ -1459,12 +1473,6 @@ row_table_add_foreign_constraints(
 		return(DB_SUCCESS);
 	}

-	/* Serialize data dictionary operations with dictionary mutex:
-	no deadlocks can occur then in these operations */
-
-	rw_lock_x_lock(&(dict_foreign_key_check_lock));
-	mutex_enter(&(dict_sys->mutex));
-
 	trx->dict_operation = TRUE;

 	err = dict_create_foreign_constraints(trx, sql_string, name);
@@ -1486,9 +1494,6 @@ row_table_add_foreign_constraints(
 		trx->error_state = DB_SUCCESS;
 	}

-	mutex_exit(&(dict_sys->mutex));
-	rw_lock_x_unlock(&(dict_foreign_key_check_lock));
-
 	return((int) err);
 }

@@ -1917,6 +1922,13 @@ row_drop_table_for_mysql(
 		ut_a(0);
 	} else {
 		dict_table_remove_from_cache(table);
+
+		if (dict_load_table(name) != NULL) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+"  InnoDB: Error: dropping of table %s failed!\n", name);
+
+		}
 	}
 funct_exit:	
 	rw_lock_s_unlock(&(purge_sys->purge_is_running));

--- a/innobase/row/row0purge.c
+++ b/innobase/row/row0purge.c
@@ -511,6 +511,14 @@ row_purge_parse_undo_rec(

 	clust_index = dict_table_get_first_index(node->table);

+	if (clust_index == NULL) {
+		/* The table was corrupt in the data dictionary */
+
+		rw_lock_x_unlock(&(purge_sys->purge_is_running));
+
+		return(FALSE);
+	}
+
 	ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
 								node->heap);


--- a/innobase/row/row0upd.c
+++ b/innobase/row/row0upd.c
@@ -129,8 +129,7 @@ static
 ulint
 row_upd_check_references_constraints(
 /*=================================*/
-				/* out: DB_SUCCESS, DB_LOCK_WAIT, or an error
-				code */
+				/* out: DB_SUCCESS or an error code */
 	btr_pcur_t*	pcur,	/* in: cursor positioned on a record; NOTE: the
 				cursor position is lost in this function! */
 	dict_table_t*	table,	/* in: table in question */
@@ -626,7 +625,7 @@ row_upd_index_parse(

 /*******************************************************************
 Returns TRUE if ext_vec contains i. */
-UNIV_INLINE
+static
 ibool
 upd_ext_vec_contains(
 /*=================*/
@@ -738,6 +737,7 @@ row_upd_build_difference_binary(
 	ulint		n_diff;
 	ulint		roll_ptr_pos;
 	ulint		trx_id_pos;
+	ibool		extern_bit;
 	ulint		i;

 	/* This function is used only for a clustered index */
@@ -763,9 +763,10 @@ row_upd_build_difference_binary(

 			goto skip_compare;
 		}
+
+		extern_bit = rec_get_nth_field_extern_bit(rec, i);
 		
-		if (rec_get_nth_field_extern_bit(rec, i)
-		    != upd_ext_vec_contains(ext_vec, n_ext_vec, i)
+		if (extern_bit != upd_ext_vec_contains(ext_vec, n_ext_vec, i)
 		    || !dfield_data_is_binary_equal(dfield, len, data)) {

 			upd_field = upd_get_nth_field(update, n_diff);
@@ -1362,7 +1363,7 @@ ulint
 row_upd_del_mark_clust_rec(
 /*=======================*/
 				/* out: DB_SUCCESS if operation successfully
-				completed, else error code or DB_LOCK_WAIT */
+				completed, else error code */
 	upd_node_t*	node,	/* in: row update node */
 	dict_index_t*	index,	/* in: clustered index */
 	que_thr_t*	thr,	/* in: query thread */
@@ -1381,8 +1382,6 @@ row_upd_del_mark_clust_rec(
 	pcur = node->pcur;
 	btr_cur = btr_pcur_get_btr_cur(pcur);

-	ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
-
 	/* Store row because we have to build also the secondary index
 	entries */
 	
@@ -1391,11 +1390,11 @@ row_upd_del_mark_clust_rec(
 	/* Mark the clustered index record deleted; we do not have to check
 	locks, because we assume that we have an x-lock on the record */

-	err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, btr_cur,
-							TRUE, thr, mtr);
+	err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+						btr_cur, TRUE, thr, mtr);
 	if (err == DB_SUCCESS && check_ref) {
-		/* NOTE that the following call loses
-		the position of pcur ! */
+		/* NOTE that the following call loses the position of pcur ! */
+
 		err = row_upd_check_references_constraints(pcur, index->table,
 							index, thr, mtr);
 		if (err != DB_SUCCESS) {

--- a/innobase/srv/srv0srv.c
+++ b/innobase/srv/srv0srv.c
@@ -639,7 +639,7 @@ srv_release_threads(
 	
 		slot = srv_table_get_nth_slot(i);

-		if ((slot->type == type) && slot->suspended) {
+		if (slot->in_use && slot->type == type && slot->suspended) {
 			
 			slot->suspended = FALSE;

@@ -1631,6 +1631,7 @@ srv_init(void)
 	for (i = 0; i < OS_THREAD_MAX_N; i++) {
 		slot = srv_mysql_table + i;
 		slot->in_use = FALSE;
+		slot->type = 0;
 		slot->event = os_event_create(NULL);
 		ut_a(slot->event);
 	}
@@ -1890,8 +1891,6 @@ srv_conc_exit_innodb(
 	trx_t*	trx)	/* in: transaction object associated with the
 			thread */
 {
-	srv_conc_slot_t*	slot	= NULL;
-
 	if (srv_thread_concurrency >= 500) {
 	
 		return;
@@ -2200,10 +2199,12 @@ loop:
 		       "FILE I/O\n"
 		       "--------\n");
 		os_aio_print();
-		printf("-------------\n"
-		       "INSERT BUFFER\n"
-		       "-------------\n");
+		printf("-------------------------------------\n"
+		       "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+		       "-------------------------------------\n");
 		ibuf_print();
+		printf("Successful hash searches %lu, non-hash searches %lu\n",
+			btr_cur_n_sea, btr_cur_n_non_sea);
 		printf("---\n"
 		       "LOG\n"
 		       "---\n");
@@ -2498,18 +2499,19 @@ loop:
 	for (i = 0; i < 10; i++) {
 		n_ios_old = log_sys->n_log_ios + buf_pool->n_pages_read
 						+ buf_pool->n_pages_written;
-		srv_main_thread_op_info = "sleeping";
+		srv_main_thread_op_info = (char*)"sleeping";
 		os_thread_sleep(1000000);

 		/* ALTER TABLE in MySQL requires on Unix that the table handler
 		can drop tables lazily after there no longer are SELECT
 		queries to them. */

-		srv_main_thread_op_info = "doing background drop tables";
+		srv_main_thread_op_info =
+					(char*)"doing background drop tables";

 		row_drop_tables_for_mysql_in_background();

-		srv_main_thread_op_info = "";
+		srv_main_thread_op_info = (char*)"";

 		if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {

@@ -2520,8 +2522,9 @@ loop:
 		is issued or the we have specified in my.cnf no flush
 		at transaction commit */

-		srv_main_thread_op_info = "flushing log";
+		srv_main_thread_op_info = (char*)"flushing log";
 		log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+		log_flush_to_disk();

 		/* If there were less than 10 i/os during the
 		one second sleep, we assume that there is free
@@ -2533,11 +2536,14 @@ loop:
 		n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
 						+ buf_pool->n_pages_written;
 		if (n_pend_ios < 3 && (n_ios - n_ios_old < 10)) {
-			srv_main_thread_op_info = "doing insert buffer merge";
+			srv_main_thread_op_info =
+					(char*)"doing insert buffer merge";
 			ibuf_contract_for_n_pages(TRUE, 5);

-			srv_main_thread_op_info = "flushing log";
+			srv_main_thread_op_info =
+						(char*)"flushing log";
 			log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+			log_flush_to_disk();
 		}
 		
 		if (srv_fast_shutdown && srv_shutdown_state > 0) {
@@ -2578,16 +2584,18 @@ loop:

 		srv_main_thread_op_info = "flushing log";
 		log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+		log_flush_to_disk();
 	}

 	/* We run a batch of insert buffer merge every 10 seconds,
 	even if the server were active */

-	srv_main_thread_op_info = "doing insert buffer merge";
+	srv_main_thread_op_info = (char*)"doing insert buffer merge";
 	ibuf_contract_for_n_pages(TRUE, 5);

-	srv_main_thread_op_info = "flushing log";
+	srv_main_thread_op_info = (char*)"flushing log";
 	log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+	log_flush_to_disk();

 	/* We run a full purge every 10 seconds, even if the server
 	were active */
@@ -2603,7 +2611,7 @@ loop:
 			goto background_loop;
 		}

-		srv_main_thread_op_info = "purging";
+		srv_main_thread_op_info = (char*)"purging";
 		n_pages_purged = trx_purge();

 		current_time = time(NULL);
@@ -2612,6 +2620,7 @@ loop:
 			srv_main_thread_op_info = "flushing log";

 		        log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
+			log_flush_to_disk();
 			last_flush_time = current_time;
 		}
 	}
@@ -2620,25 +2629,25 @@ background_loop:
 	/* In this loop we run background operations when the server
 	is quiet and we also come here about once in 10 seconds */

-	srv_main_thread_op_info = "doing background drop tables";
+	srv_main_thread_op_info = (char*)"doing background drop tables";

 	n_tables_to_drop = row_drop_tables_for_mysql_in_background();

-	srv_main_thread_op_info = "";
+	srv_main_thread_op_info = (char*)"";
 	
-	srv_main_thread_op_info = "flushing buffer pool pages";
+	srv_main_thread_op_info = (char*)"flushing buffer pool pages";

 	/* Flush a few oldest pages to make the checkpoint younger */

 	n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10, ut_dulint_max);

-	srv_main_thread_op_info = "making checkpoint";
+	srv_main_thread_op_info = (char*)"making checkpoint";

 	/* Make a new checkpoint about once in 10 seconds */

 	log_checkpoint(TRUE, FALSE);

-	srv_main_thread_op_info = "reserving kernel mutex";
+	srv_main_thread_op_info = (char*)"reserving kernel mutex";

 	mutex_enter(&kernel_mutex);
 	if (srv_activity_count != old_activity_count) {
@@ -2651,11 +2660,11 @@ background_loop:
 	/* The server has been quiet for a while: start running background
 	operations */
 		
-	srv_main_thread_op_info = "purging";
+	srv_main_thread_op_info = (char*)"purging";

 	n_pages_purged = trx_purge();

-	srv_main_thread_op_info = "reserving kernel mutex";
+	srv_main_thread_op_info = (char*)"reserving kernel mutex";

 	mutex_enter(&kernel_mutex);
 	if (srv_activity_count != old_activity_count) {
@@ -2664,10 +2673,10 @@ background_loop:
 	}
 	mutex_exit(&kernel_mutex);

-	srv_main_thread_op_info = "doing insert buffer merge";
+	srv_main_thread_op_info = (char*)"doing insert buffer merge";
 	n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);

-	srv_main_thread_op_info = "reserving kernel mutex";
+	srv_main_thread_op_info = (char*)"reserving kernel mutex";

 	mutex_enter(&kernel_mutex);
 	if (srv_activity_count != old_activity_count) {
@@ -2676,10 +2685,10 @@ background_loop:
 	}
 	mutex_exit(&kernel_mutex);
 	
-	srv_main_thread_op_info = "flushing buffer pool pages";
+	srv_main_thread_op_info = (char*)"flushing buffer pool pages";
 	n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);

-	srv_main_thread_op_info = "reserving kernel mutex";
+	srv_main_thread_op_info = (char*)"reserving kernel mutex";

 	mutex_enter(&kernel_mutex);
 	if (srv_activity_count != old_activity_count) {
@@ -2691,11 +2700,11 @@ background_loop:
 	srv_main_thread_op_info = "waiting for buffer pool flush to end";
 	buf_flush_wait_batch_end(BUF_FLUSH_LIST);

-	srv_main_thread_op_info = "making checkpoint";
+	srv_main_thread_op_info = (char*)"making checkpoint";

 	log_checkpoint(TRUE, FALSE);

-	srv_main_thread_op_info = "reserving kernel mutex";
+	srv_main_thread_op_info = (char*)"reserving kernel mutex";

 	mutex_enter(&kernel_mutex);
 	if (srv_activity_count != old_activity_count) {
@@ -2704,7 +2713,8 @@ background_loop:
 	}
 	mutex_exit(&kernel_mutex);

-	srv_main_thread_op_info = "archiving log (if log archive is on)";
+	srv_main_thread_op_info =
+				(char*)"archiving log (if log archive is on)";
 	
 	log_archive_do(FALSE, &n_bytes_archived);

@@ -2730,7 +2740,7 @@ background_loop:
 	master thread to wait for more server activity */
 	
 suspend_thread:
-	srv_main_thread_op_info = "suspending";
+	srv_main_thread_op_info = (char*)"suspending";

 	mutex_enter(&kernel_mutex);

@@ -2744,7 +2754,7 @@ suspend_thread:

 	mutex_exit(&kernel_mutex);

-	srv_main_thread_op_info = "waiting for server activity";
+	srv_main_thread_op_info = (char*)"waiting for server activity";

 	os_event_wait(event);


--- a/innobase/srv/srv0start.c
+++ b/innobase/srv/srv0start.c
@@ -932,6 +932,26 @@ innobase_start_or_create_for_mysql(void)
 	ulint	k;
 	mtr_t   mtr;

+#ifdef UNIV_DEBUG
+	fprintf(stderr,
+"InnoDB: !!!!!!!!!!!!!! UNIV_DEBUG switched on !!!!!!!!!!!!!!!\n"); 
+#endif
+
+#ifdef UNIV_SYNC_DEBUG
+	fprintf(stderr,
+"InnoDB: !!!!!!!!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!!!!!!!\n"); 
+#endif
+
+#ifdef UNIV_SEARCH_DEBUG
+	fprintf(stderr,
+"InnoDB: !!!!!!!!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!!!!!!!\n"); 
+#endif
+
+#ifdef UNIV_MEM_DEBUG
+	fprintf(stderr,
+"InnoDB: !!!!!!!!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!!!!!!!\n"); 
+#endif
+
 	log_do_write = TRUE;
 /*	yydebug = TRUE; */

@@ -999,7 +1019,7 @@ innobase_start_or_create_for_mysql(void)
 	os_aio_use_native_aio = FALSE;
 	
 	if (!os_aio_use_native_aio) {
-		os_aio_init(4 * SRV_N_PENDING_IOS_PER_THREAD
+		os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
 						* srv_n_file_io_threads,
 					srv_n_file_io_threads,
 					SRV_MAX_N_PENDING_SYNC_IOS);

--- a/innobase/trx/trx0roll.c
+++ b/innobase/trx/trx0roll.c
@@ -160,11 +160,13 @@ trx_rollback_last_sql_stat_for_mysql(
 }

 /***********************************************************************
-Rollback uncommitted transactions which have no user session. */
+Rollback or clean up transactions which have no user session. If the
+transaction already was committed, then we clean up a possible insert
+undo log. If the transaction was not yet committed, then we roll it back. */

 void
-trx_rollback_all_without_sess(void)
-/*===============================*/
+trx_rollback_or_clean_all_without_sess(void)
+/*========================================*/
 {
 	mem_heap_t*	heap;
 	que_fork_t*	fork;
@@ -217,6 +219,19 @@ loop:

 	trx->sess = trx_dummy_sess;
 	
+	if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
+
+		fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n",
+					ut_dulint_get_high(trx->id),
+					ut_dulint_get_low(trx->id));
+
+		trx_cleanup_at_db_startup(trx);
+					
+		mem_heap_free(heap);
+
+		goto loop;
+	}
+
 	fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
 	fork->trx = trx;

@@ -264,9 +279,17 @@ loop:
 		/* If the transaction was for a dictionary operation, we
 		drop the relevant table, if it still exists */

+		fprintf(stderr,
+"InnoDB: Dropping table with id %lu %lu in recovery if it exists\n",
+			ut_dulint_get_high(trx->table_id),
+			ut_dulint_get_low(trx->table_id));
+
 		table = dict_table_get_on_id_low(trx->table_id, trx);

 		if (table) {		
+			fprintf(stderr,
+"InnoDB: Table found: dropping table %s in recovery\n", table->name);
+
 			err = row_drop_table_for_mysql(table->name, trx,
 								TRUE);
 			ut_a(err == (int) DB_SUCCESS);

--- a/innobase/trx/trx0sys.c
+++ b/innobase/trx/trx0sys.c
@@ -26,6 +26,14 @@ Created 3/26/1996 Heikki Tuuri
 trx_sys_t*		trx_sys 	= NULL;
 trx_doublewrite_t*	trx_doublewrite = NULL;

+/* In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. We have successfully got the updates to InnoDB
+up to this position. If .._pos is -1, it means no crash recovery was needed,
+or there was no master log position info inside InnoDB. */
+
+char 		trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+ib_longlong	trx_sys_mysql_master_log_pos	= -1;
+
 /********************************************************************
 Determines if a page number is located inside the doublewrite buffer. */

@@ -427,75 +435,62 @@ trx_sys_flush_max_trx_id(void)

 /*********************************************************************
 Updates the offset information about the end of the MySQL binlog entry
-which corresponds to the transaction just being committed. */
+which corresponds to the transaction just being committed. In a MySQL
+replication slave updates the latest master binlog position up to which
+replication has proceeded. */

 void
 trx_sys_update_mysql_binlog_offset(
 /*===============================*/
-	trx_t*	trx,	/* in: transaction being committed */
-	mtr_t*	mtr)	/* in: mtr */
+	char*		file_name,/* in: MySQL log file name */
+	ib_longlong	offset,	/* in: position in that log file */
+	ulint		field,	/* in: offset of the MySQL log info field in
+				the trx sys header */
+	mtr_t*		mtr)	/* in: mtr */
 {
 	trx_sysf_t*	sys_header;
-	char		namebuf[TRX_SYS_MYSQL_LOG_NAME_LEN];
-	
-	ut_ad(trx->mysql_log_file_name);

-	memset(namebuf, ' ', TRX_SYS_MYSQL_LOG_NAME_LEN - 1);
-	namebuf[TRX_SYS_MYSQL_LOG_NAME_LEN - 1] = '\0';
+	if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) {

-	/* Copy the whole MySQL log file name to the buffer, or only the
-	last characters, if it does not fit */
+		/* We cannot fit the name into the 512 bytes we have reserved */

-	if (ut_strlen(trx->mysql_log_file_name)
-			> TRX_SYS_MYSQL_LOG_NAME_LEN - 1) {
-		ut_memcpy(namebuf, trx->mysql_log_file_name
-			+ ut_strlen(trx->mysql_log_file_name)
-			- (TRX_SYS_MYSQL_LOG_NAME_LEN - 1),
-			TRX_SYS_MYSQL_LOG_NAME_LEN - 1);
-	} else {
-		ut_memcpy(namebuf, trx->mysql_log_file_name,
-				1 + ut_strlen(trx->mysql_log_file_name));
+		return;
 	}

-	namebuf[TRX_SYS_MYSQL_LOG_NAME_LEN - 1] = '\0';
-
 	sys_header = trx_sysf_get(mtr);

-	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+	if (mach_read_from_4(sys_header + field
 					+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
 	   != TRX_SYS_MYSQL_LOG_MAGIC_N) {

-	   	mlog_write_ulint(sys_header + TRX_SYS_MYSQL_LOG_INFO
+	   	mlog_write_ulint(sys_header + field
 					+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
 				TRX_SYS_MYSQL_LOG_MAGIC_N,
 				MLOG_4BYTES, mtr);
 	}

-	if (0 != ut_memcmp(sys_header + TRX_SYS_MYSQL_LOG_INFO
-					+ TRX_SYS_MYSQL_LOG_NAME,
-			 namebuf, TRX_SYS_MYSQL_LOG_NAME_LEN)) {
+	if (0 != ut_memcmp(sys_header + field + TRX_SYS_MYSQL_LOG_NAME,
+			file_name, 1 + ut_strlen(file_name))) {

-		mlog_write_string(sys_header + TRX_SYS_MYSQL_LOG_INFO
+		mlog_write_string(sys_header + field
 					+ TRX_SYS_MYSQL_LOG_NAME,
-				namebuf, TRX_SYS_MYSQL_LOG_NAME_LEN, mtr);
+			file_name, 1 + ut_strlen(file_name), mtr);
 	}

-	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+	if (mach_read_from_4(sys_header + field
 					+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
-	   || (trx->mysql_log_offset >> 32) > 0) {
+	   || (offset >> 32) > 0) {
 				
-		mlog_write_ulint(sys_header + TRX_SYS_MYSQL_LOG_INFO
+		mlog_write_ulint(sys_header + field
 					+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
-				(ulint)(trx->mysql_log_offset >> 32),
+				(ulint)(offset >> 32),
 				MLOG_4BYTES, mtr);
 	}

-	mlog_write_ulint(sys_header + TRX_SYS_MYSQL_LOG_INFO
+	mlog_write_ulint(sys_header + field
 					+ TRX_SYS_MYSQL_LOG_OFFSET_LOW,
-				(ulint)(trx->mysql_log_offset & 0xFFFFFFFF),
+				(ulint)(offset & 0xFFFFFFFF),
 				MLOG_4BYTES, mtr);				
-
-	trx->mysql_log_file_name = NULL;
 }

 /*********************************************************************
@@ -533,6 +528,58 @@ trx_sys_print_mysql_binlog_offset(void)
 	mtr_commit(&mtr);
 }

+/*********************************************************************
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+
+void
+trx_sys_print_mysql_master_log_pos(void)
+/*====================================*/
+{
+	trx_sysf_t*	sys_header;
+	mtr_t		mtr;
+	
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	   != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	fprintf(stderr,
+"InnoDB: In a MySQL replication slave the last master binlog file\n"
+"InnoDB: position %lu %lu, file name %s\n",
+		mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+		mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+		sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+						+ TRX_SYS_MYSQL_LOG_NAME);
+	/* Copy the master log position info to global variables we can
+	use in ha_innobase.cc to initialize glob_mi to right values */
+
+	ut_memcpy(trx_sys_mysql_master_log_name,
+		sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+						+ TRX_SYS_MYSQL_LOG_NAME,
+			TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+	trx_sys_mysql_master_log_pos = 
+		(((ib_longlong)mach_read_from_4(
+			sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH))
+		<< 32)
+		+ (ib_longlong)
+		mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+	mtr_commit(&mtr);
+}
+
 /********************************************************************
 Looks for a free slot for a rollback segment in the trx system file copy. */

@@ -660,7 +707,7 @@ trx_sys_init_at_db_start(void)

 	if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
 		fprintf(stderr,
-	"InnoDB: %lu uncommitted transaction(s) which must be rolled back\n",
+	"InnoDB: %lu transaction(s) which must be rolled back or cleaned up\n",
 				UT_LIST_GET_LEN(trx_sys->trx_list));

 		fprintf(stderr, "InnoDB: Trx id counter is %lu %lu\n", 

--- a/innobase/trx/trx0trx.c
+++ b/innobase/trx/trx0trx.c
@@ -83,6 +83,8 @@ trx_create(

 	trx->mysql_log_file_name = NULL;
 	trx->mysql_log_offset = 0;
+	trx->mysql_master_log_file_name = "";
+	trx->mysql_master_log_pos = 0;
 	
 	trx->ignore_duplicates_in_insert = FALSE;

@@ -363,16 +365,31 @@ trx_lists_init_at_db_start(void)

 			trx = trx_create(NULL); 

+			trx->id = undo->trx_id;
+
+			trx->insert_undo = undo;
+			trx->rseg = rseg;
+
 			if (undo->state != TRX_UNDO_ACTIVE) {

 				trx->conc_state = TRX_COMMITTED_IN_MEMORY;
+
+				/* We give a dummy value for the trx no;
+				this should have no relevance since purge
+				is not interested in committed transaction
+				numbers, unless they are in the history
+				list, in which case it looks up the number
+				from the disk based undo log structure */
+
+				trx->no = trx->id;
 			} else {
 				trx->conc_state = TRX_ACTIVE;
-			}

-			trx->id = undo->trx_id;
-			trx->insert_undo = undo;
-			trx->rseg = rseg;
+				/* A running transaction always has the number
+				field inited to ut_dulint_max */
+
+				trx->no = ut_dulint_max;
+			}

 			if (undo->dict_operation) {
 				trx->dict_operation = undo->dict_operation;
@@ -397,14 +414,25 @@ trx_lists_init_at_db_start(void)
 			if (NULL == trx) {
 				trx = trx_create(NULL); 

+				trx->id = undo->trx_id;
+
 				if (undo->state != TRX_UNDO_ACTIVE) {
 					trx->conc_state =
 						TRX_COMMITTED_IN_MEMORY;
+					/* We give a dummy value for the trx
+					number */
+
+					trx->no = trx->id;
 				} else {
 					trx->conc_state = TRX_ACTIVE;
+
+					/* A running transaction always has
+					the number field inited to
+					ut_dulint_max */
+
+					trx->no = ut_dulint_max;
 				}

-				trx->id = undo->trx_id;
 				trx->rseg = rseg;
 				trx_list_insert_ordered(trx);

@@ -583,7 +611,7 @@ trx_commit_off_kernel(
 		if (undo) {
 			mutex_enter(&kernel_mutex);
 #ifdef notdefined
-			/* ########## There is a bug here: purge and rollback
+			/* !!!!!!!!! There is a bug here: purge and rollback
 			need the whole stack of old record versions even if no
 			consistent read would need them!! This is because they
 			decide on the basis of the old versions when we can
@@ -627,12 +655,25 @@ trx_commit_off_kernel(
 		mutex_exit(&(rseg->mutex));

 		/* Update the latest MySQL binlog name and offset info
-		in trx sys header if MySQL binlogging is on */
+		in trx sys header if MySQL binlogging is on or the database
+		server is a MySQL replication slave */

 		if (trx->mysql_log_file_name) {
-			trx_sys_update_mysql_binlog_offset(trx, &mtr);
+			trx_sys_update_mysql_binlog_offset(
+					trx->mysql_log_file_name,
+					trx->mysql_log_offset,
+					TRX_SYS_MYSQL_LOG_INFO, &mtr);
+			trx->mysql_log_file_name = NULL;
 		}
-		
+
+		if (trx->mysql_master_log_file_name[0] != '\0') {
+			/* This database server is a MySQL replication slave */ 
+			trx_sys_update_mysql_binlog_offset(
+				trx->mysql_master_log_file_name,
+				trx->mysql_master_log_pos,
+				TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
+		}
+				
 		/* If we did not take the shortcut, the following call
 		commits the mini-transaction, making the whole transaction
 		committed in the file-based world at this log sequence number;
@@ -707,12 +748,12 @@ trx_commit_off_kernel(

 		/*-------------------------------------*/

-		/* Most MySQL users run with srv_flush.. set to FALSE: */
+		/* Most MySQL users run with srv_flush_.. set to FALSE: */

 		if (srv_flush_log_at_trx_commit) {
 		
 			log_flush_up_to(lsn, LOG_WAIT_ONE_GROUP);
- 		}
+		}

 		/*-------------------------------------*/
 	
@@ -730,6 +771,29 @@ trx_commit_off_kernel(
 	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
 }

+/********************************************************************
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+	trx_t*	trx)	/* in: transaction */
+{
+	if (trx->insert_undo != NULL) {
+
+		trx_undo_insert_cleanup(trx);
+	}
+
+	trx->conc_state = TRX_NOT_STARTED;
+	trx->rseg = NULL;
+	trx->undo_no = ut_dulint_zero;
+	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+
+	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+}
+
 /************************************************************************
 Assigns a read view for a consistent read query. All the consistent reads
 within the same transaction will get the same read view, which is created

--- a/innobase/trx/trx0undo.c
+++ b/innobase/trx/trx0undo.c
@@ -1147,7 +1147,7 @@ trx_undo_mem_create_at_db_start(
 	/* If the log segment is being freed, the page list is inconsistent! */
 	if (state == TRX_UNDO_TO_FREE) {

-		return(undo);
+		goto add_to_list;
 	}

 	last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr);
@@ -1166,7 +1166,7 @@ trx_undo_mem_create_at_db_start(
 		undo->top_offset = rec - last_page;
 		undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
 	}
-	
+add_to_list:	
 	if (type == TRX_UNDO_INSERT) {
 		if (state != TRX_UNDO_CACHED) {
 			UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list,

--- a/innobase/ut/ut0mem.c
+++ b/innobase/ut/ut0mem.c
@@ -38,6 +38,8 @@ os_fast_mutex_t ut_list_mutex;  /* this protects the list */

 ibool  ut_mem_block_list_inited = FALSE;

+ulint*	ut_mem_null_ptr	= NULL;
+
 /**************************************************************************
 Initializes the mem block list at database startup. */
 static
@@ -83,12 +85,16 @@ ut_malloc_low(
 		"InnoDB: Check if you should increase the swap file or\n"
 		"InnoDB: ulimits of your operating system.\n"
 		"InnoDB: On FreeBSD check you have compiled the OS with\n"
-		"InnoDB: a big enough maximum process size.\n",
+		"InnoDB: a big enough maximum process size.\n"
+		"InnoDB: We now intentionally generate a seg fault so that\n"
+		"InnoDB: on Linux we get a stack trace.\n",
 		                  n, ut_total_allocated_memory, errno);

 	        os_fast_mutex_unlock(&ut_list_mutex);

-		exit(1);
+		/* Make an intentional seg fault so that we get a stack
+		trace */
+		printf("%lu\n", *ut_mem_null_ptr);	
 	}		

 	if (set_to_zero) {

--- a/innobase/ut/ut0ut.c
+++ b/innobase/ut/ut0ut.c
@@ -110,6 +110,49 @@ ut_print_timestamp(
 #endif
 }

+/**************************************************************
+Sprintfs a timestamp to a buffer. */
+
+void
+ut_sprintf_timestamp(
+/*=================*/
+	char*	buf) /* in: buffer where to sprintf */
+{
+#ifdef __WIN__
+  	SYSTEMTIME cal_tm;
+
+  	GetLocalTime(&cal_tm);
+
+  	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+	  (int)cal_tm.wYear % 100,
+	  (int)cal_tm.wMonth,
+	  (int)cal_tm.wDay,
+	  (int)cal_tm.wHour,
+	  (int)cal_tm.wMinute,
+	  (int)cal_tm.wSecond);
+#else
+	struct tm  cal_tm;
+  	struct tm* cal_tm_ptr;
+  	time_t     tm;
+
+  	time(&tm);
+
+#ifdef HAVE_LOCALTIME_R
+  	localtime_r(&tm, &cal_tm);
+  	cal_tm_ptr = &cal_tm;
+#else
+  	cal_tm_ptr = localtime(&tm);
+#endif
+  	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+	  cal_tm_ptr->tm_year % 100,
+	  cal_tm_ptr->tm_mon + 1,
+	  cal_tm_ptr->tm_mday,
+	  cal_tm_ptr->tm_hour,
+	  cal_tm_ptr->tm_min,
+	  cal_tm_ptr->tm_sec);
+#endif
+}
+
 /**************************************************************
 Returns current year, month, day. */

@@ -258,3 +301,26 @@ ut_ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high)
 	UT_SORT_FUNCTION_BODY(ut_ulint_sort, arr, aux_arr, low, high,
 								ut_ulint_cmp);
 }
+
+/*****************************************************************
+Calculates fast the number rounded up to the nearest power of 2. */
+
+ulint
+ut_2_power_up(
+/*==========*/
+			/* out: first power of 2 which is >= n */
+	ulint	n)	/* in: number != 0 */
+{
+	ulint	res;
+
+	res = 1;
+
+	ut_ad(n > 0);
+
+	while (res < n) {
+		res = res * 2;
+	}
+
+	return(res);
+}
+