[PATCH] reiserfs: logging rework

From: Chris Mason <mason@suse.com>
Reiserfs logging rework, making things much faster for small transactions. Metadata buffers are dirtied when they are safe to write, so normal kernel mechanisms can contribute to log cleaning.

[PATCH] reiserfs: logging rework
From: Chris Mason <mason@suse.com>
Reiserfs logging rework, making things much faster for small transactions. Metadata buffers are dirtied when they are safe to write, so normal kernel mechanisms can contribute to log cleaning.
7c563ced · Andrew Morton · Linus Torvalds · 8f576882 · 7c563ced · 7c563ced
Commit 7c563ced authored Apr 11, 2004 by Andrew Morton Committed by Linus Torvalds Apr 11, 2004
11 changed files
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -30,32 +30,11 @@ struct tree_balance * cur_tb = NULL; /* detects whether more than one
                                        is interrupting do_balance */
 #endif
-/*
- * AKPM: The __mark_buffer_dirty() call here will not
- * put the buffer on the dirty buffer LRU because we've just
- * set BH_Dirty.  That's a thinko in reiserfs.
- *
- * I'm reluctant to "fix" this bug because that would change
- * behaviour.  Using mark_buffer_dirty() here would make the
- * buffer eligible for VM and periodic writeback, which may
- * violate ordering constraints.  I'll just leave the code
- * as-is by removing the __mark_buffer_dirty call altogether.
- *
- * Chris says this code has "probably never been run" anyway.
- * It is due to go away.
- */
 inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, 
 					struct buffer_head * bh, int flag)
 {
-    if (reiserfs_dont_log(tb->tb_sb)) {
+    journal_mark_dirty(tb->transaction_handle,
-	if (!test_set_buffer_dirty(bh)) {
+                       tb->transaction_handle->t_super, bh) ;
-//	    __mark_buffer_dirty(bh) ;
-	    tb->need_balance_dirty = 1;
-	}
-    } else {
-	journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
-    }
 }
 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty

--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2106,9 +2106,9 @@ static void tb_buffer_sanity_check (struct super_block * p_s_sb,
 {;}
 #endif
-static void clear_all_dirty_bits(struct super_block *s, 
+static int clear_all_dirty_bits(struct super_block *s,
                                 struct buffer_head *bh) {
-  reiserfs_prepare_for_journal(s, bh, 0) ;
+  return reiserfs_prepare_for_journal(s, bh, 0) ;
 }
 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
@@ -2137,11 +2137,11 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 					    p_s_tb->tb_path->path_length - i);
 		}
 #endif
-		clear_all_dirty_bits(p_s_tb->tb_sb, 
+		if (!clear_all_dirty_bits(p_s_tb->tb_sb,
-				     PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ;
+				     PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
+		{
-		if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) )
 		    locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
+		}
 	    }
 	}
@@ -2151,22 +2151,19 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 		if ( p_s_tb->L[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ;
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
-		    if ( buffer_locked (p_s_tb->L[i]) )
 			locked = p_s_tb->L[i];
 		}
 		if ( !locked && p_s_tb->FL[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ;
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]))
-		    if ( buffer_locked (p_s_tb->FL[i]) )
 			locked = p_s_tb->FL[i];
 		}
 		if ( !locked && p_s_tb->CFL[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ;
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]))
-		    if ( buffer_locked (p_s_tb->CFL[i]) )
 			locked = p_s_tb->CFL[i];
 		}
@@ -2176,23 +2173,20 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 		if ( p_s_tb->R[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ;
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]))
-		    if ( buffer_locked (p_s_tb->R[i]) )
 			locked = p_s_tb->R[i];
 		}
 		if ( !locked && p_s_tb->FR[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ;
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]))
-		    if ( buffer_locked (p_s_tb->FR[i]) )
 			locked = p_s_tb->FR[i];
 		}
 		if ( !locked && p_s_tb->CFR[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ;
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]))
-		    if ( buffer_locked (p_s_tb->CFR[i]) )
 			locked = p_s_tb->CFR[i];
 		}
 	    }
@@ -2207,10 +2201,8 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 	*/
 	for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { 
 	    if ( p_s_tb->FEB[i] ) {
-		clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ;
+		if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]))
-		if (buffer_locked(p_s_tb->FEB[i])) {
 		    locked = p_s_tb->FEB[i] ;
-		}
 	    }
 	}

--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -633,7 +633,6 @@ static void balance_internal_when_delete (struct tree_balance * tb,
 		/* use check_internal if new root is an internal node */
 		check_internal (new_root);
 	    /*&&&&&&&&&&&&&&&&&&&&&&*/
-	    tb->tb_sb->s_dirt = 1;
 	    /* do what is needed for buffer thrown from tree */
 	    reiserfs_invalidate_buffer(tb, tbSh);
@@ -951,7 +950,6 @@ int balance_internal (struct tree_balance * tb,			/* tree_balance structure 		*/
        PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
        PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
 	do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
-	tb->tb_sb->s_dirt = 1;
    }
    if ( tb->blknum[h] == 2 ) {

--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -964,7 +964,7 @@ static void init_inode (struct inode * inode, struct path * path)
    REISERFS_I(inode)->i_prealloc_block = 0;
    REISERFS_I(inode)->i_prealloc_count = 0;
    REISERFS_I(inode)->i_trans_id = 0;
-    REISERFS_I(inode)->i_trans_index = 0;
+    REISERFS_I(inode)->i_jl = NULL;
    if (stat_data_v1 (ih)) {
 	struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
@@ -1621,7 +1621,7 @@ int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
    REISERFS_I(inode)->i_prealloc_block = 0;
    REISERFS_I(inode)->i_prealloc_count = 0;
    REISERFS_I(inode)->i_trans_id = 0;
-    REISERFS_I(inode)->i_trans_index = 0;
+    REISERFS_I(inode)->i_jl = 0;
    REISERFS_I(inode)->i_attrs =
 	REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
    sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );

--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -32,13 +32,6 @@
 **                      around too long.
 **		     -- Note, if you call this as an immediate flush from 
 **		        from within kupdate, it will ignore the immediate flag
-**
-** The commit thread -- a writer process for async commits.  It allows a 
-**                      a process to request a log flush on a task queue.
-**                      the commit will happen once the commit thread wakes up.
-**                      The benefit here is the writer (with whatever
-**                      related locks it has) doesn't have to wait for the
-**                      log blocks to hit disk if it doesn't want to.
 */
 #include <linux/config.h>
@@ -60,6 +53,14 @@
 #include <linux/suspend.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
+#include <linux/writeback.h>
+/* gets a struct reiserfs_journal_list * from a list head:
+ * JOURNAL_LIST_ENTRY when h is the list's j_list member,
+ * JOURNAL_WORK_ENTRY when h is its j_working_list member
+ */
+#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_list))
+#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_working_list))
 /* the number of mounted filesystems.  This is used to decide when to
 ** start and kill the commit workqueue
@@ -78,6 +79,12 @@ static struct workqueue_struct *commit_wq;
 #define BLOCK_FREED_HOLDER 3    /* this block was freed during this transaction, and can't be written */
 #define BLOCK_NEEDS_FLUSH 4	/* used in flush_journal_list */
+#define BLOCK_DIRTIED 5
+/* journal list state bits */
+#define LIST_TOUCHED 1
+#define LIST_DIRTY   2
 /* flags for do_journal_end */
 #define FLUSH_ALL   1		/* flush commit and real blocks */
@@ -86,6 +93,9 @@ static struct workqueue_struct *commit_wq;
 /* state bits for the journal */
 #define WRITERS_BLOCKED 1      /* set when new writers not allowed */
+#define WRITERS_QUEUED 2       /* set when log is full due to too many
+				* writers
+				*/
 static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
 static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
@@ -94,6 +104,9 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) ;
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks);
 static int release_journal_dev( struct super_block *super,
 				struct reiserfs_journal *journal );
+static int dirty_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl);
+static void flush_async_commits(void *p);
 static void init_journal_hash(struct super_block *p_s_sb) {
  memset(SB_JOURNAL(p_s_sb)->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
@@ -105,8 +118,10 @@ static void init_journal_hash(struct super_block *p_s_sb) {
 ** more details.
 */
 static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
-  if (bh)
+  if (bh) {
    clear_buffer_dirty(bh);
+    clear_bit(BH_JTest, &bh->b_state);
+  }
  return 0 ;
 }
@@ -367,6 +382,7 @@ static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode
 static int clear_prepared_bits(struct buffer_head *bh) {
  clear_bit(BH_JPrepared, &bh->b_state) ;
+  clear_bit(BH_JRestore_dirty, &bh->b_state) ;
  return 0 ;
 }
@@ -471,11 +487,6 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
  *next_zero_bit = 0 ; /* always start this at zero. */
-  /* we aren't logging all blocks are safe for reuse */
-  if (reiserfs_dont_log(p_s_sb)) {
-    return 0 ;
-  }
  PROC_INFO_INC( p_s_sb, journal.in_journal );
  /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
  ** if we crash before the transaction that freed it commits,  this transaction won't
@@ -503,6 +514,7 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
  /* is it in the current transaction.  This should never happen */
  if ((cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, bl))) {
+    BUG();
    return 1; 
  }
@@ -527,18 +539,30 @@ inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct re
 /* lock the current transaction */
 inline static void lock_journal(struct super_block *p_s_sb) {
-  PROC_INFO_INC( p_s_sb, journal.lock_journal );
+    PROC_INFO_INC( p_s_sb, journal.lock_journal );
-  while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) {
+    down(&SB_JOURNAL(p_s_sb)->j_lock);
-    PROC_INFO_INC( p_s_sb, journal.lock_journal_wait );
-    sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
-  }
-  atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ;
 }
 /* unlock the current transaction */
 inline static void unlock_journal(struct super_block *p_s_sb) {
-  atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ;
+    up(&SB_JOURNAL(p_s_sb)->j_lock);
-  wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
+}
+static inline void get_journal_list(struct reiserfs_journal_list *jl)
+{
+    jl->j_refcount++; /* take one reference on jl; put_journal_list() drops it */
+}
+static inline void put_journal_list(struct super_block *s,
+                                   struct reiserfs_journal_list *jl)
+{ /* drop one reference on jl; jl is freed once the count reaches zero */
+    if (jl->j_refcount < 1) { /* no reference left to drop: report and BUG() */
+        printk("trans id %lu, refcount at %d\n", jl->j_trans_id,
+	                                         jl->j_refcount);
+        BUG();
+    }
+    if (--jl->j_refcount == 0) /* that was the last reference */
+        reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
 }
 /*
@@ -556,6 +580,83 @@ static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct re
  jl->j_list_bitmap = NULL ;
 }
+static int journal_list_still_alive(struct super_block *s,
+                                    unsigned long trans_id)
+{ /* returns 1 if the first list on j_journal_list has an id <= trans_id,
+   * 0 if j_journal_list is empty or its first entry has a larger id.
+   * NOTE(review): presumably j_journal_list is kept oldest-first, so a
+   * result of 1 means the list for trans_id has not been removed yet --
+   * confirm against the code that adds lists.
+   */
+    struct list_head *entry = &SB_JOURNAL(s)->j_journal_list;
+    struct reiserfs_journal_list *jl;
+    if (!list_empty(entry)) {
+        jl = JOURNAL_LIST_ENTRY(entry->next); /* only the first entry is examined */
+	if (jl->j_trans_id <= trans_id) {
+	    return 1;
+	}
+    }
+    return 0;
+}
+static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) { /*
+     * Flush the commit blocks of the journal lists that precede jl.
+     * Returns 1 if jl itself is no longer alive after a flush (the caller
+     * must stop using it), 0 otherwise.
+     */
+    struct reiserfs_journal_list *other_jl; /* list currently being examined */
+    struct reiserfs_journal_list *first_jl; /* where the forward walk starts */
+    struct list_head *entry;
+    unsigned long trans_id = jl->j_trans_id; /* saved so jl need not be dereferenced later */
+    unsigned long other_trans_id;
+    unsigned long first_trans_id; /* NOTE(review): assigned below but never read */
+find_first:
+    /*
+     * first we walk backwards to find the oldest uncommitted transaction
+     */
+    first_jl = jl;
+    entry = jl->j_list.prev;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	/* stop at the list head, or at a list whose older commits are done */
+	if (entry == &SB_JOURNAL(s)->j_journal_list ||
+	    atomic_read(&other_jl->j_older_commits_done))
+	    break;
+        first_jl = other_jl;
+	entry = other_jl->j_list.prev;
+    }
+    /* if we didn't find any older uncommitted transactions, return now */
+    if (first_jl == jl) {
+        return 0;
+    }
+    first_trans_id = first_jl->j_trans_id;
+    entry = &first_jl->j_list;
+    /* walk forward from first_jl, flushing every list older than jl */
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	other_trans_id = other_jl->j_trans_id;
+	if (other_trans_id < trans_id) {
+	    if (atomic_read(&other_jl->j_commit_left) != 0) {
+		flush_commit_list(s, other_jl, 0);
+		/* list we were called with is gone, return */
+		if (!journal_list_still_alive(s, trans_id))
+		    return 1;
+		/* the one we just flushed is gone, this means all
+		 * older lists are also gone, so first_jl is no longer
+		 * valid either.  Go back to the beginning.
+		 */
+		if (!journal_list_still_alive(s, other_trans_id)) {
+		    goto find_first;
+		}
+	    }
+	    entry = entry->next;
+	    if (entry == &SB_JOURNAL(s)->j_journal_list) /* wrapped back to the list head */
+		return 0;
+	} else {
+	    return 0; /* reached jl (or something newer): nothing older remains */
+	}
+    }
+    return 0; /* not reached: the loop above only exits via return or goto */
+}
 /*
 ** if this journal list still has commit blocks unflushed, send them to disk.
 **
@@ -564,13 +665,10 @@ static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct re
 **
 */
 static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
-  int i, count ;
+  int i;
-  int index = 0 ;
  int bn ;
-  int retry_count = 0 ;
-  int orig_commit_left = 0 ;
  struct buffer_head *tbh = NULL ;
-  struct reiserfs_journal_list *other_jl ;
+  unsigned long trans_id = jl->j_trans_id;
  reiserfs_check_lock_depth("flush_commit_list") ;
@@ -581,133 +679,100 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
  /* before we can put our commit blocks on disk, we have to make sure everyone older than
  ** us is on disk too
  */
-  if (jl->j_len <= 0) {
+  if (jl->j_len <= 0)
-    return 0 ;
+    BUG();
-  }
+  if (trans_id == SB_JOURNAL(s)->j_trans_id)
+    BUG();
+  get_journal_list(jl);
  if (flushall) {
-    /* we _must_ make sure the transactions are committed in order.  Start with the
+    if (flush_older_commits(s, jl) == 1) {
-    ** index after this one, wrap all the way around 
+      /* list disappeared during flush_older_commits.  return */
-    */
+      goto put_jl;
-    index = (jl - SB_JOURNAL_LIST(s)) + 1 ;
-    for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-      other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ;
-      if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 && 
-          other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) {
-        flush_commit_list(s, other_jl, 0) ;
-      }
    }
  }
-  count = 0 ;
-  /* don't flush the commit list for the current transactoin */
-  if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) {
-    return 0 ;
-  }
  /* make sure nobody is trying to flush this one at the same time */
-  if (atomic_read(&(jl->j_commit_flushing))) {
+  down(&jl->j_commit_lock);
-    sleep_on(&(jl->j_commit_wait)) ;
+  if (!journal_list_still_alive(s, trans_id)) {
-    if (flushall) {
+    up(&jl->j_commit_lock);
-      atomic_set(&(jl->j_older_commits_done), 1) ;
+    goto put_jl;
-    }
-    return 0 ;
  }
+  if (jl->j_trans_id == 0)
+    BUG();
  /* this commit is done, exit */
  if (atomic_read(&(jl->j_commit_left)) <= 0) {
    if (flushall) {
      atomic_set(&(jl->j_older_commits_done), 1) ;
    }
-    return 0 ;
+    up(&jl->j_commit_lock);
+    goto put_jl;
  }
-  /* keeps others from flushing while we are flushing */
-  atomic_set(&(jl->j_commit_flushing), 1) ; 
+  /*
-  if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) {
+   * for the description block and all the log blocks, submit any buffers
-    reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ;
+   * that haven't already reached the disk
-    return 0 ;
+   */
+  for (i = 0 ; i < (jl->j_len + 1) ; i++) {
+    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
+         SB_ONDISK_JOURNAL_SIZE(s);
+    tbh = journal_find_get_block(s, bn) ;
+    wait_on_buffer(tbh) ;
+    ll_rw_block(WRITE, 1, &tbh) ;
+    put_bh(tbh) ;
  }
-  orig_commit_left = atomic_read(&(jl->j_commit_left)) ; 
+  /* wait on everything written so far before writing the commit */
+  for (i = 0 ;  i < (jl->j_len + 1) ; i++) {
-  /* start by checking all the commit blocks in this transaction.  
+    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
-  ** Add anyone not on disk into tbh.  Stop checking once commit_left <= 1, because that means we
+	 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
-  ** only have the commit block left 
-  */
-retry:
-  count = 0 ;
-  for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) {  /* everything but commit_bh */
-    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %  SB_ONDISK_JOURNAL_SIZE(s);
    tbh = journal_find_get_block(s, bn) ;
-/* kill this sanity check */
+    wait_on_buffer(tbh) ;
-if (count > (orig_commit_left + 2)) {
+    if (buffer_dirty(tbh))
-reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_left(%d)!\n", count, orig_commit_left) ;
+      BUG();
-}
+    if (!buffer_uptodate(tbh)) {
-    if (tbh) {
+      reiserfs_panic(s, "journal-601, buffer write failed\n") ;
-      if (buffer_locked(tbh)) { /* wait on it, redo it just to make sure */
-	wait_on_buffer(tbh) ;
-	if (!buffer_uptodate(tbh)) {
-	  reiserfs_panic(s, "journal-584, buffer write failed\n") ;
-	}
-      } 
-      if (buffer_dirty(tbh)) {
-	printk("journal-569: flush_commit_list, block already dirty!\n") ;
-      } else {				
-	mark_buffer_dirty(tbh) ;
-      }
-      ll_rw_block(WRITE, 1, &tbh) ;
-      count++ ;
-      put_bh(tbh) ; /* once for our get_hash */
-    } 
-  }
-  /* wait on everyone in tbh before writing commit block*/
-  if (count > 0) {
-    for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && 
-                 i < (jl->j_len + 1) ; i++) {  /* everything but commit_bh */
-      bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
-      tbh = journal_find_get_block(s, bn) ;
-      wait_on_buffer(tbh) ;
-      if (!buffer_uptodate(tbh)) {
-	reiserfs_panic(s, "journal-601, buffer write failed\n") ;
-      }
-      put_bh(tbh) ; /* once for our get_hash */
-      bforget(tbh) ;    /* once due to original getblk in do_journal_end */
-      atomic_dec(&(jl->j_commit_left)) ;
    }
+    put_bh(tbh) ; /* once for journal_find_get_block */
+    put_bh(tbh) ;    /* once due to original getblk in do_journal_end */
+    atomic_dec(&(jl->j_commit_left)) ;
  }
-  if (atomic_read(&(jl->j_commit_left)) != 1) { /* just the commit_bh left, flush it without calling getblk for everyone */
+  if (atomic_read(&(jl->j_commit_left)) != 1)
-    if (retry_count < 2) {
+    BUG();
-      printk("journal-582: flush_commit_list, not all log blocks on disk yet, trying again\n") ;
-      retry_count++ ;
-      goto retry;
-    }
-    reiserfs_panic(s, "journal-563: flush_commit_list: BAD, j_commit_left is %u, should be 1\n", 
-		   atomic_read(&(jl->j_commit_left)));
-  }
+  if (buffer_dirty(jl->j_commit_bh))
+    BUG();
  mark_buffer_dirty(jl->j_commit_bh) ;
  sync_dirty_buffer(jl->j_commit_bh) ;
  if (!buffer_uptodate(jl->j_commit_bh)) {
    reiserfs_panic(s, "journal-615: buffer write failed\n") ;
  }
-  atomic_dec(&(jl->j_commit_left)) ;
  bforget(jl->j_commit_bh) ;
+  if (SB_JOURNAL(s)->j_last_commit_id != 0 &&
+     (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) {
+      reiserfs_warning("clm-2200: last commit %lu, current %lu\n",
+                       SB_JOURNAL(s)->j_last_commit_id,
+		       jl->j_trans_id);
+  }
+  SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id;
  /* now, every commit block is on the disk.  It is safe to allow blocks freed during this transaction to be reallocated */
  cleanup_freed_for_journal_list(s, jl) ;
+  /* mark the metadata dirty */
+  dirty_one_transaction(s, jl);
+  atomic_dec(&(jl->j_commit_left)) ;
  if (flushall) {
    atomic_set(&(jl->j_older_commits_done), 1) ;
  }
-  atomic_set(&(jl->j_commit_flushing), 0) ;
+  up(&jl->j_commit_lock);
-  wake_up(&(jl->j_commit_wait)) ;
+put_jl:
+  put_journal_list(s, jl);
-  s->s_dirt = 1 ;
  return 0 ;
 }
@@ -804,22 +869,27 @@ static int update_journal_header_block(struct super_block *p_s_sb,
 ** flush any and all journal lists older than you are 
 ** can only be called from flush_journal_list
 */
-static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) {
+static int flush_older_journal_lists(struct super_block *p_s_sb,
-  int i, index ;
+                                     struct reiserfs_journal_list *jl)
-  struct reiserfs_journal_list *other_jl ;
+{
+    struct list_head *entry;
-  index = jl - SB_JOURNAL_LIST(p_s_sb) ;
+    struct reiserfs_journal_list *other_jl ;
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
+    unsigned long trans_id = jl->j_trans_id;
-    other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ;
-    if (other_jl && other_jl->j_len > 0 && 
+    /* we know we are the only ones flushing things, no extra race
-        other_jl->j_trans_id > 0 && 
+     * protection is required.
-	other_jl->j_trans_id < trans_id && 
+     */
-        other_jl != jl) {
+restart:
-      /* do not flush all */
+    entry = SB_JOURNAL(p_s_sb)->j_journal_list.next;
-      flush_journal_list(p_s_sb, other_jl, 0) ; 
+    other_jl = JOURNAL_LIST_ENTRY(entry);
+    if (other_jl->j_trans_id < trans_id) {
+	/* do not flush all */
+	flush_journal_list(p_s_sb, other_jl, 0) ;
+	/* other_jl is now deleted from the list */
+	goto restart;
    }
-  }
+    return 0 ;
-  return 0 ;
 }
 static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
@@ -836,15 +906,27 @@ static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
    unlock_buffer(bh) ;
    put_bh(bh) ;
 }
 static void submit_logged_buffer(struct buffer_head *bh) {
-    lock_buffer(bh) ;
    get_bh(bh) ;
    bh->b_end_io = reiserfs_end_buffer_io_sync ;
    mark_buffer_notjournal_new(bh) ;
    clear_buffer_dirty(bh) ;
+    if (!test_and_clear_bit(BH_JTest, &bh->b_state))
+        BUG();
+    if (!buffer_uptodate(bh))
+        BUG();
    submit_bh(WRITE, bh) ;
 }
+static void del_from_work_list(struct super_block *s,
+                               struct reiserfs_journal_list *jl) { /* unlink jl from the working list, if it is on one */
+    if (!list_empty(&jl->j_working_list)) { /* empty means jl is not linked; nothing to do */
+	list_del_init(&jl->j_working_list);
+	SB_JOURNAL(s)->j_num_work_lists--; /* keep the journal's work-list count in step */
+    }
+}
 /* flush a journal list, both commit and real blocks
 **
 ** always set flushall to 1, unless you are calling from inside
@@ -865,29 +947,26 @@ static int flush_journal_list(struct super_block *s,
  unsigned long j_len_saved = jl->j_len ;
  if (j_len_saved <= 0) {
-    return 0 ;
+    BUG();
  }
  if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) {
    reiserfs_warning("clm-2048: flush_journal_list called with wcount %d\n",
                      atomic_read(&SB_JOURNAL(s)->j_wcount)) ;
  }
-  /* if someone is getting the commit list, we must wait for them */
+  if (jl->j_trans_id == 0)
-  while (atomic_read(&(jl->j_commit_flushing))) { 
+    BUG();
-    sleep_on(&(jl->j_commit_wait)) ;
-  }
-  /* if someone is flushing this list, we must wait for them */
-  while (atomic_read(&(jl->j_flushing))) {
-    sleep_on(&(jl->j_flush_wait)) ;
-  }
-  /* this list is now ours, we can change anything we want */
+  /* if flushall == 0, the lock is already held */
-  atomic_set(&(jl->j_flushing), 1) ;
+  if (flushall) {
+      down(&SB_JOURNAL(s)->j_flush_sem);
+  } else if (!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) {
+      BUG();
+  }
  count = 0 ;
  if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) {
-    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ;
+    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id);
-    atomic_dec(&(jl->j_flushing)) ;
    return 0 ;
  }
@@ -902,6 +981,9 @@ static int flush_journal_list(struct super_block *s,
  */
  flush_commit_list(s, jl, 1) ;
+  if (!(jl->j_state & LIST_DIRTY))
+      BUG();
  /* are we done now? */
  if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 
      atomic_read(&(jl->j_commit_left)) <= 0) {
@@ -937,13 +1019,13 @@ static int flush_journal_list(struct super_block *s,
      get_bh(saved_bh) ;
      if (buffer_journal_dirty(saved_bh)) {
+	if (!can_dirty(cn))
+	  BUG();
        was_jwait = 1 ;
-	mark_buffer_notjournal_dirty(saved_bh) ;
-        /* undo the inc from journal_mark_dirty */
-	put_bh(saved_bh) ;
-      }
-      if (can_dirty(cn)) {
        was_dirty = 1 ;
+      } else if (can_dirty(cn)) {
+        /* everything with !pjl && jwait should be writable */
+	BUG();
      }
    }
@@ -951,7 +1033,8 @@ static int flush_journal_list(struct super_block *s,
    ** sure they are commited, and don't try writing it to disk
    */
    if (pjl) {
-      flush_commit_list(s, pjl, 1) ;
+      if (atomic_read(&pjl->j_commit_left))
+        flush_commit_list(s, pjl, 1) ;
      goto free_cnode ;
    }
@@ -970,22 +1053,17 @@ static int flush_journal_list(struct super_block *s,
 printk("journal-813: BAD! buffer %llu %cdirty %cjwait, not in a newer tranasction\n", (unsigned long long)saved_bh->b_blocknr,
        was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ;
    }
-    /* kupdate_one_transaction waits on the buffers it is writing, so we
-    ** should never see locked buffers here
-    */
-    if (buffer_locked(saved_bh)) {
-      printk("clm-2083: locked buffer %llu in flush_journal_list\n", 
-              (unsigned long long)saved_bh->b_blocknr) ;
-      wait_on_buffer(saved_bh) ;
-      if (!buffer_uptodate(saved_bh)) {
-        reiserfs_panic(s, "journal-923: buffer write failed\n") ;
-      }
-    } 
    if (was_dirty) { 
      /* we inc again because saved_bh gets decremented at free_cnode */
      get_bh(saved_bh) ;
      set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
-      submit_logged_buffer(saved_bh) ;
+      lock_buffer(saved_bh);
+      if (cn->blocknr != saved_bh->b_blocknr)
+        BUG();
+      if (buffer_dirty(saved_bh))
+        submit_logged_buffer(saved_bh) ;
+      else
+        unlock_buffer(saved_bh);
      count++ ;
    } else {
      printk("clm-2082: Unable to flush buffer %llu in flush_journal_list\n",
@@ -1016,6 +1094,14 @@ printk("journal-813: BAD! buffer %llu %cdirty %cjwait, not in a newer tranasctio
 	if (!buffer_uptodate(cn->bh)) {
 	  reiserfs_panic(s, "journal-949: buffer write failed\n") ;
 	}
+	/* note, we must clear the JDirty_wait bit after the up to date
+	** check, otherwise we race against our flushpage routine
+	*/
+	if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state))
+	    BUG();
+        /* undo the inc from journal_mark_dirty */
+	put_bh(cn->bh) ;
        brelse(cn->bh) ;
      }
      cn = cn->next ;
@@ -1029,7 +1115,7 @@ printk("journal-813: BAD! buffer %llu %cdirty %cjwait, not in a newer tranasctio
  ** replayed after a crash
  */
  if (flushall) {
-    flush_older_journal_lists(s, jl, jl->j_trans_id) ;
+    flush_older_journal_lists(s, jl);
  } 
  /* before we can remove everything from the hash tables for this 
@@ -1044,181 +1130,246 @@ printk("journal-813: BAD! buffer %llu %cdirty %cjwait, not in a newer tranasctio
    update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
  }
  remove_all_from_journal_list(s, jl, 0) ;
+  list_del(&jl->j_list);
+  SB_JOURNAL(s)->j_num_lists--;
+  del_from_work_list(s, jl);
+  if (SB_JOURNAL(s)->j_last_flush_id != 0 &&
+     (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) {
+      reiserfs_warning("clm-2201: last flush %lu, current %lu\n",
+                       SB_JOURNAL(s)->j_last_flush_id,
+		       jl->j_trans_id);
+  }
+  SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id;
+  /* not strictly required since we are freeing the list, but it should
+   * help find code using dead lists later on
+   */
  jl->j_len = 0 ;
  atomic_set(&(jl->j_nonzerolen), 0) ;
  jl->j_start = 0 ;
  jl->j_realblock = NULL ;
  jl->j_commit_bh = NULL ;
  jl->j_trans_id = 0 ;
-  atomic_dec(&(jl->j_flushing)) ;
+  jl->j_state = 0;
-  wake_up(&(jl->j_flush_wait)) ;
+  put_journal_list(s, jl);
+  if (flushall)
+    up(&SB_JOURNAL(s)->j_flush_sem);
  return 0 ;
 } 
+#define CHUNK_SIZE 32
+struct buffer_chunk {
+    struct buffer_head *bh[CHUNK_SIZE];
+    int nr;
+};
-static int kupdate_one_transaction(struct super_block *s,
+static void write_chunk(struct buffer_chunk *chunk) {
-                                    struct reiserfs_journal_list *jl) 
+    int i;
+    for (i = 0; i < chunk->nr ; i++) {
+	submit_logged_buffer(chunk->bh[i]) ;
+    }
+    chunk->nr = 0;
+}
+static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
+    if (chunk->nr >= CHUNK_SIZE)
+        BUG();
+    chunk->bh[chunk->nr++] = bh;
+    if (chunk->nr >= CHUNK_SIZE)
+        write_chunk(chunk);
+}
+static int write_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl,
+				 struct buffer_chunk *chunk)
 {
-    struct reiserfs_journal_list *pjl ; /* previous list for this cn */
+    struct reiserfs_journal_cnode *cn;
-    struct reiserfs_journal_cnode *cn, *walk_cn ;
-    b_blocknr_t blocknr ;
-    int run = 0 ;
-    int orig_trans_id = jl->j_trans_id ;
-    struct buffer_head *saved_bh ; 
    int ret = 0 ;
-    /* if someone is getting the commit list, we must wait for them */
+    jl->j_state |= LIST_TOUCHED;
-    while (atomic_read(&(jl->j_commit_flushing))) {
+    del_from_work_list(s, jl);
-        sleep_on(&(jl->j_commit_wait)) ;
+    if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
-    }
+        return 0;
-    /* if someone is flushing this list, we must wait for them */
-    while (atomic_read(&(jl->j_flushing))) {
-        sleep_on(&(jl->j_flush_wait)) ;
    }
-    /* was it flushed while we slept? */
-    if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) {
-        return 0 ;
-    }
-    /* this list is now ours, we can change anything we want */
-    atomic_set(&(jl->j_flushing), 1) ;
-loop_start:
    cn = jl->j_realblock ;
    while(cn) {
-        saved_bh = NULL ;
        /* if the blocknr == 0, this has been cleared from the hash,
        ** skip it
        */
        if (cn->blocknr == 0) {
            goto next ;
        }
+        if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
+	    struct buffer_head *tmp_bh;
+	    /* we can race against journal_mark_freed when we try
+	     * to lock_buffer(cn->bh), so we have to inc the buffer
+	     * count, and recheck things after locking
+	     */
+	    tmp_bh = cn->bh;
+	    get_bh(tmp_bh);
+	    lock_buffer(tmp_bh);
+	    if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
+		if (!buffer_journal_dirty(tmp_bh) ||
+		    reiserfs_buffer_prepared(tmp_bh))
+		    BUG();
+		add_to_chunk(chunk, tmp_bh);
+		ret++;
+	    } else {
+		/* note, cn->bh might be null now */
+		unlock_buffer(tmp_bh);
+	    }
+	    put_bh(tmp_bh);
+        }
+next:
+        cn = cn->next ;
+	cond_resched();
+    }
+    return ret ;
+}
+/* used by flush_commit_list */
+static int dirty_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl)
+{
+    struct reiserfs_journal_cnode *cn;
+    struct reiserfs_journal_list *pjl;
+    int ret = 0 ;
+    jl->j_state |= LIST_DIRTY;
+    cn = jl->j_realblock ;
+    while(cn) {
        /* look for a more recent transaction that logged this
        ** buffer.  Only the most recent transaction with a buffer in
        ** it is allowed to send that buffer to disk
        */
-        pjl = find_newer_jl_for_cn(cn) ;
+	pjl = find_newer_jl_for_cn(cn) ;
-        if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) &&
+        if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh))
-            can_dirty(cn)) 
+	{
-        {
+	    if (!can_dirty(cn))
-            if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
+	        BUG();
-                set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
+	    /* if the buffer is prepared, it will either be logged
-		submit_logged_buffer(cn->bh) ;
+	     * or restored.  If restored, we need to make sure
-            } else {
+	     * it actually gets marked dirty
-                /* someone else is using this buffer.  We can't 
+	     */
-                ** send it to disk right now because they might
+	    mark_buffer_notjournal_new(cn->bh) ;
-                ** be changing/logging it.
+	    if (test_bit(BH_JPrepared, &cn->bh->b_state)) {
-                */
+	        set_bit(BH_JRestore_dirty, &cn->bh->b_state);
-                ret = 1 ;
+	    } else {
-            }
+	        set_bit(BH_JTest, &cn->bh->b_state);
-        } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
+	        mark_buffer_dirty(cn->bh);
-            clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
+	    }
-            if (!pjl && cn->bh) {
-                wait_on_buffer(cn->bh) ;
-            }
-            /* check again, someone could have logged while we scheduled */
-            pjl = find_newer_jl_for_cn(cn) ;
-            /* before the JDirty_wait bit is set, the 
-            ** buffer is added to the hash list.  So, if we are
-            ** run in the middle of a do_journal_end, we will notice
-            ** if this buffer was logged and added from the latest
-            ** transaction.  In this case, we don't want to decrement
-            ** b_count
-            */
-            if (!pjl && cn->bh && buffer_journal_dirty(cn->bh)) {
-                blocknr = cn->blocknr ;
-                walk_cn = cn ;
-                saved_bh= cn->bh ;
-                /* update all older transactions to show this block
-                ** was flushed
-                */
-                mark_buffer_notjournal_dirty(cn->bh) ;
-                while(walk_cn) {
-                    if (walk_cn->bh && walk_cn->blocknr == blocknr && 
-                         walk_cn->sb == cn->sb) {
-                        if (walk_cn->jlist) {
-                            atomic_dec(&(walk_cn->jlist->j_nonzerolen)) ;
-                        }
-                        walk_cn->bh = NULL ;
-                    }
-                    walk_cn = walk_cn->hnext ;
-                }
-                if (atomic_read(&saved_bh->b_count) < 1) {
-                    reiserfs_warning("clm-2081: bad count on %lu\n", 
-                                      saved_bh->b_blocknr) ;
-                }
-                brelse(saved_bh) ;
-            }
-        }
-        /*
-        ** if the more recent transaction is committed to the log,
-        ** this buffer can be considered flushed.  Decrement our
-        ** counters to reflect one less buffer that needs writing.
-        **
-        ** note, this relies on all of the above code being
-        ** schedule free once pjl comes back non-null.
-        */
-        if (pjl && cn->bh && atomic_read(&pjl->j_commit_left) == 0) {
-            atomic_dec(&cn->jlist->j_nonzerolen) ;
-            cn->bh = NULL ;
        } 
-next:
        cn = cn->next ;
    }
-    /* the first run through the loop sends all the dirty buffers to
+    return ret ;
-    ** ll_rw_block.
+}
-    ** the second run through the loop does all the accounting
-    */
+static int kupdate_transactions(struct super_block *s,
-    if (run++ == 0) {
+                                   struct reiserfs_journal_list *jl,
-        goto loop_start ;
+				   struct reiserfs_journal_list **next_jl,
+				   unsigned long *next_trans_id,
+				   int num_blocks,
+				   int num_trans) {
+    int ret = 0;
+    int written = 0 ;
+    int transactions_flushed = 0;
+    unsigned long orig_trans_id = jl->j_trans_id;
+    struct buffer_chunk chunk;
+    struct list_head *entry;
+    chunk.nr = 0;
+    down(&SB_JOURNAL(s)->j_flush_sem);
+    if (!journal_list_still_alive(s, orig_trans_id)) {
+	goto done;
+    }
+    /* we've got j_flush_sem held, nobody is going to delete any
+     * of these lists out from underneath us
+     */
+    while((num_trans && transactions_flushed < num_trans) ||
+          (!num_trans && written < num_blocks)) {
+	if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
+	    atomic_read(&jl->j_commit_left))
+	{
+	    del_from_work_list(s, jl);
+	    break;
+	}
+	ret = write_one_transaction(s, jl, &chunk);
+	if (ret < 0)
+	    goto done;
+	transactions_flushed++;
+	written += ret;
+	entry = jl->j_list.next;
+	/* did we wrap? */
+	if (entry == &SB_JOURNAL(s)->j_journal_list) {
+	    break;
+        }
+	jl = JOURNAL_LIST_ENTRY(entry);
+	/* don't bother with older transactions */
+	if (jl->j_trans_id <= orig_trans_id)
+	    break;
+    }
+    if (chunk.nr) {
+        write_chunk(&chunk);
    }
-    atomic_set(&(jl->j_flushing), 0) ;
+done:
-    wake_up(&(jl->j_flush_wait)) ;
+    up(&SB_JOURNAL(s)->j_flush_sem);
-    return ret ;
+    return ret;
 }
-/* since we never give dirty buffers to bdflush/kupdate, we have to
-** flush them ourselves.  This runs through the journal lists, finds
+/* o_sync and fsync heavy applications tend to use
-** old metadata in need of flushing and sends it to disk.
+** all the journal list slots with tiny transactions.  These
-** this does not end transactions, commit anything, or free
+** trigger lots and lots of calls to update the header block, which
-** cnodes.
+** adds seeks and slows things down.
 **
-** returns the highest transaction id that was flushed last time
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
+** list updates the header block
 */
-static unsigned long reiserfs_journal_kupdate(struct super_block *s) {
+static int flush_used_journal_lists(struct super_block *s,
-    struct reiserfs_journal_list *jl ;
+                                    struct reiserfs_journal_list *jl) {
-    int i ;
+    unsigned long len = 0;
-    int start ;
+    unsigned long cur_len;
-    time_t age ;
+    int ret;
-    int ret = 0 ;
+    int i;
+    struct reiserfs_journal_list *tjl;
-    start = SB_JOURNAL_LIST_INDEX(s) ;
+    struct reiserfs_journal_list *flush_jl;
+    unsigned long trans_id;
-    /* safety check to prevent flush attempts during a mount */
-    if (start < 0) {
+    flush_jl = tjl = jl;
-        return 0 ;
-    }
+    /* flush for 256 transactions or 256 blocks, whichever comes first */
-    i = (start + 1) % JOURNAL_LIST_COUNT ;
+    for(i = 0 ; i < 256 && len < 256 ; i++) {
-    while(i != start) {
+	if (atomic_read(&tjl->j_commit_left) ||
-        jl = SB_JOURNAL_LIST(s) + i  ;
+	    tjl->j_trans_id < jl->j_trans_id) {
-        age = get_seconds() - jl->j_timestamp ;
+	    break;
-        if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) && 
+	}
-            atomic_read(&(jl->j_nonzerolen)) > 0 &&
+	cur_len = atomic_read(&tjl->j_nonzerolen);
-            atomic_read(&(jl->j_commit_left)) == 0) {
+	if (cur_len > 0) {
+	    tjl->j_state &= ~LIST_TOUCHED;
-            if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) {
+	}
-                break ;
+	len += cur_len;
-            }
+	flush_jl = tjl;
-            /* if ret was already 1, we want to preserve that */
+	if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list)
-            ret |= kupdate_one_transaction(s, jl) ;
+	    break;
-        } 
+	tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
-        if (atomic_read(&(jl->j_nonzerolen)) > 0) {
+    }
-            ret |= 1 ;
+    /* try to find a group of blocks we can flush across all the
-        }
+    ** transactions, but only bother if we've actually spanned
-        i = (i + 1) % JOURNAL_LIST_COUNT ;
+    ** across multiple lists
+    */
+    if (flush_jl != jl) {
+        ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
    }
-    return ret ;
+    flush_journal_list(s, flush_jl, 1);
+    return 0;
 }
 /*
@@ -1262,6 +1413,10 @@ void remove_journal_hash(struct super_block *sb,
 }
 static void free_journal_ram(struct super_block *p_s_sb) {
+  reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl,
+                 sizeof(struct reiserfs_journal_list), p_s_sb);
+  SB_JOURNAL(p_s_sb)->j_num_lists--;
  vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ;
  free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ;
  free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
@@ -1392,7 +1547,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffe
    }
    brelse(c_bh) ;
    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid "
-                   "transaction start offset %lu, len %d id %d\n", 
+                   "transaction start offset %llu, len %d id %d\n",
 		   d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   get_desc_trans_len(desc), get_desc_trans_id(desc)) ;
    return 1 ;
@@ -1432,7 +1587,7 @@ static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cu
  desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
  trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
  reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
-                 "journal_read_transaction, offset %lu, len %d mount_id %d\n", 
+                 "journal_read_transaction, offset %llu, len %d mount_id %d\n",
 		 d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		 get_desc_trans_len(desc), get_desc_mount_id(desc)) ;
  if (get_desc_trans_id(desc) < oldest_trans_id) {
@@ -1460,7 +1615,7 @@ static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cu
  commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
  if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, "
-                   "commit offset %ld had bad time %d or length %d\n", 
+                   "commit offset %llu had bad time %d or length %d\n",
 		   c_bh->b_blocknr -  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   get_commit_trans_id(commit), get_commit_trans_len(commit));
    brelse(c_bh) ;
@@ -1628,7 +1783,7 @@ static int journal_read(struct super_block *p_s_sb) {
  printk("reiserfs: checking transaction log (%s) for (%s)\n",
 	 bdevname(SB_JOURNAL(p_s_sb)->j_dev_bd, b),
 	 reiserfs_bdevname(p_s_sb));
-  start = get_seconds() ;
+  start = get_seconds();
  /* step 1, read in the journal header block.  Check the transaction it says 
  ** is the first unflushed, and if that transaction is not valid, 
@@ -1688,7 +1843,7 @@ static int journal_read(struct super_block *p_s_sb) {
 	oldest_start = d_bh->b_blocknr ;
 	newest_mount_id = get_desc_mount_id(desc) ;
 	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting "
-	               "oldest_start to offset %lu, trans_id %lu\n", 
+	               "oldest_start to offset %llu, trans_id %lu\n",
 		       oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		       oldest_trans_id) ;
      } else if (oldest_trans_id > get_desc_trans_id(desc)) { 
@@ -1716,7 +1871,7 @@ static int journal_read(struct super_block *p_s_sb) {
  cur_dblock = oldest_start ;
  if (oldest_trans_id)  {
    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay "
-                   "from offset %lu, trans_id %lu\n", 
+                   "from offset %llu, trans_id %lu\n",
 		   cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   oldest_trans_id) ;
@@ -1770,70 +1925,26 @@ static int journal_read(struct super_block *p_s_sb) {
  return 0 ;
 }
+static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
-struct reiserfs_journal_commit_task {
+{
-  struct super_block *p_s_sb ;
+    struct reiserfs_journal_list *jl;
-  int jindex ;
+retry:
-  int wake_on_finish ; /* if this is one, we wake the task_done queue, if it
+    jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
-                       ** is zero, we free the whole struct on finish
+    if (!jl) {
-		       */
+	yield();
-  struct reiserfs_journal_commit_task *self ;
+	goto retry;
-  struct work_struct work;
+    }
-} ;
+    memset(jl, 0, sizeof(*jl));
+    INIT_LIST_HEAD(&jl->j_list);
-static void reiserfs_journal_commit_task_func(void *__ct) {
+    INIT_LIST_HEAD(&jl->j_working_list);
-  struct reiserfs_journal_commit_task *ct = __ct;
+    sema_init(&jl->j_commit_lock, 1);
-  struct reiserfs_journal_list *jl ;
+    SB_JOURNAL(s)->j_num_lists++;
+    get_journal_list(jl);
-  reiserfs_write_lock(ct->p_s_sb);
+    return jl;
-  jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ;
-  flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ; 
-  if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 &&
-      atomic_read(&(jl->j_commit_left)) == 0) {
-    kupdate_one_transaction(ct->p_s_sb, jl) ;
-  }
-  reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ;
-  reiserfs_write_unlock(ct->p_s_sb);
-}
-static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct,
-                                  struct super_block *p_s_sb, 
-				  int jindex) {
-  if (!ct) {
-    reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ;
-  }
-  ct->p_s_sb = p_s_sb ;
-  ct->jindex = jindex ;
-  INIT_WORK(&ct->work, reiserfs_journal_commit_task_func, ct);
-  ct->self = ct ;
-}
-static void commit_flush_async(struct super_block *p_s_sb, int jindex) {
-  struct reiserfs_journal_commit_task *ct ;
-  /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try
-  ** to start/join a transaction, which will deadlock
-  */
-  ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ;
-  if (ct) {
-    setup_commit_task_arg(ct, p_s_sb, jindex) ;
-    queue_work(commit_wq, &ct->work) ;
-  } else {
-#ifdef CONFIG_REISERFS_CHECK
-    reiserfs_warning("journal-1540: kmalloc failed, doing sync commit\n") ;
-#endif
-    flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
-  }
 }
 static void journal_list_init(struct super_block *p_s_sb) {
-  int i ;
+    SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ;
-    init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ;
-  }
 }
 static int release_journal_dev( struct super_block *super,
@@ -1924,6 +2035,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
    struct reiserfs_super_block * rs;
    struct reiserfs_journal_header *jh;
    struct reiserfs_journal *journal;
+    struct reiserfs_journal_list *jl;
    char b[BDEVNAME_SIZE];
    journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
@@ -1934,6 +2046,8 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
    memset(journal, 0, sizeof(struct reiserfs_journal)) ;
    INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ;
    INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list);
+    INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_working_list);
+    INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_journal_list);
    reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap, 
 				   SB_BMAP_NR(p_s_sb)) ;
    allocate_bitmap_nodes(p_s_sb) ;
@@ -2041,10 +2155,6 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
  brelse (bhjh);
  SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ;
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */
-  /* clear out the journal list array */
-  memset(SB_JOURNAL_LIST(p_s_sb), 0, sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ; 
  journal_list_init(p_s_sb) ;
  memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
@@ -2061,13 +2171,13 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
  SB_JOURNAL(p_s_sb)->j_last = NULL ;	  
  SB_JOURNAL(p_s_sb)->j_first = NULL ;     
  init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-  init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ; 
+  sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1);
+  sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1);
  SB_JOURNAL(p_s_sb)->j_trans_id = 10 ;  
  SB_JOURNAL(p_s_sb)->j_mount_id = 10 ; 
  SB_JOURNAL(p_s_sb)->j_state = 0 ;
  atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
-  atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ;
  SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
  SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ;
  SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ? num_cnodes : 0 ;
@@ -2075,8 +2185,9 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
  SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
  init_journal_hash(p_s_sb) ;
-  SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ;
+  jl = SB_JOURNAL(p_s_sb)->j_current_jl;
-  if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) {
+  jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
+  if (!jl->j_list_bitmap) {
    reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ;
    goto free_and_return;
  }
@@ -2084,16 +2195,12 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
    reiserfs_warning("Replay Failure, unable to mount\n") ;
    goto free_and_return;
  }
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this
-                                         where it belongs */
-  if (reiserfs_dont_log (p_s_sb))
-    return 0;
  reiserfs_mounted_fs_count++ ;
  if (reiserfs_mounted_fs_count <= 1)
    commit_wq = create_workqueue("reiserfs");
+  INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb);
  return 0 ;
 free_and_return:
  free_journal_ram(p_s_sb);
@@ -2107,8 +2214,6 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
 */
 int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
  time_t now = get_seconds() ;
-  if (reiserfs_dont_log(th->t_super)) 
-    return 0 ;
  /* cannot restart while nested */
  if (th->t_refcount > 1)
    return 0 ;
@@ -2148,6 +2253,35 @@ void reiserfs_wait_on_write_block(struct super_block *s) {
               !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ;
 }
+static void queue_log_writer(struct super_block *s) {
+    set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state);
+    sleep_on(&SB_JOURNAL(s)->j_join_wait);
+}
+static void wake_queued_writers(struct super_block *s) {
+    if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state))
+        wake_up(&SB_JOURNAL(s)->j_join_wait);
+}
+static void let_transaction_grow(struct super_block *sb,
+                                 unsigned long trans_id)
+{
+    unsigned long bcount = SB_JOURNAL(sb)->j_bcount;
+    while(1) {
+	yield();
+        while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 ||
+	        atomic_read(&SB_JOURNAL(sb)->j_jlock)) &&
+	       SB_JOURNAL(sb)->j_trans_id == trans_id) {
+	    queue_log_writer(sb);
+	}
+	if (SB_JOURNAL(sb)->j_trans_id != trans_id)
+	    break;
+	if (bcount == SB_JOURNAL(sb)->j_bcount)
+	    break;
+	bcount = SB_JOURNAL(sb)->j_bcount;
+    }
+}
 /* join == true if you must join an existing transaction.
 ** join == false if you can deal with waiting for others to finish
 **
@@ -2157,15 +2291,14 @@ void reiserfs_wait_on_write_block(struct super_block *s) {
 static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
  time_t now = get_seconds() ;
  int old_trans_id  ;
+  struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+  struct reiserfs_transaction_handle myth;
+  int sched_count = 0;
  reiserfs_check_lock_depth("journal_begin") ;
  RFALSE( p_s_sb->s_flags & MS_RDONLY, 
 	  "clm-2078: calling journal_begin on readonly FS") ;
-  if (reiserfs_dont_log(p_s_sb)) {
-    th->t_super = p_s_sb ; /* others will check this for the don't log flag */
-    return 0 ;
-  }
  PROC_INFO_INC( p_s_sb, journal.journal_being );
  /* set here for journal_join */
  th->t_refcount = 1;
@@ -2173,66 +2306,76 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct sup
 relock:
  lock_journal(p_s_sb) ;
+  journal->j_bcount++;
-  if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) {
+  if (test_bit(WRITERS_BLOCKED, &journal->j_state)) {
    unlock_journal(p_s_sb) ;
    reiserfs_wait_on_write_block(p_s_sb) ;
    PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
    goto relock ;
  }
+  now = get_seconds();
  /* if there is no room in the journal OR
  ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning 
  ** we don't sleep if there aren't other writers
  */
-  if (  (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) ||
+  if ( (!join && journal->j_must_wait > 0) ||
-     ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || 
+     ( !join && (journal->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) ||
-     (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && 
+     (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 &&
-      (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) ||
+      (now - journal->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) ||
-     (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) ||
+     (!join && atomic_read(&journal->j_jlock)) ||
-     (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
+     (!join && journal->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
+    old_trans_id = journal->j_trans_id;
    unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
-    /* if writer count is 0, we can just force this transaction to end, and start
+    if (!join && (journal->j_len_alloc + nblocks + 2) >=
-    ** a new one afterwards.
+        SB_JOURNAL_MAX_BATCH(p_s_sb) &&
-    */
+	((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75))
-    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
+    {
-      struct reiserfs_transaction_handle myth ;
+	if (atomic_read(&journal->j_wcount) > 10) {
-      journal_join(&myth, p_s_sb, 1) ;
+	    sched_count++;
-      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+	    queue_log_writer(p_s_sb);
-      journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+	    goto relock;
-      do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ;
+	}
+    }
+    /* don't mess with joining the transaction if all we have to do is
+     * wait for someone else to do a commit
+     */
+    if (atomic_read(&journal->j_jlock)) {
+	while (journal->j_trans_id == old_trans_id &&
+	       atomic_read(&journal->j_jlock)) {
+	    queue_log_writer(p_s_sb);
+        }
+	goto relock;
+    }
+    journal_join(&myth, p_s_sb, 1) ;
+    /* someone might have ended the transaction while we joined */
+    if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
+        do_journal_end(&myth, p_s_sb, 1, 0) ;
    } else {
-      /* but if the writer count isn't zero, we have to wait for the current writers to finish.
+        do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
-      ** They won't batch on transaction end once we set j_jlock
-      */
-      atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
-      old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
-      while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
-            SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) {
-	sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-      }
    }
    PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
    goto relock ;
  }
+  /* we are the first writer, record the transaction start time */
-  if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */
+  if (journal->j_trans_start_time == 0) {
-    SB_JOURNAL(p_s_sb)->j_trans_start_time = now ;
+    journal->j_trans_start_time = get_seconds();
  }
-  atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
+  atomic_inc(&(journal->j_wcount)) ;
-  SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ;
+  journal->j_len_alloc += nblocks ;
  th->t_blocks_logged = 0 ;
  th->t_blocks_allocated = nblocks ;
-  th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
+  th->t_trans_id = journal->j_trans_id ;
  unlock_journal(p_s_sb) ;
-  p_s_sb->s_dirt = 1; 
  return 0 ;
 }
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
  struct reiserfs_transaction_handle *cur_th = current->journal_info;
@@ -2277,11 +2420,6 @@ int journal_begin(struct reiserfs_transaction_handle *th, struct super_block  *
    return ret ;
 }
-/* not used at all */
-int journal_prepare(struct super_block  * p_s_sb, struct buffer_head *bh) {
-  return 0 ;
-}
 /*
 ** puts bh into the current transaction.  If it was already there, reorders removes the
 ** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order).
@@ -2297,18 +2435,14 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
  int prepared = 0 ;
  PROC_INFO_INC( p_s_sb, journal.mark_dirty );
-  if (reiserfs_dont_log(th->t_super)) {
-    mark_buffer_dirty(bh) ;
-    return 0 ;
-  }
  if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
    reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", 
                   th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id);
  }
-  p_s_sb->s_dirt = 1 ;
+  p_s_sb->s_dirt = 1;
  prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ;
+  clear_bit(BH_JRestore_dirty, &bh->b_state);
  /* already in this transaction, we are done */
  if (buffer_journaled(bh)) {
    PROC_INFO_INC( p_s_sb, journal.mark_dirty_already );
@@ -2319,13 +2453,12 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
  ** a dirty or journal_dirty or locked buffer to be logged, as some changes
  ** could get to disk too early.  NOT GOOD.
  */
-  if (!prepared || buffer_locked(bh)) {
+  if (!prepared || buffer_locked(bh) || buffer_dirty(bh)) {
    printk("journal-1777: buffer %llu bad state %cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT\n", (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', 
                            buffer_locked(bh) ? ' ' : '!',
 			    buffer_dirty(bh) ? ' ' : '!',
 			    buffer_journal_dirty(bh) ? ' ' : '!') ;
  }
-  count_already_incd = clear_prepared_bits(bh) ;
  if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
    printk("journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ;
@@ -2344,14 +2477,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
    mark_buffer_notjournal_dirty(bh) ;
  }
-  if (buffer_dirty(bh)) {
-    clear_buffer_dirty(bh) ;
-  }
-  if (buffer_journaled(bh)) { /* must double check after getting lock */
-    goto done ;
-  }
  if (SB_JOURNAL(p_s_sb)->j_len > SB_JOURNAL(p_s_sb)->j_len_alloc) {
    SB_JOURNAL(p_s_sb)->j_len_alloc = SB_JOURNAL(p_s_sb)->j_len + JOURNAL_PER_BALANCE_CNT ;
  }
@@ -2391,24 +2516,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
    SB_JOURNAL(p_s_sb)->j_first = cn ;
    SB_JOURNAL(p_s_sb)->j_last = cn ;
  }
-done:
-  return 0 ;
-}
-/*
-** if buffer already in current transaction, do a journal_mark_dirty
-** otherwise, just mark it dirty and move on.  Used for writes to meta blocks
-** that don't need journaling
-*/
-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
-  if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) || 
-      buffer_journal_dirty(bh)) {
-    return journal_mark_dirty(th, p_s_sb, bh) ;
-  }
-  if (get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_blocknr)) {
-    return journal_mark_dirty(th, p_s_sb, bh) ;
-  }
-  mark_buffer_dirty(bh) ;
  return 0 ;
 }
@@ -2474,7 +2581,6 @@ static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t block
    if (atomic_read(&(bh->b_count)) < 0) {
      printk("journal-1752: remove from trans, b_count < 0\n") ;
    }
-    if (!buffer_locked(bh)) reiserfs_clean_and_file_buffer(bh) ; 
    ret = 1 ;
  }
  SB_JOURNAL(p_s_sb)->j_len-- ;
@@ -2500,7 +2606,7 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) {
  int can_dirty = 1 ;
  /* first test hprev.  These are all newer than cn, so any node here
-  ** with the name block number and dev means this node can't be sent
+  ** with the same block number and dev means this node can't be sent
  ** to disk right now.
  */
  while(cur && can_dirty) {
@@ -2551,72 +2657,56 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
 ** change flush_commit_lists to have a repeat parameter too.
 **
 */
-void flush_async_commits(struct super_block *p_s_sb) {
+static void flush_async_commits(void *p) {
-  int i ;
+  struct super_block *p_s_sb = p;
+  struct reiserfs_journal_list *jl;
+  struct list_head *entry;
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
+  lock_kernel();
-    if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) {
+  if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
-      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; 
+      /* last entry is the youngest, commit it and you get everything */
-    }
+      entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev;
+      jl = JOURNAL_LIST_ENTRY(entry);
+      flush_commit_list(p_s_sb, jl, 1);
  }
+  unlock_kernel();
 }
 /*
 ** flushes any old transactions to disk
 ** ends the current transaction if it is too old
-**
-** also calls flush_journal_list with old_only == 1, which allows me to reclaim
-** memory and such from the journal lists whose real blocks are all on disk.
-**
-** called by sync_dev_journal from buffer.c
 */
-int flush_old_commits(struct super_block *p_s_sb, int immediate) {
+int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
-  int i ;
+    time_t now ;
-  int count = 0;
+    struct reiserfs_transaction_handle th ;
-  int start ; 
-  time_t now ; 
+    now = get_seconds();
-  struct reiserfs_transaction_handle th ; 
+    /* safety check so we don't flush while we are replaying the log during
+     * mount
-  start =  SB_JOURNAL_LIST_INDEX(p_s_sb) ;
+     */
-  now = get_seconds() ;
+    if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
+	return 0  ;
-  /* safety check so we don't flush while we are replaying the log during mount */
+    }
-  if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) {
-    return 0  ;
+    /* check the current transaction.  If there are no writers, and it is
-  }
+     * too old, finish it, and force the commit blocks to disk
-  /* starting with oldest, loop until we get to the start */
+     */
-  i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
+    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
-  while(i != start) {
+        SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
-    if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) ||
+        SB_JOURNAL(p_s_sb)->j_len > 0 &&
-       immediate)) {
+        (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) >
-      /* we have to check again to be sure the current transaction did not change */
+	SB_JOURNAL_MAX_TRANS_AGE(p_s_sb))
-      if (i != SB_JOURNAL_LIST_INDEX(p_s_sb))  {
+    {
-	flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
+	journal_join(&th, p_s_sb, 1) ;
-      }
+	reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-    }
+	journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-    i = (i + 1) % JOURNAL_LIST_COUNT ;
-    count++ ;
+	/* we're only being called from kreiserfsd, it makes no sense to do
-  }
+	** an async commit so that kreiserfsd can do it later
-  /* now, check the current transaction.  If there are no writers, and it is too old, finish it, and
+	*/
-  ** force the commit blocks to disk
+	do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
-  */
+    }
-  if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&  
+    return p_s_sb->s_dirt;
-     SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && 
-     SB_JOURNAL(p_s_sb)->j_len > 0 && 
-     (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
-    journal_join(&th, p_s_sb, 1) ;
-    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-    journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-    do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ;
-  } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case.  If they say to 
-                             flush, we must be sure old transactions hit the disk too. */
-    journal_join(&th, p_s_sb, 1) ;
-    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-    journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-    do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
-  }
-   reiserfs_journal_kupdate(p_s_sb) ;
-   return 0 ;
 }
 /*
@@ -2637,6 +2727,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
  int flush = flags & FLUSH_ALL ;
  int commit_now = flags & COMMIT_NOW ;
  int wait_on_commit = flags & WAIT ;
+  struct reiserfs_journal_list *jl;
  if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
    reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", 
@@ -2653,13 +2744,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
  ** care of in this trans
  */
  if (SB_JOURNAL(p_s_sb)->j_len == 0) {
-    int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
+    BUG();
-    unlock_journal(p_s_sb) ;
-    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock))  > 0 && wcount <= 0) {
-      atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ;
-      wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-    }
-    return 0 ;
  }
  /* if wcount > 0, and we are called to with flush or commit_now,
  ** we wait on j_join_wait.  We will wake up when the last writer has
@@ -2669,24 +2754,37 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
  */
  if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) {
    if (flush || commit_now) {
-      int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
+      unsigned trans_id ;
+      jl = SB_JOURNAL(p_s_sb)->j_current_jl;
+      trans_id = jl->j_trans_id;
      atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
      if (flush) {
        SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ;
      }
      unlock_journal(p_s_sb) ;
      /* sleep while the current transaction is still j_jlocked */
-      while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && 
+      while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
-            SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) {
+	if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
-	sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
+	    queue_log_writer(p_s_sb);
-      }
+        } else {
-      if (commit_now) {
+	    lock_journal(p_s_sb);
-	if (wait_on_commit) {
+	    if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
-	  flush_commit_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
+	        atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
-	} else {
+	    }
-	  commit_flush_async(p_s_sb, orig_jindex) ; 
+	    unlock_journal(p_s_sb);
 	}
      }
+      if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
+          BUG();
+      }
+      if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
+          wait_on_commit)
+      {
+	  flush_commit_list(p_s_sb, jl, 1) ;
+      }
      return 0 ;
    } 
    unlock_journal(p_s_sb) ;
@@ -2694,7 +2792,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
  }
  /* deal with old transactions where we are the last writers */
-  now = get_seconds() ;
+  now = get_seconds();
  if ((now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
    commit_now = 1 ;
    SB_JOURNAL(p_s_sb)->j_next_async_flush = 1 ;
@@ -2734,25 +2832,21 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
  struct buffer_head *bh = NULL ;
  struct reiserfs_list_bitmap *jb = NULL ;
  int cleaned = 0 ;
-  if (reiserfs_dont_log(th->t_super)) {
+  cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, blocknr);
-    bh = sb_find_get_block(p_s_sb, blocknr) ;
+  if (cn && cn->bh) {
-    if (bh && buffer_dirty (bh)) {
+      bh = cn->bh ;
-      printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %d\n", bh->b_state, blocknr);
+      get_bh(bh) ;
-      BUG ();
-    }
-    brelse (bh);
-    return 0 ;
  }
-  bh = sb_find_get_block(p_s_sb, blocknr) ;
  /* if it is journal new, we just remove it from this transaction */
  if (bh && buffer_journal_new(bh)) {
    mark_buffer_notjournal_new(bh) ;
    clear_prepared_bits(bh) ;
+    reiserfs_clean_and_file_buffer(bh) ;
    cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
  } else {
    /* set the bit for this block in the journal bitmap for this transaction */
-    jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ;
+    jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap;
    if (!jb) {
      reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
    }
@@ -2762,6 +2856,7 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
    if (bh) {
      clear_prepared_bits(bh) ;
+      reiserfs_clean_and_file_buffer(bh) ;
    }
    cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
@@ -2793,7 +2888,6 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
  }
  if (bh) {
-    reiserfs_clean_and_file_buffer(bh) ;
    put_bh(bh) ; /* get_hash grabs the buffer */
    if (atomic_read(&(bh->b_count)) < 0) {
      printk("journal-2165: bh->b_count < 0\n") ;
@@ -2803,50 +2897,84 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
 }
 void reiserfs_update_inode_transaction(struct inode *inode) {
+  REISERFS_I(inode)->i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
-  REISERFS_I(inode)->i_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
  REISERFS_I(inode)->i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
 }
-static int reiserfs_inode_in_this_transaction(struct inode *inode) {
+static void __commit_trans_jl(struct inode *inode, unsigned long id,
-  if (REISERFS_I(inode)->i_trans_id == SB_JOURNAL(inode->i_sb)->j_trans_id || 
+                                 struct reiserfs_journal_list *jl)
-      REISERFS_I(inode)->i_trans_id == 0) {
+{
-    return 1; 
+    struct reiserfs_transaction_handle th ;
-  } 
+    struct super_block *sb = inode->i_sb ;
-  return 0 ;
+    /* is it from the current transaction, or from an unknown transaction? */
+    if (id == SB_JOURNAL(sb)->j_trans_id) {
+	jl = SB_JOURNAL(sb)->j_current_jl;
+	/* try to let other writers come in and grow this transaction */
+	let_transaction_grow(sb, id);
+	if (SB_JOURNAL(sb)->j_trans_id != id) {
+	    goto flush_commit_only;
+	}
+	journal_begin(&th, sb, 1) ;
+	/* someone might have ended this transaction while we joined */
+	if (SB_JOURNAL(sb)->j_trans_id != id) {
+	    reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
+	    journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
+	    journal_end(&th, sb, 1) ;
+	    goto flush_commit_only;
+	}
+	journal_end_sync(&th, sb, 1) ;
+    } else {
+	/* this gets tricky, we have to make sure the journal list in
+	 * the inode still exists.  We know the list is still around
+	 * if we've got a larger transaction id than the oldest list
+	 */
+flush_commit_only:
+	if (journal_list_still_alive(inode->i_sb, id)) {
+	    flush_commit_list(sb, jl, 1) ;
+	}
+    }
+    /* otherwise the list is gone, and long since committed */
 }
 void reiserfs_commit_for_inode(struct inode *inode) {
-  struct reiserfs_journal_list *jl ;
+    unsigned long id = REISERFS_I(inode)->i_trans_id;
-  struct reiserfs_transaction_handle th ;
+    struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
-  struct super_block *sb = inode->i_sb ;
+    /* for the whole inode, assume unset id means it was
-  jl = SB_JOURNAL_LIST(sb) + REISERFS_I(inode)->i_trans_index ;
+     * changed in the current transaction.  More conservative
+     */
-  /* is it from the current transaction, or from an unknown transaction? */
+    if (!id || !jl) {
-  if (reiserfs_inode_in_this_transaction(inode)) {
+	reiserfs_update_inode_transaction(inode) ;
-    journal_join(&th, sb, 1) ;
+	id = REISERFS_I(inode)->i_trans_id;
-    reiserfs_update_inode_transaction(inode) ;
+	/* jl will be updated in __commit_trans_jl */
-    journal_end_sync(&th, sb, 1) ;
+    }
-  } else if (jl->j_trans_id == REISERFS_I(inode)->i_trans_id) {
-    flush_commit_list(sb, jl, 1) ;
+    __commit_trans_jl(inode, id, jl);
-  }
-  /* if the transaction id does not match, this list is long since flushed
-  ** and we don't have to do anything here
-  */
 }
 void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, 
                                      struct buffer_head *bh) {
-  PROC_INFO_INC( p_s_sb, journal.restore_prepared );
+    PROC_INFO_INC( p_s_sb, journal.restore_prepared );
-  if (reiserfs_dont_log (p_s_sb))
+    if (!bh) {
-    return;
+	return ;
+    }
-  if (!bh) {
+    if (test_and_clear_bit(BH_JRestore_dirty, &bh->b_state) &&
-    return ;
+	buffer_journal_dirty(bh)) {
-  }
+	struct reiserfs_journal_cnode *cn;
-  clear_bit(BH_JPrepared, &bh->b_state) ;
+	cn = get_journal_hash_dev(p_s_sb,
+	                          SB_JOURNAL(p_s_sb)->j_list_hash_table,
+				  bh->b_blocknr);
+	if (cn && can_dirty(cn)) {
+	    set_bit(BH_JTest, &bh->b_state);
+	    mark_buffer_dirty(bh);
+        }
+    }
+    clear_bit(BH_JPrepared, &bh->b_state) ;
 }
 extern struct tree_balance *cur_tb ;
@@ -2857,29 +2985,39 @@ extern struct tree_balance *cur_tb ;
 ** wait on it.
 ** 
 */
-void reiserfs_prepare_for_journal(struct super_block *p_s_sb, 
+int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
                                  struct buffer_head *bh, int wait) {
-  int retry_count = 0 ;
  PROC_INFO_INC( p_s_sb, journal.prepare );
-  if (reiserfs_dont_log (p_s_sb))
-    return;
-  while(!test_bit(BH_JPrepared, &bh->b_state) ||
+    if (test_set_buffer_locked(bh)) {
-        (wait && buffer_locked(bh))) {
+	if (!wait)
-    if (buffer_journaled(bh)) {
+	    return 0;
-      set_bit(BH_JPrepared, &bh->b_state) ;
+	lock_buffer(bh);
-      return ;
    }
-    set_bit(BH_JPrepared, &bh->b_state) ;
+    set_bit(BH_JPrepared, &bh->b_state);
-    if (wait) {
+    if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh))  {
-      RFALSE( buffer_locked(bh) && cur_tb != NULL,
+	clear_bit(BH_JTest, &bh->b_state);
-	      "waiting while do_balance was running\n") ;
+	set_bit(BH_JRestore_dirty, &bh->b_state);
-      wait_on_buffer(bh) ;
+    }
+    unlock_buffer(bh);
+    return 1;
+}
+static void flush_old_journal_lists(struct super_block *s) {
+    struct reiserfs_journal_list *jl;
+    struct list_head *entry;
+    time_t now = get_seconds();
+    while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
+        entry = SB_JOURNAL(s)->j_journal_list.next;
+	jl = JOURNAL_LIST_ENTRY(entry);
+	/* this check should always be run, to send old lists to disk */
+	if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
+	    flush_used_journal_lists(s, jl);
+	} else {
+	    break;
+	}
    }
-    PROC_INFO_INC( p_s_sb, journal.prepare_retry );
-    retry_count++ ;
-  }
 }
 /* 
@@ -2898,23 +3036,24 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
  struct buffer_head *c_bh ; /* commit bh */
  struct buffer_head *d_bh ; /* desc bh */
  int cur_write_start = 0 ; /* start index of current log write */
-  int cur_blocks_left = 0 ; /* number of journal blocks left to write */
  int old_start ;
  int i ;
-  int jindex ;
-  int orig_jindex ;
  int flush = flags & FLUSH_ALL ;
-  int commit_now = flags & COMMIT_NOW ;
  int wait_on_commit = flags & WAIT ;
-  struct reiserfs_super_block *rs ; 
+  struct reiserfs_journal_list *jl, *temp_jl;
-  int trans_half ;
+  struct list_head *entry, *safe;
+  unsigned long jindex;
+  unsigned long commit_trans_id;
+  int trans_half;
  if (th->t_refcount > 1)
    BUG() ;
  current->journal_info = th->t_handle_save;
-  if (reiserfs_dont_log(th->t_super)) {
+  reiserfs_check_lock_depth("journal end");
-    return 0 ;
+  if (SB_JOURNAL(p_s_sb)->j_len == 0) {
+      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+      journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
  }
  lock_journal(p_s_sb) ;
@@ -2923,24 +3062,24 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
    flush = 1 ;
  }
  if (SB_JOURNAL(p_s_sb)->j_next_async_flush) {
-    flags |= COMMIT_NOW ;
+    flags |= COMMIT_NOW | WAIT;
-    commit_now = 1 ;
+    wait_on_commit = 1;
  }
  /* check_journal_end locks the journal, and unlocks if it does not return 1 
  ** it tells us if we should continue with the journal_end, or just return
  */
  if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
-    return 0 ;
+    p_s_sb->s_dirt = 1;
+    wake_queued_writers(p_s_sb);
+    goto out ;
  }
  /* check_journal_end might set these, check again */
  if (SB_JOURNAL(p_s_sb)->j_next_full_flush) {
    flush = 1 ;
  }
-  if (SB_JOURNAL(p_s_sb)->j_next_async_flush) {
-    commit_now = 1 ;
-  }
  /*
  ** j must wait means we have to flush the log blocks, and the real blocks for
  ** this transaction
@@ -2957,10 +3096,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
  current->journal_info = th->t_handle_save ;
 #endif
-  rs = SB_DISK_SUPER_BLOCK(p_s_sb) ;
  /* setup description block */
  d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start) ; 
-  set_buffer_uptodate(d_bh) ;
+  set_buffer_uptodate(d_bh);
  desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ;
  memset(d_bh->b_data, 0, d_bh->b_size) ;
  memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ;
@@ -2975,28 +3113,33 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
  set_buffer_uptodate(c_bh) ;
  /* init this journal list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ;
+  jl = SB_JOURNAL(p_s_sb)->j_current_jl;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ;  
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2);
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-  /* which is faster, locking/unlocking at the start and end of the for
-  ** or locking once per iteration around the insert_journal_hash?
-  ** eitherway, we are write locking insert_journal_hash.  The ENTIRE FOR
-  ** LOOP MUST not cause schedule to occur.
-  */
-  /* for each real block, add it to the journal list hash,
+  /* we lock the commit before doing anything because
+   * we want to make sure nobody tries to run flush_commit_list until
+   * the new transaction is fully setup, and we've already flushed the
+   * ordered bh list
+   */
+  down(&jl->j_commit_lock);
+  /* save the transaction id in case we need to commit it later */
+  commit_trans_id = jl->j_trans_id;
+  atomic_set(&jl->j_older_commits_done, 0) ;
+  jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
+  jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
+  jl->j_commit_bh = c_bh ;
+  jl->j_start = SB_JOURNAL(p_s_sb)->j_start ;
+  jl->j_len = SB_JOURNAL(p_s_sb)->j_len ;
+  atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ;
+  atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2);
+  jl->j_realblock = NULL ;
+  /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
+  **  for each real block, add it to the journal list hash,
  ** copy into real block index array in the commit or desc block
  */
-  trans_half = journal_trans_half(p_s_sb->s_blocksize) ;
+  trans_half = journal_trans_half(p_s_sb->s_blocksize);
  for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) {
    if (test_bit(BH_JDirty, &cn->bh->b_state) ) {
      jl_cn = get_cnode(p_s_sb) ;
@@ -3004,7 +3147,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
        reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
      }
      if (i == 0) {
-        SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ;
+        jl->j_realblock = jl_cn ;
      }
      jl_cn->prev = last_cn ;
      jl_cn->next = NULL ;
@@ -3020,9 +3163,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
      }
      jl_cn->blocknr = cn->bh->b_blocknr ; 
      jl_cn->state = 0 ;
-      jl_cn->sb = p_s_sb ;
+      jl_cn->sb = p_s_sb;
      jl_cn->bh = cn->bh ;
-      jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ;
+      jl_cn->jlist = jl;
      insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ; 
      if (i < trans_half) {
 	desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
@@ -3033,7 +3176,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
      i-- ;
    }
  }
  set_desc_trans_len(desc, SB_JOURNAL(p_s_sb)->j_len) ;
  set_desc_mount_id(desc, SB_JOURNAL(p_s_sb)->j_mount_id) ;
  set_desc_trans_id(desc, SB_JOURNAL(p_s_sb)->j_trans_id) ;
@@ -3041,53 +3183,35 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
  /* special check in case all buffers in the journal were marked for not logging */
  if (SB_JOURNAL(p_s_sb)->j_len == 0) {
-    brelse(d_bh) ;
+    BUG();
-    brelse(c_bh) ;
-    unlock_journal(p_s_sb) ;
-    printk("journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ;
-    atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
-    wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-    return 0 ;
  }
+  /* we're about to dirty all the log blocks, mark the description block
+   * dirty now too.  Don't mark the commit block dirty until all the
+   * others are on disk
+   */
+  mark_buffer_dirty(d_bh);
  /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
  cur_write_start = SB_JOURNAL(p_s_sb)->j_start ;
-  cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len  ;
  cn = SB_JOURNAL(p_s_sb)->j_first ;
  jindex = 1 ; /* start at one so we don't get the desc again */
-  while(cur_blocks_left > 0) {
+  while(cn) {
+    clear_bit(BH_JNew, &(cn->bh->b_state)) ;
    /* copy all the real blocks into log area.  dirty log blocks */
    if (test_bit(BH_JDirty, &cn->bh->b_state)) {
      struct buffer_head *tmp_bh ;
      tmp_bh =  journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 
 		       ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
-      set_buffer_uptodate(tmp_bh) ;
+      set_buffer_uptodate(tmp_bh);
      memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ;  
+      mark_buffer_dirty(tmp_bh);
      jindex++ ;
-    } else {
-      /* JDirty cleared sometime during transaction.  don't log this one */
-      printk("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
-    }
-    cn = cn->next ;
-    cur_blocks_left-- ;
-  }
-  /* we are done  with both the c_bh and d_bh, but
-  ** c_bh must be written after all other commit blocks,
-  ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
-  */
-  /* now loop through and mark all buffers from this transaction as JDirty_wait
-  ** clear the JDirty bit, clear BH_JNew too.  
-  ** if they weren't JDirty, they weren't logged, just relse them and move on
-  */
-  cn = SB_JOURNAL(p_s_sb)->j_first ; 
-  while(cn) {
-    clear_bit(BH_JNew, &(cn->bh->b_state)) ;
-    if (test_bit(BH_JDirty, &(cn->bh->b_state))) {
      set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ; 
      clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
    } else {
+      /* JDirty cleared sometime during transaction.  don't log this one */
+      reiserfs_warning("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
      brelse(cn->bh) ;
    }
    next = cn->next ;
@@ -3095,30 +3219,17 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
    cn = next ;
  }
-  /* unlock the journal list for committing and flushing */
+  /* we are done  with both the c_bh and d_bh, but
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ;
+  ** c_bh must be written after all other commit blocks,
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ;
+  ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
+  */
-  orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
-  jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; 
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;
-  /* write any buffers that must hit disk before this commit is done */
+  SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
-  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
-		     &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
-  /* honor the flush and async wishes from the caller */
+  /* now it is safe to insert this transaction on the main list */
-  if (flush) {
+  list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list);
+  list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list);
-    flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
+  SB_JOURNAL(p_s_sb)->j_num_work_lists++;
-    flush_journal_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ;  
-  } else if (commit_now) {
-    if (wait_on_commit) {
-      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
-    } else {
-      commit_flush_async(p_s_sb, orig_jindex) ; 
-    }
-  }
  /* reset journal values for the next transaction */
  old_start = SB_JOURNAL(p_s_sb)->j_start ;
@@ -3130,57 +3241,96 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
  SB_JOURNAL(p_s_sb)->j_len = 0 ;
  SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;
  SB_JOURNAL(p_s_sb)->j_trans_id++ ;
+  SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id;
  SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
  SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
  SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ;
  SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
  init_journal_hash(p_s_sb) ; 
+  /* tail conversion targets have to hit the disk before we end the
+   * transaction.  Otherwise a later transaction might repack the tail
+   * before this transaction commits, leaving the data block unflushed and
+   * clean.  If we crash before the later transaction commits, the data
+   * block is lost.
+   */
+  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
+		     &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+  up(&jl->j_commit_lock);
+  /* honor the flush wishes from the caller, simple commits can
+  ** be done outside the journal lock, they are done below
+  */
+  if (flush) {
+    flush_commit_list(p_s_sb, jl, 1) ;
+    flush_journal_list(p_s_sb, jl, 1) ;
+  }
  /* if the next transaction has any chance of wrapping, flush 
  ** transactions that might get overwritten.  If any journal lists are very 
  ** old flush them as well.  
  */
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
+first_jl:
-    jindex = i ;
+  list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
+    temp_jl = JOURNAL_LIST_ENTRY(entry);
-      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
+    if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ; 
+      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
+          temp_jl->j_start)
+      {
+	flush_used_journal_lists(p_s_sb, temp_jl);
+	goto first_jl;
+      } else if ((SB_JOURNAL(p_s_sb)->j_start +
+                  SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
+		  SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+      {
+          /* if we don't cross into the next transaction and we don't
+	   * wrap, there is no way we can overlap any later transactions
+	   * break now
+	   */
+	  break;
      }
-    } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && 
+    } else if ((SB_JOURNAL(p_s_sb)->j_start +
-              (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
+                SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
-      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= 
+		SB_ONDISK_JOURNAL_SIZE(p_s_sb))
-            SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
+    {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ; 
+      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
+            SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
+      {
+	flush_used_journal_lists(p_s_sb, temp_jl);
+	goto first_jl;
+      } else {
+	  /* we don't overlap anything from our start to the end of the
+	   * log, and our wrapped portion doesn't overlap anything at
+	   * the start of the log.  We can break
+	   */
+	  break;
      }
-    } 
-    /* this check should always be run, to send old lists to disk */
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && 
-              SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp < 
-	      (get_seconds() - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ; 
    }
  }
+  flush_old_journal_lists(p_s_sb);
-  /* if the next journal_list is still in use, flush it */
+  SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
-  if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
-    flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ; 
-  }
-  /* we don't want anyone flushing the new transaction's list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + 
-											 SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
-  if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
+  if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
    reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
  }
-  unlock_journal(p_s_sb) ;
  atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
+  unlock_journal(p_s_sb) ;
  /* wake up any body waiting to join. */
+  clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
  wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
+  if (!flush) {
+      if (wait_on_commit) {
+	  if (journal_list_still_alive(p_s_sb, commit_trans_id))
+	      flush_commit_list(p_s_sb, jl, 1) ;
+      } else {
+          queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
+      }
+  }
+out:
+  reiserfs_check_lock_depth("journal end2");
  return 0 ;
 }
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -86,7 +86,6 @@ __u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th)
    }
    journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 1;
    return unused_objectid;
 }
@@ -105,8 +104,6 @@ void reiserfs_release_objectid (struct reiserfs_transaction_handle *th,
    reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
    journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); 
-    s->s_dirt = 1;
    /* start at the beginning of the objectid map (i = 0) and go to
       the end of it (i = disk_sb->s_oid_cursize).  Linear search is

--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -87,7 +87,7 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 	seq_printf(m,	"state: \t%s\n"
-			"mount options: \t%s%s%s%s%s%s%s%s%s%s%s%s\n"
+			"mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
 			"gen. counter: \t%i\n"
 			"s_kmallocs: \t%i\n"
 			"s_disk_reads: \t%i\n"
@@ -131,7 +131,6 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 			reiserfs_test4( sb ) ? "TEST4 " : "",
 			have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ",
 			replay_only( sb ) ? "REPLAY_ONLY " : "",
-			reiserfs_dont_log( sb ) ? "DONT_LOG " : "LOG ",
 			convert_reiserfs( sb ) ? "CONV " : "",
 			atomic_read( &r -> s_generation_counter ),
@@ -370,7 +369,6 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 			"j_first_unflushed_offset: \t%lu\n"
 			"j_last_flush_trans_id: \t%lu\n"
 			"j_trans_start_time: \t%li\n"
-			"j_journal_list_index: \t%i\n"
 			"j_list_bitmap_index: \t%i\n"
 			"j_must_wait: \t%i\n"
 			"j_next_full_flush: \t%i\n"
@@ -416,7 +414,6 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 			JF( j_first_unflushed_offset ),
 			JF( j_last_flush_trans_id ),
 			JF( j_trans_start_time ),
-			JF( j_journal_list_index ),
 			JF( j_list_bitmap_index ),
 			JF( j_must_wait ),
 			JF( j_next_full_flush ),

--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -59,22 +59,26 @@ static int is_any_reiserfs_magic_string (struct reiserfs_super_block * rs)
 static int reiserfs_remount (struct super_block * s, int * flags, char * data);
 static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf);
-static void reiserfs_write_super (struct super_block * s)
+static void reiserfs_sync_fs (struct super_block * s)
 {
+    if (!(s->s_flags & MS_RDONLY)) {
+        struct reiserfs_transaction_handle th;
+	reiserfs_write_lock(s);
+	journal_begin(&th, s, 1);
+	journal_end_sync(&th, s, 1);
+	reiserfs_flush_old_commits(s);
+	s->s_dirt = 0;
+	reiserfs_write_unlock(s);
+    }
+}
-  int dirty = 0 ;
+static void reiserfs_write_super(struct super_block *s)
-  reiserfs_write_lock(s);
+{
-  if (!(s->s_flags & MS_RDONLY)) {
+    reiserfs_sync_fs(s);
-    dirty = flush_old_commits(s, 1) ;
-  }
-  s->s_dirt = dirty;
-  reiserfs_write_unlock(s);
 }
 static void reiserfs_write_super_lockfs (struct super_block * s)
 {
-  int dirty = 0 ;
  struct reiserfs_transaction_handle th ;
  reiserfs_write_lock(s);
  if (!(s->s_flags & MS_RDONLY)) {
@@ -84,7 +88,7 @@ static void reiserfs_write_super_lockfs (struct super_block * s)
    reiserfs_block_writes(&th) ;
    journal_end(&th, s, 1) ;
  }
-  s->s_dirt = dirty;
+  s->s_dirt = 0;
  reiserfs_write_unlock(s);
 }
@@ -805,7 +809,6 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
    reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
    set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state );
    journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 0;
  } else {
    /* remount read-write */
    if (!(s->s_flags & MS_RDONLY))
@@ -822,12 +825,12 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
    set_sb_umount_state( rs, REISERFS_ERROR_FS );
    /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
    journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 0;
    REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ;
  }
  /* this will force a full flush of all journal lists */
  SB_JOURNAL(s)->j_must_wait = 1 ;
  journal_end(&th, s, 10) ;
+  s->s_dirt = 0;
  if (!( *mount_flags & MS_RDONLY ) )
    finish_unfinished( s );
@@ -1392,8 +1395,6 @@ static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
 	/* look for files which were to be removed in previous session */
 	finish_unfinished (s);
-	s->s_dirt = 0;
    } else {
 	if ( old_format_only(s) && !silent) {
 	    reiserfs_warning("reiserfs: using 3.5.x disk format\n") ;

--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1702,23 +1702,39 @@ struct reiserfs_journal_header {
 	 (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
 #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
-/* finds n'th buffer with 0 being the start of this commit.  Needs to go away, j_ap_blocks has changed
-** since I created this.  One chunk of code in journal.c needs changing before deleting it
-*/
-#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
 // We need these to make journal.c code more readable
 #define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+/*
+** transaction handle which is passed around for all journal calls
+*/
+struct reiserfs_transaction_handle {
+  struct super_block *t_super ; /* super for this FS when journal_begin was
+				   called. saves calls to reiserfs_get_super
+				   also used by nested transactions to make
+				   sure they are nesting on the right FS
+				   _must_ be first in the handle
+				*/
+  int t_refcount;
+  int t_blocks_logged ;         /* number of blocks this writer has logged */
+  int t_blocks_allocated ;      /* number of blocks this writer allocated */
+  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
+  void *t_handle_save ;		/* save existing current->journal_info */
+  unsigned int displace_new_blocks:1; /* if new block allocation occurs, that block
+				   should be displaced from others (unsigned: holds 0/1) */
+} ;
+int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+int reiserfs_flush_old_commits(struct super_block *);
 void reiserfs_commit_for_inode(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
 void reiserfs_wait_on_write_block(struct super_block *s) ;
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
 void reiserfs_allow_writes(struct super_block *s) ;
 void reiserfs_check_lock_depth(char *caller) ;
-void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
 void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
 int journal_init(struct super_block *, const char * j_dev_name, int old_format, unsigned int) ;
 int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
@@ -1730,7 +1746,6 @@ int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
-void flush_async_commits(struct super_block *p_s_sb) ;
 int buffer_journaled(const struct buffer_head *bh) ;
 int mark_buffer_journal_new(struct buffer_head *bh) ;

--- a/include/linux/reiserfs_fs_i.h
+++ b/include/linux/reiserfs_fs_i.h
@@ -3,6 +3,8 @@
 #include <linux/list.h>
+struct reiserfs_journal_list;
 /** bitmasks for i_flags field in reiserfs-specific part of inode */
 typedef enum {
    /** this says what format of key do all items (but stat data) of
@@ -48,7 +50,7 @@ struct reiserfs_inode_info {
    ** needs to be committed in order for this inode to be properly
    ** flushed */
    unsigned long i_trans_id ;
-    unsigned long i_trans_index ;
+    struct reiserfs_journal_list *i_jl;
    struct inode vfs_inode;
 };

--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -106,7 +106,6 @@ typedef enum {
 #define JOURNAL_MAX_CNODE   1500 /* max cnodes to allocate. */
 #define JOURNAL_HASH_SIZE 8192   
 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
-#define JOURNAL_LIST_COUNT 64
 /* these are bh_state bit flag offset numbers, for use in the buffer head */
@@ -121,6 +120,7 @@ typedef enum {
 */
 #define BH_JPrepared 20		/* block has been prepared for the log */
 #define BH_JRestore_dirty 22    /* restore the dirty bit later */
+#define BH_JTest 23             /* debugging use only */
 /* One of these for every block in every transaction
 ** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a
@@ -153,26 +153,6 @@ struct reiserfs_list_bitmap {
  struct reiserfs_bitmap_node **bitmaps ;
 } ;
-/*
-** transaction handle which is passed around for all journal calls
-*/
-struct reiserfs_transaction_handle {
-  struct super_block *t_super ; /* super for this FS when journal_begin was
-				   called. saves calls to reiserfs_get_super
-				   also used by nested transactions to make
-				   sure they are nesting on the right FS
-				   _must_ be first in the handle
-				*/
-  int t_refcount;
-  int t_blocks_logged ;         /* number of blocks this writer has logged */
-  int t_blocks_allocated ;      /* number of blocks this writer allocated */
-  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
-  void *t_handle_save ;		/* save existing current->journal_info */
-  int displace_new_blocks:1;	/* if new block allocation occurres, that block
-				   should be displaced from others */
-} ;
 /*
 ** one of these for each transaction.  The most important part here is the j_realblock.
 ** this list of cnodes is used to hash all the blocks in all the commits, to mark all the
@@ -181,23 +161,25 @@ struct reiserfs_transaction_handle {
 ** to be overwritten */
 struct reiserfs_journal_list {
  unsigned long j_start ;
+  unsigned long j_state;
  unsigned long j_len ;
  atomic_t j_nonzerolen ;
  atomic_t j_commit_left ;
-  atomic_t j_flushing ;
-  atomic_t j_commit_flushing ;
  atomic_t j_older_commits_done ;      /* all commits older than this on disk*/
+  struct semaphore j_commit_lock;
  unsigned long j_trans_id ;
  time_t j_timestamp ;
  struct reiserfs_list_bitmap *j_list_bitmap ;
  struct buffer_head *j_commit_bh ; /* commit buffer head */
  struct reiserfs_journal_cnode *j_realblock  ;
  struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans.  free each of these on flush */
-  wait_queue_head_t j_commit_wait ; /* wait for all the commit blocks to be flushed */
+  /* time ordered list of all active transactions */
-  wait_queue_head_t j_flush_wait ; /* wait for all the real blocks to be flushed */
+  struct list_head j_list;
-} ;
-struct reiserfs_page_list  ; /* defined in reiserfs_fs.h */
+  /* time ordered list of all transactions we haven't tried to flush yet */
+  struct list_head j_working_list;
+  int j_refcount;
+} ;
 struct reiserfs_journal {
  struct buffer_head ** j_ap_blocks ; /* journal blocks on disk */
@@ -220,16 +202,11 @@ struct reiserfs_journal {
  unsigned long j_last_flush_trans_id ;    /* last fully flushed journal timestamp */
  struct buffer_head *j_header_bh ;   
-  /* j_flush_pages must be flushed before the current transaction can
-  ** commit
-  */
-  struct reiserfs_page_list *j_flush_pages ;
  time_t j_trans_start_time ;         /* time this transaction started */
-  wait_queue_head_t j_wait ;         /* wait  journal_end to finish I/O */
+  struct semaphore j_lock;
-  atomic_t j_wlock ;                       /* lock for j_wait */
+  struct semaphore j_flush_sem;
  wait_queue_head_t j_join_wait ;    /* wait for current transaction to finish before starting new one */
  atomic_t j_jlock ;                       /* lock for j_join_wait */
-  int j_journal_list_index ;	      /* journal list number of the current trans */
  int j_list_bitmap_index ;	      /* number of next list bitmap to use */
  int j_must_wait ;		       /* no more journal begins allowed. MUST sleep on j_join_wait */
  int j_next_full_flush ;             /* next journal_end will flush all journal list */
@@ -246,19 +223,37 @@ struct reiserfs_journal {
  struct reiserfs_journal_cnode *j_cnode_free_list ;
  struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */
+  struct reiserfs_journal_list *j_current_jl;
  int j_free_bitmap_nodes ;
  int j_used_bitmap_nodes ;
+  int j_num_lists;      /* total number of active transactions */
+  int j_num_work_lists; /* number that need attention from kreiserfsd */
+  /* debugging to make sure things are flushed in order */
+  int j_last_flush_id;
+  /* debugging to make sure things are committed in order */
+  int j_last_commit_id;
  struct list_head j_bitmap_nodes ;
  struct list_head j_dirty_buffers ;
  spinlock_t j_dirty_buffers_lock ; /* protects j_dirty_buffers */
+  /* list of all active transactions */
+  struct list_head j_journal_list;
+  /* lists that haven't been touched by writeback attempts */
+  struct list_head j_working_list;
  struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ;	/* array of bitmaps to record the deleted blocks */
-  struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ;	    /* array of all the journal lists */
  struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; 	    /* hash table for real buffer heads in current trans */ 
  struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all 
  										the transactions */
  struct list_head j_prealloc_list;     /* list of inodes which have preallocated blocks */
  unsigned long j_max_trans_size ;
  unsigned long j_max_batch_size ;
+  struct work_struct j_work;
 };
 #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick.  magic string to find desc blocks in the journal */
@@ -417,7 +412,6 @@ struct reiserfs_sb_info
 #define REISERFS_LARGETAIL 0  /* large tails will be created in a session */
 #define REISERFS_SMALLTAIL 17  /* small (for files less than block size) tails will be created in a session */
 #define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
-#define REISERFS_NOLOG 4      /* -o nolog: turn journalling off */
 #define REISERFS_CONVERT 5    /* -o conv: causes conversion of old
                                 format super block to the new
                                 format. If not specified - old
@@ -473,8 +467,6 @@ struct reiserfs_sb_info
 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
-int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
-int flush_old_commits(struct super_block *s, int) ;
 int reiserfs_resize(struct super_block *, unsigned long) ;
 #define CARRY_ON                0
@@ -484,8 +476,6 @@ int reiserfs_resize(struct super_block *, unsigned long) ;
 #define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
 #define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
 #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list)
-#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index) 
 #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) 
 #define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)