Commit ddc729d4 authored by Dave Kleikamp

Merge bk://linus.bkbits.net/linux-2.5

into hostme.bitkeeper.com:/ua/repos/j/jfs/linux-2.5
parents 18277e88 9bda462a
@@ -19,6 +19,7 @@
 #include <linux/fs.h>
 #include "jfs_incore.h"
+#include "jfs_dmap.h"
 #include "jfs_txnmgr.h"
 #include "jfs_xattr.h"
 #include "jfs_debug.h"
@@ -94,6 +95,47 @@ static void jfs_truncate(struct inode *ip)
 	IWRITE_UNLOCK(ip);
 }
 
+static int jfs_open(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	if ((rc = generic_file_open(inode, file)))
+		return rc;
+
+	/*
+	 * We attempt to allow only one "active" file open per aggregate
+	 * group.  Otherwise, appending to files in parallel can cause
+	 * fragmentation within the files.
+	 *
+	 * If the file is empty, it was probably just created and going
+	 * to be written to.  If it has a size, we'll hold off until the
+	 * file is actually grown.
+	 */
+	if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE &&
+	    (inode->i_size == 0)) {
+		struct jfs_inode_info *ji = JFS_IP(inode);
+		if (ji->active_ag == -1) {
+			ji->active_ag = ji->agno;
+			atomic_inc(
+			    &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]);
+		}
+	}
+
+	return 0;
+}
+
+static int jfs_release(struct inode *inode, struct file *file)
+{
+	struct jfs_inode_info *ji = JFS_IP(inode);
+
+	if (ji->active_ag != -1) {
+		struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
+		atomic_dec(&bmap->db_active[ji->active_ag]);
+		ji->active_ag = -1;
+	}
+
+	return 0;
+}
+
 struct inode_operations jfs_file_inode_operations = {
 	.truncate	= jfs_truncate,
 	.setxattr	= jfs_setxattr,
@@ -103,7 +145,7 @@ struct inode_operations jfs_file_inode_operations = {
 };
 
 struct file_operations jfs_file_operations = {
-	.open		= generic_file_open,
+	.open		= jfs_open,
 	.llseek		= generic_file_llseek,
 	.write		= generic_file_write,
 	.read		= generic_file_read,
@@ -112,4 +154,5 @@ struct file_operations jfs_file_operations = {
 	.writev		= generic_file_writev,
 	.sendfile	= generic_file_sendfile,
 	.fsync		= jfs_fsync,
+	.release	= jfs_release,
 };
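
The jfs_open()/jfs_release() pair above is one half of the new db_active[] accounting; the other half lives in extBalloc() further down, which can move a file's pin to a different AG as the file grows. A minimal user-space sketch of the intended invariant, with plain ints standing in for the kernel's atomic_t and the struct and function names purely illustrative:

	/* Model of the db_active[] accounting added above. */
	#include <assert.h>
	#include <stdio.h>

	#define MAXAG 128

	static int db_active[MAXAG];	/* writers per allocation group */

	struct inode {
		int agno;	/* home allocation group */
		int active_ag;	/* AG this file is growing in, or -1 */
	};

	/* jfs_open(): an empty file opened for write pins its home AG */
	static void model_open(struct inode *ip)
	{
		if (ip->active_ag == -1) {
			ip->active_ag = ip->agno;
			db_active[ip->agno]++;
		}
	}

	/* jfs_release(): drop whatever AG the file ended up growing in */
	static void model_release(struct inode *ip)
	{
		if (ip->active_ag != -1) {
			db_active[ip->active_ag]--;
			ip->active_ag = -1;
		}
	}

	int main(void)
	{
		struct inode a = { .agno = 3, .active_ag = -1 };

		model_open(&a);
		assert(db_active[3] == 1);	/* AG 3 now looks "busy" */
		model_release(&a);
		assert(db_active[3] == 0);	/* balanced again */
		printf("open/release accounting balances\n");
		return 0;
	}

The dbUnmount() hunk below checks exactly this balance at unmount time and logs any counter left non-zero.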
@@ -241,6 +241,7 @@ int dbMount(struct inode *ipbmap)
 	bmp->db_ipbmap = ipbmap;
 	JFS_SBI(ipbmap->i_sb)->bmap = bmp;
+	memset(bmp->db_active, 0, sizeof(bmp->db_active));
 	DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap);
 
 	/*
@@ -271,6 +272,7 @@ int dbMount(struct inode *ipbmap)
 int dbUnmount(struct inode *ipbmap, int mounterror)
 {
 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+	int i;
 
 	if (!(mounterror || isReadOnly(ipbmap)))
 		dbSync(ipbmap);
@@ -280,6 +282,14 @@ int dbUnmount(struct inode *ipbmap, int mounterror)
 	 */
 	truncate_inode_pages(ipbmap->i_mapping, 0);
 
+	/*
+	 * Sanity Check
+	 */
+	for (i = 0; i < bmp->db_numag; i++)
+		if (atomic_read(&bmp->db_active[i]))
+			printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n",
+			       i, atomic_read(&bmp->db_active[i]));
+
 	/* free the memory for the in-memory bmap. */
 	kfree(bmp);
@@ -598,102 +608,77 @@ dbUpdatePMap(struct inode *ipbmap,
  *
  * FUNCTION:	find the preferred allocation group for new allocations.
  *
- *		we try to keep the trailing (rightmost) allocation groups
- *		free for large allocations.  we try to do this by targeting
- *		new inode allocations towards the leftmost or 'active'
- *		allocation groups while keeping the rightmost or 'inactive'
- *		allocation groups free. once the active allocation groups
- *		have dropped to a certain percentage of free space, we add
- *		the leftmost inactive allocation group to the active set.
- *
- *		within the active allocation groups, we maintain a preferred
+ *		Within the allocation groups, we maintain a preferred
  *		allocation group which consists of a group with at least
- *		average free space over the active set.  it is the preferred
- *		group that we target new inode allocation towards.  the
- *		tie-in between inode allocation and block allocation occurs
- *		as we allocate the first (data) block of an inode and specify
- *		the inode (block) as the allocation hint for this block.
+ *		average free space.  It is the preferred group that we target
+ *		new inode allocation towards.  The tie-in between inode
+ *		allocation and block allocation occurs as we allocate the
+ *		first (data) block of an inode and specify the inode (block)
+ *		as the allocation hint for this block.
+ *
+ *		We try to avoid having more than one open file growing in
+ *		an allocation group, as this will lead to fragmentation.
+ *		This differs from the old OS/2 method of trying to keep
+ *		empty ags around for large allocations.
  *
  * PARAMETERS:
  *	ipbmap	- pointer to in-core inode for the block map.
  *
  * RETURN VALUES:
  *	the preferred allocation group number.
- *
- * note: only called by dbAlloc();
  */
 int dbNextAG(struct inode *ipbmap)
 {
-	s64 avgfree, inactfree, actfree, rem;
-	int actags, inactags, l2agsize;
+	s64 avgfree;
+	int agpref;
+	s64 hwm = 0;
+	int i;
+	int next_best = -1;
 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
 
 	BMAP_LOCK(bmp);
 
-	/* determine the number of active allocation groups (i.e.
-	 * the number of allocation groups up to and including
-	 * the rightmost allocation group with blocks allocated
-	 * in it.
-	 */
-	actags = bmp->db_maxag + 1;
-	assert(actags <= bmp->db_numag);
+	/* determine the average number of free blocks within the ags. */
+	avgfree = (u32)bmp->db_nfree / bmp->db_numag;
 
-	/* get the number of inactive allocation groups (i.e. the
-	 * number of allocation group following the rightmost group
-	 * with allocation in it.
+	/*
+	 * if the current preferred ag does not have an active allocator
+	 * and has at least average freespace, return it
 	 */
-	inactags = bmp->db_numag - actags;
+	agpref = bmp->db_agpref;
+	if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
+	    (bmp->db_agfree[agpref] >= avgfree))
+		goto found;
 
-	/* determine how many blocks are in the inactive allocation
-	 * groups. in doing this, we must account for the fact that
-	 * the rightmost group might be a partial group (i.e. file
-	 * system size is not a multiple of the group size).
+	/* From the last preferred ag, find the next one with at least
	 * average free space.
 	 */
-	l2agsize = bmp->db_agl2size;
-	rem = bmp->db_mapsize & (bmp->db_agsize - 1);
-	inactfree = (inactags
-		     && rem) ? ((inactags - 1) << l2agsize) +
-	    rem : inactags << l2agsize;
+	for (i = 0; i < bmp->db_numag; i++, agpref++) {
+		if (agpref == bmp->db_numag)
+			agpref = 0;
 
-	/* now determine how many free blocks are in the active
-	 * allocation groups plus the average number of free blocks
-	 * within the active ags.
-	 */
-	actfree = bmp->db_nfree - inactfree;
-	avgfree = (u32) actfree / (u32) actags;
-
-	/* check if not all of the allocation groups are active.
-	 */
-	if (actags < bmp->db_numag) {
-		/* not all of the allocation groups are active.  determine
-		 * if we should extend the active set by 1 (i.e. add the
-		 * group following the current active set).  we do so if
-		 * the number of free blocks within the active set is less
-		 * than the allocation group set and average free within
-		 * the active set is less than 60%.  we activate a new group
-		 * by setting the allocation group preference to the new
-		 * group.
-		 */
-		if (actfree < bmp->db_agsize &&
-		    ((avgfree * 100) >> l2agsize) < 60)
-			bmp->db_agpref = actags;
-	} else {
-		/* all allocation groups are in the active set.  check if
-		 * the preferred allocation group has average free space.
-		 * if not, re-establish the preferred group as the leftmost
-		 * group with average free space.
-		 */
-		if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
-			for (bmp->db_agpref = 0; bmp->db_agpref < actags;
-			     bmp->db_agpref++) {
-				if (bmp->db_agfree[bmp->db_agpref] <=
-				    avgfree)
-					break;
-			}
-			assert(bmp->db_agpref < bmp->db_numag);
+		if (atomic_read(&bmp->db_active[agpref]))
+			/* open file is currently growing in this ag */
+			continue;
+		if (bmp->db_agfree[agpref] >= avgfree)
+			goto found;
+		else if (bmp->db_agfree[agpref] > hwm) {
+			hwm = bmp->db_agfree[agpref];
+			next_best = agpref;
 		}
 	}
 
+	/*
+	 * If no inactive ag was found with average freespace, use the
+	 * next best
+	 */
+	if (next_best != -1)
+		agpref = next_best;
+	/* else agpref should be back to its original value */
+
+found:
+	bmp->db_agpref = agpref;
 	BMAP_UNLOCK(bmp);
 
 	/* return the preferred group.
@@ -701,7 +686,6 @@ int dbNextAG(struct inode *ipbmap)
 	 */
 	return (bmp->db_agpref);
 }
 
-
 /*
  * NAME:	dbAlloc()
  *
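
To make the new dbNextAG() policy concrete, here is a stand-alone model of the selection loop: start at the current preference, skip any AG with an active writer, return the first AG with at least average free space, and otherwise fall back to the fullest non-busy AG seen (the hwm/next_best pair). Locking and the found: bookkeeping are omitted, and all names are illustrative:

	#include <stdio.h>

	static int next_ag(const long *agfree, const int *active,
			   int numag, int agpref, long nfree)
	{
		long avgfree = nfree / numag, hwm = 0;
		int i, ag, next_best = -1;

		/* current preference is fine if idle and at least average-free */
		if (!active[agpref] && agfree[agpref] >= avgfree)
			return agpref;

		/* round-robin scan starting at the preference */
		for (i = 0; i < numag; i++) {
			ag = (agpref + i) % numag;
			if (active[ag])
				continue;	/* an open file is growing here */
			if (agfree[ag] >= avgfree)
				return ag;	/* first AG with >= average free */
			if (agfree[ag] > hwm) {
				hwm = agfree[ag];	/* remember the next best */
				next_best = ag;
			}
		}

		/* nothing had average free space: next best, else keep the pref */
		return next_best != -1 ? next_best : agpref;
	}

	int main(void)
	{
		long agfree[4] = { 900, 100, 400, 600 };	/* free blocks per AG */
		int active[4]  = { 1, 0, 0, 0 };		/* a writer in AG 0 */

		/* avg = 2000/4 = 500: AG 0 is busy, AGs 1-2 are below
		 * average, so AG 3 is chosen */
		printf("preferred AG = %d\n", next_ag(agfree, active, 4, 0, 2000));
		return 0;
	}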
@@ -750,6 +734,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	struct dmap *dp;
 	int l2nb;
 	s64 mapSize;
+	int writers;
 
 	/* assert that nblocks is valid */
 	assert(nblocks > 0);
@@ -774,11 +759,10 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	/* the hint should be within the map */
 	assert(hint < mapSize);
 
-	/* if no hint was specified or the number of blocks to be
-	 * allocated is greater than the allocation group size, try
-	 * to allocate anywhere.
+	/* if the number of blocks to be allocated is greater than the
+	 * allocation group size, try to allocate anywhere.
 	 */
-	if (hint == 0 || l2nb > bmp->db_agl2size) {
+	if (l2nb > bmp->db_agl2size) {
 		IWRITE_LOCK(ipbmap);
 
 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
@@ -790,39 +774,34 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 		goto write_unlock;
 	}
 
+	/*
+	 * If no hint, let dbNextAG recommend an allocation group
+	 */
+	if (hint == 0)
+		goto pref_ag;
+
 	/* we would like to allocate close to the hint.  adjust the
 	 * hint to the block following the hint since the allocators
 	 * will start looking for free space starting at this point.
-	 * if the hint was the last block of the file system, try to
-	 * allocate in the same allocation group as the hint.
 	 */
 	blkno = hint + 1;
-	if (blkno >= bmp->db_mapsize) {
-		blkno--;
-		goto tryag;
-	}
+
+	if (blkno >= bmp->db_mapsize)
+		goto pref_ag;
+
+	agno = blkno >> bmp->db_agl2size;
 
 	/* check if blkno crosses over into a new allocation group.
 	 * if so, check if we should allow allocations within this
-	 * allocation group. we try to keep the trailing (rightmost)
-	 * allocation groups of the file system free for large
-	 * allocations and may want to prevent this allocation from
-	 * spilling over into this space.
-	 */
-	if ((blkno & (bmp->db_agsize - 1)) == 0) {
-		/* check if the AG is beyond the rightmost AG with
-		 * allocations in it. if so, call dbNextAG() to
-		 * determine if the allocation should be allowed
-		 * to proceed within this AG or should be targeted
-		 * to another AG.
+	 * allocation group.
 	 */
-		agno = blkno >> bmp->db_agl2size;
-		if (agno > bmp->db_maxag) {
-			agno = dbNextAG(ipbmap);
-			blkno = (s64) agno << bmp->db_agl2size;
-			goto tryag;
-		}
-	}
+	if ((blkno & (bmp->db_agsize - 1)) == 0)
+		/* check if the AG is currently being written to.
+		 * if so, call dbNextAG() to find a non-busy
+		 * AG with sufficient free space.
+		 */
+		if (atomic_read(&bmp->db_active[agno]))
+			goto pref_ag;
 
 	/* check if the allocation request size can be satisfied from a
 	 * single dmap. if so, try to allocate from the dmap containing
@@ -844,9 +823,8 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 		/* first, try to satisfy the allocation request with the
 		 * blocks beginning at the hint.
 		 */
-		if ((rc =
-		     dbAllocNext(bmp, dp, blkno,
-				 (int) nblocks)) != ENOSPC) {
+		if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
+		    != ENOSPC) {
 			if (rc == 0) {
 				*results = blkno;
 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
@@ -858,12 +836,23 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 			goto read_unlock;
 		}
 
+		writers = atomic_read(&bmp->db_active[agno]);
+		if ((writers > 1) ||
+		    ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
+			/*
+			 * Someone else is writing in this allocation
+			 * group.  To avoid fragmenting, try another ag
+			 */
+			release_metapage(mp);
+			IREAD_UNLOCK(ipbmap);
+			goto pref_ag;
+		}
+
 		/* next, try to satisfy the allocation request with blocks
 		 * near the hint.
 		 */
 		if ((rc =
-		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb,
-				 results))
+		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
 		    != ENOSPC) {
 			if (rc == 0) {
 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
@@ -876,10 +865,9 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 		}
 
 		/* try to satisfy the allocation request with blocks within
-		 * the same allocation group as the hint.
+		 * the same dmap as the hint.
 		 */
-		if ((rc =
-		     dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
+		if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
 		    != ENOSPC) {
 			if (rc == 0) {
 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
@@ -895,14 +883,30 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 		IREAD_UNLOCK(ipbmap);
 	}
 
-      tryag:
+	/* try to satisfy the allocation request with blocks within
+	 * the same allocation group as the hint.
+	 */
+	IWRITE_LOCK(ipbmap);
+	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results))
+	    != ENOSPC) {
+		if (rc == 0)
+			DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+				*results, nblocks);
+		goto write_unlock;
+	}
+	IWRITE_UNLOCK(ipbmap);
+
+      pref_ag:
+	/*
+	 * Let dbNextAG recommend a preferred allocation group
+	 */
+	agno = dbNextAG(ipbmap);
 	IWRITE_LOCK(ipbmap);
 
-	/* determine the allocation group number of the hint and try to
-	 * allocate within this allocation group. if that fails, try to
+	/* Try to allocate within this allocation group.  if that fails, try to
 	 * allocate anywhere in the map.
 	 */
-	agno = blkno >> bmp->db_agl2size;
 	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == ENOSPC)
 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
 
 	if (rc == 0) {
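
Taken together, these dbAlloc() hunks replace the old tryag: path with a fixed ladder of attempts, where pref_ag is the common bail-out whenever the hint's AG is busy. A compressed sketch of that control flow, with a hypothetical try_step() stub in place of the real allocators, the single-dmap size check folded away, and all locking omitted:

	#include <stdbool.h>
	#include <stdio.h>

	/* Stand-in for dbAllocNext/dbAllocNear/dbAllocDmapLev/dbAllocAG/
	 * dbAllocAny: trace the attempt, report failure so the whole
	 * ladder is shown. */
	static bool try_step(const char *name)
	{
		printf("try %s\n", name);
		return false;
	}

	static bool model_dbAlloc(bool have_hint, bool huge_request,
				  bool hint_ag_busy, bool other_writers)
	{
		if (huge_request)		/* l2nb > db_agl2size */
			return try_step("dbAllocAny (whole map)");
		if (!have_hint || hint_ag_busy)	/* hint == 0, or the hint sits
						 * on an AG boundary and that
						 * AG has an active writer */
			goto pref_ag;

		if (try_step("dbAllocNext (blocks at the hint)"))
			return true;
		if (other_writers)		/* the new writers check */
			goto pref_ag;
		if (try_step("dbAllocNear (blocks near the hint)"))
			return true;
		if (try_step("dbAllocDmapLev (same dmap as the hint)"))
			return true;
		if (try_step("dbAllocAG (same AG as the hint)"))
			return true;

	pref_ag:
		/* let dbNextAG() pick a quiet AG, then fall back to anywhere */
		if (try_step("dbAllocAG (AG from dbNextAG)"))
			return true;
		return try_step("dbAllocAny (whole map)");
	}

	int main(void)
	{
		model_dbAlloc(true, false, false, false);  /* ordinary hinted case */
		return 0;
	}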
@@ -2314,11 +2318,9 @@ static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 	 * if so, establish the new maximum allocation group number by
 	 * searching left for the first allocation group with allocation.
 	 */
-	if ((bmp->db_agfree[agno] == bmp->db_agsize
-	     && agno == bmp->db_maxag) || (agno == bmp->db_numag - 1
-					   && bmp->db_agfree[agno] ==
-					   (bmp-> db_mapsize &
-					    (BPERDMAP - 1)))) {
+	if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
+	    (agno == bmp->db_numag - 1 &&
+	     bmp->db_agfree[agno] == (bmp->db_mapsize & (BPERDMAP - 1)))) {
 		while (bmp->db_maxag > 0) {
 			bmp->db_maxag -= 1;
 			if (bmp->db_agfree[bmp->db_maxag] !=
...
@@ -227,6 +227,7 @@ struct bmap {
 	struct dbmap db_bmap;		/* on-disk aggregate map descriptor */
 	struct inode *db_ipbmap;	/* ptr to aggregate map incore inode */
 	struct semaphore db_bmaplock;	/* aggregate map lock */
+	atomic_t db_active[MAXAG];	/* count of active, open files in AG */
 	u32 *db_DBmap;
 };
...
@@ -514,9 +514,12 @@ int extFill(struct inode *ip, xad_t * xp)
 static int
 extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 {
+	struct jfs_inode_info *ji = JFS_IP(ip);
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
 	s64 nb, nblks, daddr, max;
-	int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
-	struct bmap *mp = JFS_SBI(ip->i_sb)->bmap;
+	int rc, nbperpage = sbi->nbperpage;
+	struct bmap *bmp = sbi->bmap;
+	int ag;
 
 	/* get the number of blocks to initially attempt to allocate.
 	 * we'll first try the number of blocks requested unless this
@@ -524,7 +527,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 	 * blocks in the map. in that case, we'll start off with the
 	 * maximum free.
 	 */
-	max = (s64) 1 << mp->db_maxfreebud;
+	max = (s64) 1 << bmp->db_maxfreebud;
 	if (*nblocks >= max && *nblocks > nbperpage)
 		nb = nblks = (max > nbperpage) ? max : nbperpage;
 	else
@@ -549,6 +552,18 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 	*nblocks = nb;
 	*blkno = daddr;
 
+	if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) {
+		ag = BLKTOAG(daddr, sbi);
+
+		if (ji->active_ag == -1) {
+			atomic_inc(&bmp->db_active[ag]);
+			ji->active_ag = ag;
+		} else if (ji->active_ag != ag) {
+			atomic_dec(&bmp->db_active[ji->active_ag]);
+			atomic_inc(&bmp->db_active[ag]);
+			ji->active_ag = ag;
+		}
+	}
+
 	return (0);
 }
...
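
extBalloc() (in jfs_extent.c) is where a file's pinned AG can change: when the block allocator places an extent in a different AG than the one recorded in active_ag, the counters are handed off. A user-space model of that handoff, with illustrative names and no locking:

	#include <assert.h>

	#define MAXAG 128

	static int db_active[MAXAG];

	struct inode { int active_ag; };	/* -1 when not pinned to an AG */

	static void model_extent_allocated(struct inode *ip, int ag)
	{
		if (ip->active_ag == -1) {
			db_active[ag]++;		/* first data block: pin AG */
			ip->active_ag = ag;
		} else if (ip->active_ag != ag) {
			db_active[ip->active_ag]--;	/* moved: release old AG */
			db_active[ag]++;		/* ... and pin the new one */
			ip->active_ag = ag;
		}
	}

	int main(void)
	{
		struct inode f = { .active_ag = -1 };

		model_extent_allocated(&f, 2);	/* first extent pins AG 2 */
		model_extent_allocated(&f, 2);	/* same AG: counters unchanged */
		model_extent_allocated(&f, 5);	/* allocator moved the file */
		assert(db_active[2] == 0 && db_active[5] == 1 && f.active_ag == 5);
		return 0;
	}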
@@ -429,6 +429,7 @@ int diRead(struct inode *ip)
 
 	/* set the ag for the inode */
 	JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
+	JFS_IP(ip)->active_ag = -1;
 
 	return (rc);
 }
@@ -1358,6 +1359,7 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
 	DBG_DIALLOC(JFS_IP(ipimap)->i_imap, ip->i_ino);
 	jfs_ip->ixpxd = iagp->inoext[extno];
 	jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+	jfs_ip->active_ag = -1;
 }
@@ -1413,6 +1415,21 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
 	 * moving backward on the disk.)  compute the hint within the
 	 * file system and the iag.
 	 */
+
+	/* get the ag number of this iag */
+	agno = JFS_IP(pip)->agno;
+
+	if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
+		/*
+		 * There is an open file actively growing.  We want to
+		 * allocate new inodes from a different ag to avoid
+		 * fragmentation problems.
+		 */
+		agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
+		AG_LOCK(imap, agno);
+		goto tryag;
+	}
+
 	inum = pip->i_ino + 1;
 	ino = inum & (INOSPERIAG - 1);
@@ -1420,9 +1437,6 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
 	if (ino == 0)
 		inum = pip->i_ino;
 
-	/* get the ag number of this iag */
-	agno = JFS_IP(pip)->agno;
-
 	/* lock the AG inode map information */
 	AG_LOCK(imap, agno);
...
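
The diAlloc() hunks apply the same idea to inode placement: a new inode normally lands in its parent's AG, but an AG with an actively growing file is skipped via dbNextAG(). A toy model of the decision; the stub next_ag() here just picks the first idle AG, whereas the real dbNextAG() also weighs free space:

	#include <stdio.h>

	#define NUMAG 4

	static int db_active[NUMAG];

	/* stand-in for dbNextAG(): first idle AG */
	static int next_ag(void)
	{
		for (int i = 0; i < NUMAG; i++)
			if (db_active[i] == 0)
				return i;
		return 0;
	}

	static int place_inode(int parent_agno)
	{
		if (db_active[parent_agno])	/* open file growing here */
			return next_ag();	/* avoid fragmenting it */
		return parent_agno;		/* default: stay near parent */
	}

	int main(void)
	{
		db_active[1] = 1;		/* someone is appending in AG 1 */
		printf("parent in AG 1 -> child inode in AG %d\n", place_inode(1));
		printf("parent in AG 2 -> child inode in AG %d\n", place_inode(2));
		return 0;
	}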
@@ -49,7 +49,7 @@ struct jfs_inode_info {
 	long	cflag;		/* commit flags		*/
 	u16	bxflag;		/* xflag of pseudo buffer?	*/
 	unchar	agno;		/* ag number		*/
-	unchar	pad;		/* pad			*/
+	signed char active_ag;	/* ag currently allocating from	*/
 	lid_t	blid;		/* lid of pseudo buffer?	*/
 	lid_t	atlhead;	/* anonymous tlock list head	*/
 	lid_t	atltail;	/* anonymous tlock list tail	*/
...
@@ -406,6 +406,7 @@ static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
 		init_rwsem(&jfs_ip->rdwrlock);
 		init_MUTEX(&jfs_ip->commit_sem);
 		jfs_ip->atlhead = 0;
+		jfs_ip->active_ag = -1;
 		inode_init_once(&jfs_ip->vfs_inode);
 	}
 }
...