JFS: Avoid parallel allocations within the same allocation group

When large files are written in parallel, allocating the space for these files within the same allocation group can cause severe fragmentation of the files. By keeping track of open, growing files within an allocation group, we can force other new allocations into a different allocation group to avoid this.

JFS: Avoid parallel allocations within the same allocation group
When large files are written in parallel, allocating the space for these files within the same allocation group can cause severe fragmentation of the files. By keeping track of open, growing files within an allocation group, we can force other new allocations into a different allocation group to avoid this.
2f86142b · Dave Kleikamp · d1cd8c07 · 2f86142b · 2f86142b · 2f86142b
Commit 2f86142b authored Sep 18, 2002 by Dave Kleikamp
7 changed files
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -19,6 +19,7 @@
 #include <linux/fs.h>
 #include "jfs_incore.h"
+#include "jfs_dmap.h"
 #include "jfs_txnmgr.h"
 #include "jfs_xattr.h"
 #include "jfs_debug.h"
@@ -94,6 +95,42 @@ static void jfs_truncate(struct inode *ip)
 	IWRITE_UNLOCK(ip);
 }
+static int jfs_open(struct inode *inode, struct file *file)
+{
+	int rc;
+	if ((rc = generic_file_open(inode, file)))
+		return rc;
+	/*
+	 * We attempt to allow only one "active" file open per aggregate
+	 * group.  Otherwise, appending to files in parallel can cause
+	 * fragmentation within the files.
+	 */
+	if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE) {
+		struct jfs_inode_info *ji = JFS_IP(inode);
+		if (ji->active_ag == -1) {
+			ji->active_ag = ji->agno;
+			atomic_inc(
+			    &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]);
+		}
+	}
+	return 0;
+}
+static int jfs_release(struct inode *inode, struct file *file)
+{
+	struct jfs_inode_info *ji = JFS_IP(inode);
+	if (ji->active_ag != -1) {
+		struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
+		atomic_dec(&bmap->db_active[ji->active_ag]);
+		ji->active_ag = -1;
+	}
+	return 0;
+}
 struct inode_operations jfs_file_inode_operations = {
 	.truncate	= jfs_truncate,
 	.setxattr	= jfs_setxattr,
@@ -103,7 +140,7 @@ struct inode_operations jfs_file_inode_operations = {
 };
 struct file_operations jfs_file_operations = {
-	.open		= generic_file_open,
+	.open		= jfs_open,
 	.llseek		= generic_file_llseek,
 	.write		= generic_file_write,
 	.read		= generic_file_read,
@@ -112,4 +149,5 @@ struct file_operations jfs_file_operations = {
 	.writev		= generic_file_writev,
 	.sendfile	= generic_file_sendfile,
 	.fsync		= jfs_fsync,
+	.release	= jfs_release,
 };
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -241,6 +241,7 @@ int dbMount(struct inode *ipbmap)
 	bmp->db_ipbmap = ipbmap;
 	JFS_SBI(ipbmap->i_sb)->bmap = bmp;
+	memset(bmp->db_active, 0, sizeof(bmp->db_active));
 	DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap);
 	/*
@@ -271,6 +272,7 @@ int dbMount(struct inode *ipbmap)
 int dbUnmount(struct inode *ipbmap, int mounterror)
 {
 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+	int i;
 	if (!(mounterror || isReadOnly(ipbmap)))
 		dbSync(ipbmap);
@@ -280,6 +282,14 @@ int dbUnmount(struct inode *ipbmap, int mounterror)
 	 */
 	truncate_inode_pages(ipbmap->i_mapping, 0);
+	/*
+	 * Sanity Check
+	 */
+	for (i = 0; i < bmp->db_numag; i++)
+		if (atomic_read(&bmp->db_active[i]))
+			printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n",
+			       i, atomic_read(&bmp->db_active[i]));
 	/* free the memory for the in-memory bmap. */
 	kfree(bmp);
@@ -598,102 +608,77 @@ dbUpdatePMap(struct inode *ipbmap,
 *
 * FUNCTION:    find the preferred allocation group for new allocations.
 *
- *		we try to keep the trailing (rightmost) allocation groups
+ *		Within the allocation groups, we maintain a preferred
- *		free for large allocations.  we try to do this by targeting
- *		new inode allocations towards the leftmost or 'active'
- *		allocation groups while keeping the rightmost or 'inactive'
- *		allocation groups free. once the active allocation groups
- *		have dropped to a certain percentage of free space, we add
- *		the leftmost inactive allocation group to the active set.
- *
- *		within the active allocation groups, we maintain a preferred
 *		allocation group which consists of a group with at least
- *		average free space over the active set. it is the preferred
+ *		average free space.  It is the preferred group that we target
- *		group that we target new inode allocation towards.  the 
+ *		new inode allocation towards.  The tie-in between inode
- *		tie-in between inode allocation and block allocation occurs
+ *		allocation and block allocation occurs as we allocate the
- *		as we allocate the first (data) block of an inode and specify
+ *		first (data) block of an inode and specify the inode (block)
- *		the inode (block) as the allocation hint for this block.
+ *		as the allocation hint for this block.
+ *
+ *		We try to avoid having more than one open file growing in
+ *		an allocation group, as this will lead to fragmentation.
+ *		This differs from the old OS/2 method of trying to keep
+ *		empty ags around for large allocations.
 *
 * PARAMETERS:
 *      ipbmap	-  pointer to in-core inode for the block map.
 *
 * RETURN VALUES:
 *      the preferred allocation group number.
- *
- * note: only called by dbAlloc();
 */
 int dbNextAG(struct inode *ipbmap)
 {
-	s64 avgfree, inactfree, actfree, rem;
+	s64 avgfree;
-	int actags, inactags, l2agsize;
+	int agpref;
+	s64 hwm = 0;
+	int i;
+	int next_best = -1;
 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
 	BMAP_LOCK(bmp);
-	/* determine the number of active allocation groups (i.e.
+	/* determine the average number of free blocks within the ags. */
-	 * the number of allocation groups up to and including
+	avgfree = (u32)bmp->db_nfree / bmp->db_numag;
-	 * the rightmost allocation group with blocks allocated
-	 * in it.
-	 */
-	actags = bmp->db_maxag + 1;
-	assert(actags <= bmp->db_numag);
-	/* get the number of inactive allocation groups (i.e. the
-	 * number of allocation group following the rightmost group
-	 * with allocation in it.
-	 */
-	inactags = bmp->db_numag - actags;
-	/* determine how many blocks are in the inactive allocation
+	/*
-	 * groups. in doing this, we must account for the fact that
+	 * if the current preferred ag does not have an active allocator
-	 * the rightmost group might be a partial group (i.e. file
+	 * and has at least average freespace, return it
-	 * system size is not a multiple of the group size).
 	 */
-	l2agsize = bmp->db_agl2size;
+	agpref = bmp->db_agpref;
-	rem = bmp->db_mapsize & (bmp->db_agsize - 1);
+	if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
-	inactfree = (inactags
+	    (bmp->db_agfree[agpref] >= avgfree))
-		     && rem) ? ((inactags - 1) << l2agsize) +
+		goto found;
-	    rem : inactags << l2agsize;
-	/* now determine how many free blocks are in the active
+	/* From the last preferred ag, find the next one with at least
-	 * allocation groups plus the average number of free blocks
+	 * average free space.
-	 * within the active ags.
 	 */
-	actfree = bmp->db_nfree - inactfree;
+	for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
-	avgfree = (u32) actfree / (u32) actags;
+		if (agpref == bmp->db_numag)
+			agpref = 0;
-	/* check if not all of the allocation groups are active.
+		if (atomic_read(&bmp->db_active[agpref]))
-	 */
+			/* open file is currently growing in this ag */
-	if (actags < bmp->db_numag) {
+			continue;
-		/* not all of the allocation groups are active.  determine
+		if (bmp->db_agfree[agpref] >= avgfree)
-		 * if we should extend the active set by 1 (i.e. add the
+			goto found;
-		 * group following the current active set).  we do so if
+		else if (bmp->db_agfree[agpref] > hwm) {
-		 * the number of free blocks within the active set is less
+			hwm = bmp->db_agfree[agpref];
-		 * than the allocation group set and average free within
+			next_best = agpref;
-		 * the active set is less than 60%.  we activate a new group
-		 * by setting the allocation group preference to the new
-		 * group.
-		 */
-		if (actfree < bmp->db_agsize &&
-		    ((avgfree * 100) >> l2agsize) < 60)
-			bmp->db_agpref = actags;
-	} else {
-		/* all allocation groups are in the active set.  check if
-		 * the preferred allocation group has average free space.
-		 * if not, re-establish the preferred group as the leftmost
-		 * group with average free space.
-		 */
-		if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
-			for (bmp->db_agpref = 0; bmp->db_agpref < actags;
-			     bmp->db_agpref++) {
-				if (bmp->db_agfree[bmp->db_agpref] <=
-				    avgfree)
-					break;
-			}
-			assert(bmp->db_agpref < bmp->db_numag);
 		}
 	}
+	/*
+	 * If no inactive ag was found with average freespace, use the
+	 * next best
+	 */
+	if (next_best != -1)
+		agpref = next_best;
+	/* else agpref should be back to its original value */
+found:
+	bmp->db_agpref = agpref;
 	BMAP_UNLOCK(bmp);
 	/* return the preferred group.
@@ -701,7 +686,6 @@ int dbNextAG(struct inode *ipbmap)
 	return (bmp->db_agpref);
 }
 /*
 * NAME:	dbAlloc()
 *
@@ -750,6 +734,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	struct dmap *dp;
 	int l2nb;
 	s64 mapSize;
+	int writers;
 	/* assert that nblocks is valid */
 	assert(nblocks > 0);
@@ -774,11 +759,10 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	/* the hint should be within the map */
 	assert(hint < mapSize);
-	/* if no hint was specified or the number of blocks to be
+	/* if the number of blocks to be allocated is greater than the
-	 * allocated is greater than the allocation group size, try
+	 * allocation group size, try to allocate anywhere.
-	 * to allocate anywhere.
 	 */
-	if (hint == 0 || l2nb > bmp->db_agl2size) {
+	if (l2nb > bmp->db_agl2size) {
 		IWRITE_LOCK(ipbmap);
 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
@@ -790,39 +774,34 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 		goto write_unlock;
 	}
+	/*
+	 * If no hint, let dbNextAG recommend an allocation group
+	 */
+	if (hint == 0)
+		goto pref_ag;
 	/* we would like to allocate close to the hint.  adjust the
 	 * hint to the block following the hint since the allocators
 	 * will start looking for free space starting at this point.
-	 * if the hint was the last block of the file system, try to
-	 * allocate in the same allocation group as the hint.
 	 */
 	blkno = hint + 1;
-	if (blkno >= bmp->db_mapsize) {
-		blkno--;
+	if (blkno >= bmp->db_mapsize)
-		goto tryag;
+		goto pref_ag;
-	}
+	agno = blkno >> bmp->db_agl2size;
 	/* check if blkno crosses over into a new allocation group.
 	 * if so, check if we should allow allocations within this
-	 * allocation group.  we try to keep the trailing (rightmost)
+	 * allocation group.
-	 * allocation groups of the file system free for large
-	 * allocations and may want to prevent this allocation from
-	 * spilling over into this space.
 	 */
-	if ((blkno & (bmp->db_agsize - 1)) == 0) {
+	if ((blkno & (bmp->db_agsize - 1)) == 0)
-		/* check if the AG is beyond the rightmost AG with
+		/* check if the AG is currenly being written to.
-		 * allocations in it.  if so, call dbNextAG() to
+		 * if so, call dbNextAG() to find a non-busy
-		 * determine if the allocation should be allowed
+		 * AG with sufficient free space.
-		 * to proceed within this AG or should be targeted
-		 * to another AG.
 		 */
-		agno = blkno >> bmp->db_agl2size;
+		if (atomic_read(&bmp->db_active[agno]))
-		if (agno > bmp->db_maxag) {
+			goto pref_ag;
-			agno = dbNextAG(ipbmap);
-			blkno = (s64) agno << bmp->db_agl2size;
-			goto tryag;
-		}
-	}
 	/* check if the allocation request size can be satisfied from a
 	 * single dmap.  if so, try to allocate from the dmap containing
@@ -844,9 +823,8 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 		/* first, try to satisfy the allocation request with the
 		 * blocks beginning at the hint.
 		 */
-		if ((rc =
+		if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
-		     dbAllocNext(bmp, dp, blkno,
+		    != ENOSPC) {
-				 (int) nblocks)) != ENOSPC) {
 			if (rc == 0) {
 				*results = blkno;
 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
@@ -858,12 +836,23 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 			goto read_unlock;
 		}
+		writers = atomic_read(&bmp->db_active[agno]);
+		if ((writers > 1) ||
+		    ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
+			/*
+			 * Someone else is writing in this allocation
+			 * group.  To avoid fragmenting, try another ag
+			 */
+			release_metapage(mp);
+			IREAD_UNLOCK(ipbmap);
+			goto pref_ag;
+		}
 		/* next, try to satisfy the allocation request with blocks
 		 * near the hint.
 		 */
 		if ((rc =
-		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb,
+		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
-				 results))
 		    != ENOSPC) {
 			if (rc == 0) {
 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
@@ -876,10 +865,9 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 		}
 		/* try to satisfy the allocation request with blocks within
-		 * the same allocation group as the hint.
+		 * the same dmap as the hint.
 		 */
-		if ((rc =
+		if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
-		     dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
 		    != ENOSPC) {
 			if (rc == 0) {
 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
@@ -895,14 +883,30 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 		IREAD_UNLOCK(ipbmap);
 	}
-      tryag:
+	/* try to satisfy the allocation request with blocks within
+	 * the same allocation group as the hint.
+	 */
+	IWRITE_LOCK(ipbmap);
+	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results))
+	    != ENOSPC) {
+		if (rc == 0)
+			DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+				*results, nblocks);
+		goto write_unlock;
+	}
+	IWRITE_UNLOCK(ipbmap);
+      pref_ag:
+	/*
+	 * Let dbNextAG recommend a preferred allocation group
+	 */
+	agno = dbNextAG(ipbmap);
 	IWRITE_LOCK(ipbmap);
-	/* determine the allocation group number of the hint and try to
+	/* Try to allocate within this allocation group.  if that fails, try to
-	 * allocate within this allocation group.  if that fails, try to
 	 * allocate anywhere in the map.
 	 */
-	agno = blkno >> bmp->db_agl2size;
 	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == ENOSPC)
 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
 	if (rc == 0) {
@@ -2314,11 +2318,9 @@ static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 	 * if so, establish the new maximum allocation group number by
 	 * searching left for the first allocation group with allocation.
 	 */
-	if ((bmp->db_agfree[agno] == bmp->db_agsize
+	if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
-	     && agno == bmp->db_maxag) || (agno == bmp->db_numag - 1
+	    (agno == bmp->db_numag - 1 &&
-					   && bmp->db_agfree[agno] ==
+	     bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
-					   (bmp-> db_mapsize &
-					    (BPERDMAP - 1)))) {
 		while (bmp->db_maxag > 0) {
 			bmp->db_maxag -= 1;
 			if (bmp->db_agfree[bmp->db_maxag] !=

--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -227,6 +227,7 @@ struct bmap {
 	struct dbmap db_bmap;	/* on-disk aggregate map descriptor */
 	struct inode *db_ipbmap;	/* ptr to aggregate map incore inode */
 	struct semaphore db_bmaplock;	/* aggregate map lock */
+	atomic_t db_active[MAXAG];	/* count of active, open files in AG */
 	u32 *db_DBmap;
 };

--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -514,9 +514,12 @@ int extFill(struct inode *ip, xad_t * xp)
 static int
 extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 {
+	struct jfs_inode_info *ji = JFS_IP(ip);
+	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
 	s64 nb, nblks, daddr, max;
-	int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
+	int rc, nbperpage = sbi->nbperpage;
-	struct bmap *mp = JFS_SBI(ip->i_sb)->bmap;
+	struct bmap *bmp = sbi->bmap;
+	int ag;
 	/* get the number of blocks to initially attempt to allocate.
 	 * we'll first try the number of blocks requested unless this
@@ -524,7 +527,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 	 * blocks in the map. in that case, we'll start off with the 
 	 * maximum free.
 	 */
-	max = (s64) 1 << mp->db_maxfreebud;
+	max = (s64) 1 << bmp->db_maxfreebud;
 	if (*nblocks >= max && *nblocks > nbperpage)
 		nb = nblks = (max > nbperpage) ? max : nbperpage;
 	else
@@ -549,6 +552,18 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 	*nblocks = nb;
 	*blkno = daddr;
+	if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) {
+		ag = BLKTOAG(daddr, sbi);
+		if (ji->active_ag == -1) {
+			atomic_inc(&bmp->db_active[ag]);
+			ji->active_ag = ag;
+		} else if (ji->active_ag != ag) {
+			atomic_dec(&bmp->db_active[ji->active_ag]);
+			atomic_inc(&bmp->db_active[ag]);
+			ji->active_ag = ag;
+		}
+	}
 	return (0);
 }

--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -429,6 +429,7 @@ int diRead(struct inode *ip)
 	/* set the ag for the inode */
 	JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
+	JFS_IP(ip)->active_ag = -1;
 	return (rc);
 }
@@ -1358,6 +1359,7 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
 	DBG_DIALLOC(JFS_IP(ipimap)->i_imap, ip->i_ino);
 	jfs_ip->ixpxd = iagp->inoext[extno];
 	jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+	jfs_ip->active_ag = -1;
 }
@@ -1413,6 +1415,21 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
 	 * moving backward on the disk.)  compute the hint within the
 	 * file system and the iag.
 	 */
+	/* get the ag number of this iag */
+	agno = JFS_IP(pip)->agno;
+	if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
+		/*
+		 * There is an open file actively growing.  We want to
+		 * allocate new inodes from a different ag to avoid
+		 * fragmentation problems.
+		 */
+		agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
+		AG_LOCK(imap, agno);
+		goto tryag;
+	}
 	inum = pip->i_ino + 1;
 	ino = inum & (INOSPERIAG - 1);
@@ -1420,9 +1437,6 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
 	if (ino == 0)
 		inum = pip->i_ino;
-	/* get the ag number of this iag */
-	agno = JFS_IP(pip)->agno;
 	/* lock the AG inode map information */
 	AG_LOCK(imap, agno);

--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -49,7 +49,7 @@ struct jfs_inode_info {
 	long	cflag;		/* commit flags		*/
 	u16	bxflag;		/* xflag of pseudo buffer?	*/
 	unchar	agno;		/* ag number			*/
-	unchar	pad;		/* pad			*/
+	signed char active_ag;	/* ag currently allocating from	*/
 	lid_t	blid;		/* lid of pseudo buffer?	*/
 	lid_t	atlhead;	/* anonymous tlock list head	*/
 	lid_t	atltail;	/* anonymous tlock list tail	*/

--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -406,6 +406,7 @@ static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
 		init_rwsem(&jfs_ip->rdwrlock);
 		init_MUTEX(&jfs_ip->commit_sem);
 		jfs_ip->atlhead = 0;
+		jfs_ip->active_ag = -1;
 		inode_init_once(&jfs_ip->vfs_inode);
 	}
 }