Commit 5717ea4d authored by Dave Chinner's avatar Dave Chinner Committed by Darrick J. Wong

xfs: rework xfs_iflush_cluster() dirty inode iteration

Now that we have all the dirty inodes attached to the cluster
buffer, we don't actually have to do radix tree lookups to find
them. Sure, the radix tree is efficient, but walking a linked list
of just the dirty inodes attached to the buffer is much better.

We are also no longer dependent on having a locked inode passed into
the function to determine where to start the lookup. This means we
can drop it from the function call and treat all inodes the same.

We also make xfs_iflush_cluster skip inodes marked with
XFS_IRECLAIM. This we avoid races with inodes that reclaim is
actively referencing or are being re-initialised by inode lookup. If
they are actually dirty, they'll get written by a future cluster
flush....

We also add a shutdown check after obtaining the flush lock so that
we catch inodes that are dirty in memory and may have inconsistent
state due to the shutdown in progress. We abort these inodes
directly and so they remove themselves directly from the buffer list
and the AIL rather than having to wait for the buffer to be failed
and callbacks run to be processed correctly.
Signed-off-by: default avatarDave Chinner <dchinner@redhat.com>
Reviewed-by: default avatarDarrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: default avatarDarrick J. Wong <darrick.wong@oracle.com>
parent e6187b34
......@@ -3603,141 +3603,120 @@ xfs_iflush(
* locked. The function will walk across all the inodes on the cluster buffer it
* can find and lock without blocking, and flush them to the cluster buffer.
*
* On success, the caller must write out the buffer returned in *bp and
* release it. On failure, the filesystem will be shut down, the buffer will
* have been unlocked and released, and EFSCORRUPTED will be returned.
* On successful flushing of at least one inode, the caller must write out the
* buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
* the caller needs to release the buffer. On failure, the filesystem will be
* shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
* will be returned.
*/
int
xfs_iflush_cluster(
struct xfs_inode *ip,
struct xfs_buf *bp)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
unsigned long first_index, mask;
int cilist_size;
struct xfs_inode **cilist;
struct xfs_inode *cip;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
int error = 0;
int nr_found;
struct xfs_mount *mp = bp->b_mount;
struct xfs_log_item *lip, *n;
struct xfs_inode *ip;
struct xfs_inode_log_item *iip;
int clcount = 0;
int i;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
if (!cilist)
goto out_put;
mask = ~(igeo->inodes_per_cluster - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
rcu_read_lock();
/* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
first_index, igeo->inodes_per_cluster);
if (nr_found == 0)
goto out_free;
for (i = 0; i < nr_found; i++) {
cip = cilist[i];
int error = 0;
/*
* because this is an RCU protected lookup, we could find a
* recently freed or even reallocated inode during the lookup.
* We need to check under the i_flags_lock for a valid inode
* here. Skip it if it is not valid or the wrong inode.
* We must use the safe variant here as on shutdown xfs_iflush_abort()
* can remove itself from the list.
*/
spin_lock(&cip->i_flags_lock);
if (!cip->i_ino ||
__xfs_iflags_test(cip, XFS_ISTALE)) {
spin_unlock(&cip->i_flags_lock);
continue;
}
list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
iip = (struct xfs_inode_log_item *)lip;
ip = iip->ili_inode;
/*
* Once we fall off the end of the cluster, no point checking
* any more inodes in the list because they will also all be
* outside the cluster.
* Quick and dirty check to avoid locks if possible.
*/
if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
spin_unlock(&cip->i_flags_lock);
break;
}
spin_unlock(&cip->i_flags_lock);
if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
continue;
if (xfs_ipincount(ip))
continue;
/*
* Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated
* later after the appropriate locks are acquired.
* The inode is still attached to the buffer, which means it is
* dirty but reclaim might try to grab it. Check carefully for
* that, and grab the ilock while still holding the i_flags_lock
* to guarantee reclaim will not be able to reclaim this inode
* once we drop the i_flags_lock.
*/
if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
spin_lock(&ip->i_flags_lock);
ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
spin_unlock(&ip->i_flags_lock);
continue;
}
/*
* Try to get locks. If any are unavailable or it is pinned,
* then this inode cannot be flushed and is skipped.
* ILOCK will pin the inode against reclaim and prevent
* concurrent transactions modifying the inode while we are
* flushing the inode.
*/
if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
continue;
if (!xfs_iflock_nowait(cip)) {
xfs_iunlock(cip, XFS_ILOCK_SHARED);
continue;
}
if (xfs_ipincount(cip)) {
xfs_ifunlock(cip);
xfs_iunlock(cip, XFS_ILOCK_SHARED);
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
spin_unlock(&ip->i_flags_lock);
continue;
}
spin_unlock(&ip->i_flags_lock);
/*
* Check the inode number again, just to be certain we are not
* racing with freeing in xfs_reclaim_inode(). See the comments
* in that function for more information as to why the initial
* check is not sufficient.
* Skip inodes that are already flush locked as they have
* already been written to the buffer.
*/
if (!cip->i_ino) {
xfs_ifunlock(cip);
xfs_iunlock(cip, XFS_ILOCK_SHARED);
if (!xfs_iflock_nowait(ip)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
continue;
}
/*
* arriving here means that this inode can be flushed. First
* re-check that it's dirty before flushing.
* Abort flushing this inode if we are shut down because the
* inode may not currently be in the AIL. This can occur when
* log I/O failure unpins the inode without inserting into the
* AIL, leaving a dirty/unpinned inode attached to the buffer
* that otherwise looks like it should be flushed.
*/
if (!xfs_inode_clean(cip)) {
error = xfs_iflush(cip, bp);
if (error) {
xfs_iunlock(cip, XFS_ILOCK_SHARED);
goto out_free;
}
clcount++;
} else {
xfs_ifunlock(cip);
if (XFS_FORCED_SHUTDOWN(mp)) {
xfs_iunpin_wait(ip);
/* xfs_iflush_abort() drops the flush lock */
xfs_iflush_abort(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
error = -EIO;
continue;
}
xfs_iunlock(cip, XFS_ILOCK_SHARED);
/* don't block waiting on a log force to unpin dirty inodes */
if (xfs_ipincount(ip)) {
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
continue;
}
if (clcount) {
XFS_STATS_INC(mp, xs_icluster_flushcnt);
XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
if (!xfs_inode_clean(ip))
error = xfs_iflush(ip, bp);
else
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (error)
break;
clcount++;
}
out_free:
rcu_read_unlock();
kmem_free(cilist);
out_put:
xfs_perag_put(pag);
if (error) {
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
return error;
}
if (!clcount)
return -EAGAIN;
XFS_STATS_INC(mp, xs_icluster_flushcnt);
XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
return 0;
}
/* Release an inode. */
......
......@@ -426,7 +426,7 @@ int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
int xfs_iflush_cluster(struct xfs_inode *, struct xfs_buf *);
int xfs_iflush_cluster(struct xfs_buf *);
void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
struct xfs_inode *ip1, uint ip1_mode);
......
......@@ -507,12 +507,18 @@ xfs_inode_item_push(
* reference for IO until we queue the buffer for delwri submission.
*/
xfs_buf_hold(bp);
error = xfs_iflush_cluster(ip, bp);
error = xfs_iflush_cluster(bp);
if (!error) {
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
} else {
/*
* Release the buffer if we were unable to flush anything. On
* any other error, the buffer has already been released.
*/
if (error == -EAGAIN)
xfs_buf_relse(bp);
rval = XFS_ITEM_LOCKED;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment