Commit 8c3c0743 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'xfs-5.7-merge-12' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull more xfs updates from Darrick Wong:
 "As promised last week, this batch changes how xfs interacts with
  memory reclaim; how the log batches and throttles log items; how hard
  writes near ENOSPC will try to squeeze more space out of the
  filesystem; and hopefully fix the last of the umount hangs after a
  catastrophic failure.

  Summary:

   - Validate the realtime geometry in the superblock when mounting

   - Refactor a bunch of tricky flag handling in the log code

   - Flush the CIL more judiciously so that we don't wait until there
     are millions of log items consuming a lot of memory.

   - Throttle transaction commits to prevent the xfs frontend from
     flooding the CIL with too many log items.

   - Account metadata buffers correctly for memory reclaim.

   - Mark slabs properly for memory reclaim. These should help reclaim
     run more effectively when XFS is using a lot of memory.

   - Don't write a garbage log record at unmount time if we're trying to
     trigger summary counter recalculation at next mount.

   - Don't block the AIL on locked dquot/inode buffers; instead trigger
     its backoff mechanism to give the lock holder a chance to finish
     up.

   - Ratelimit writeback flushing when buffered writes encounter ENOSPC.

   - Other minor cleanups.

   - Make reflink a synchronous operation when the fs is mounted with
     wsync or sync, which means that now we force the log to disk to
     record the changes"

* tag 'xfs-5.7-merge-12' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (26 commits)
  xfs: reflink should force the log out if mounted with wsync
  xfs: factor out a new xfs_log_force_inode helper
  xfs: fix inode number overflow in ifree cluster helper
  xfs: remove redundant variable assignment in xfs_symlink()
  xfs: ratelimit inode flush on buffered write ENOSPC
  xfs: return locked status of inode buffer on xfsaild push
  xfs: trylock underlying buffer on dquot flush
  xfs: remove unnecessary ternary from xfs_create
  xfs: don't write a corrupt unmount record to force summary counter recalc
  xfs: factor inode lookup from xfs_ifree_cluster
  xfs: tail updates only need to occur when LSN changes
  xfs: factor common AIL item deletion code
  xfs: correctly acount for reclaimable slabs
  xfs: Improve metadata buffer reclaim accountability
  xfs: don't allow log IO to be throttled
  xfs: Throttle commits on delayed background CIL push
  xfs: Lower CIL flush limit for large logs
  xfs: remove some stale comments from the log code
  xfs: refactor unmount record writing
  xfs: merge xlog_commit_record with xlog_write_done
  ...
parents d3e5e977 5833112d
......@@ -328,6 +328,38 @@ xfs_validate_sb_common(
return -EFSCORRUPTED;
}
/* Validate the realtime geometry; stolen from xfs_repair */
if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
xfs_notice(mp,
"realtime extent sanity check failed");
return -EFSCORRUPTED;
}
if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
xfs_notice(mp,
"realtime zeroed geometry check failed");
return -EFSCORRUPTED;
}
} else {
uint64_t rexts;
uint64_t rbmblocks;
rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
rbmblocks = howmany_64(sbp->sb_rextents,
NBBY * sbp->sb_blocksize);
if (sbp->sb_rextents != rexts ||
sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
sbp->sb_rbmblocks != rbmblocks) {
xfs_notice(mp,
"realtime geometry sanity check failed");
return -EFSCORRUPTED;
}
}
if (sbp->sb_unit) {
if (!xfs_sb_version_hasdalign(sbp) ||
sbp->sb_unit > sbp->sb_width ||
......
......@@ -327,6 +327,9 @@ xfs_buf_free(
__free_page(page);
}
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab +=
bp->b_page_count;
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
......@@ -2114,9 +2117,11 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
xfs_buf_zone = kmem_cache_create("xfs_buf",
sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN, NULL);
xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD,
NULL);
if (!xfs_buf_zone)
goto out;
......
......@@ -1105,8 +1105,8 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
&bp, &xfs_dquot_buf_ops);
if (error)
goto out_unlock;
......@@ -1177,7 +1177,7 @@ xfs_qm_dqflush(
out_unlock:
xfs_dqfunlock(dqp);
return -EIO;
return error;
}
/*
......
......@@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
}
} else if (error == -EAGAIN)
rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
......
......@@ -15,7 +15,6 @@
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_pnfs.h"
/*
......@@ -221,18 +220,7 @@ STATIC int
xfs_fs_nfs_commit_metadata(
struct inode *inode)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_lsn_t lsn = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!lsn)
return 0;
return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
return xfs_log_force_inode(XFS_I(inode));
}
const struct export_operations xfs_export_operations = {
......
......@@ -80,19 +80,9 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!lsn)
return 0;
return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
return xfs_log_force_inode(ip);
}
STATIC int
......@@ -1069,7 +1059,11 @@ xfs_file_remap_range(
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
if (ret)
goto out_unlock;
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_log_force_inode(dest);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
......
......@@ -1200,8 +1200,7 @@ xfs_create(
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
resblks ?
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
resblks - XFS_IALLOC_SPACE_RES(mp));
if (error) {
ASSERT(error != -ENOSPC);
goto out_trans_cancel;
......@@ -2503,6 +2502,88 @@ xfs_iunlink_remove(
return error;
}
/*
* Look up the inode number specified and mark it stale if it is found. If it is
* dirty, return the inode so it can be attached to the cluster buffer so it can
* be processed appropriately when the cluster free transaction completes.
*/
static struct xfs_inode *
xfs_ifree_get_one_inode(
struct xfs_perag *pag,
struct xfs_inode *free_ip,
xfs_ino_t inum)
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_inode *ip;
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
/* Inode not in memory, nothing to do */
if (!ip)
goto out_rcu_unlock;
/*
* because this is an RCU protected lookup, we could find a recently
* freed or even reallocated inode during the lookup. We need to check
* under the i_flags_lock for a valid inode here. Skip it if it is not
* valid, the wrong inode or stale.
*/
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
spin_unlock(&ip->i_flags_lock);
goto out_rcu_unlock;
}
spin_unlock(&ip->i_flags_lock);
/*
* Don't try to lock/unlock the current inode, but we _cannot_ skip the
* other inodes that we did not find in the list attached to the buffer
* and are not already marked stale. If we can't lock it, back off and
* retry.
*/
if (ip != free_ip) {
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
rcu_read_unlock();
delay(1);
goto retry;
}
/*
* Check the inode number again in case we're racing with
* freeing in xfs_reclaim_inode(). See the comments in that
* function for more information as to why the initial check is
* not sufficient.
*/
if (ip->i_ino != inum) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto out_rcu_unlock;
}
}
rcu_read_unlock();
xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE);
/*
* We don't need to attach clean inodes or those only with unlogged
* changes (which we throw away, anyway).
*/
if (!ip->i_itemp || xfs_inode_clean(ip)) {
ASSERT(ip != free_ip);
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto out_no_inode;
}
return ip;
out_rcu_unlock:
rcu_read_unlock();
out_no_inode:
return NULL;
}
/*
* A big issue when freeing the inode cluster is that we _cannot_ skip any
* inodes that are in memory - they all must be marked stale and attached to
......@@ -2603,77 +2684,11 @@ xfs_ifree_cluster(
* even trying to lock them.
*/
for (i = 0; i < igeo->inodes_per_cluster; i++) {
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i)));
/* Inode not in memory, nothing to do */
if (!ip) {
rcu_read_unlock();
ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
if (!ip)
continue;
}
/*
* because this is an RCU protected lookup, we could
* find a recently freed or even reallocated inode
* during the lookup. We need to check under the
* i_flags_lock for a valid inode here. Skip it if it
* is not valid, the wrong inode or stale.
*/
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != inum + i ||
__xfs_iflags_test(ip, XFS_ISTALE)) {
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
continue;
}
spin_unlock(&ip->i_flags_lock);
/*
* Don't try to lock/unlock the current inode, but we
* _cannot_ skip the other inodes that we did not find
* in the list attached to the buffer and are not
* already marked stale. If we can't lock it, back off
* and retry.
*/
if (ip != free_ip) {
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
rcu_read_unlock();
delay(1);
goto retry;
}
/*
* Check the inode number again in case we're
* racing with freeing in xfs_reclaim_inode().
* See the comments in that function for more
* information as to why the initial check is
* not sufficient.
*/
if (ip->i_ino != inum + i) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
rcu_read_unlock();
continue;
}
}
rcu_read_unlock();
xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE);
/*
* we don't need to attach clean inodes or those only
* with unlogged changes (which we throw away, anyway).
*/
iip = ip->i_itemp;
if (!iip || xfs_inode_clean(ip)) {
ASSERT(ip != free_ip);
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
......@@ -3930,3 +3945,22 @@ xfs_irele(
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
/*
* Ensure all commited transactions touching the inode are written to the log.
*/
int
xfs_log_force_inode(
struct xfs_inode *ip)
{
xfs_lsn_t lsn = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!lsn)
return 0;
return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
}
......@@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **,
struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
......
......@@ -552,7 +552,8 @@ xfs_inode_item_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
}
} else if (error == -EAGAIN)
rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
......@@ -730,29 +731,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
bool mlip_changed = false;
xfs_lsn_t tail_lsn = 0;
/* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->ail_lock);
list_for_each_entry(blip, &tmp, li_bio_list) {
if (INODE_ITEM(blip)->ili_logged &&
blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
mlip_changed |= xfs_ail_delete_one(ailp, blip);
else {
blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
/*
* xfs_ail_update_finish() only cares about the
* lsn of the first tail item removed, any
* others will be at the same or higher lsn so
* we just ignore them.
*/
xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
if (!tail_lsn && lsn)
tail_lsn = lsn;
} else {
xfs_clear_li_failed(blip);
}
}
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
xlog_assign_tail_lsn_locked(ailp->ail_mount);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
}
spin_unlock(&ailp->ail_lock);
if (mlip_changed)
xfs_log_space_wake(ailp->ail_mount);
xfs_ail_update_finish(ailp, tail_lsn);
}
/*
......
......@@ -24,13 +24,6 @@
kmem_zone_t *xfs_log_ticket_zone;
/* Local miscellaneous function prototypes */
STATIC int
xlog_commit_record(
struct xlog *log,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
xfs_lsn_t *commitlsnp);
STATIC struct xlog *
xlog_alloc_log(
struct xfs_mount *mp,
......@@ -66,14 +59,6 @@ xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
STATIC void
xlog_regrant_reserve_log_space(
struct xlog *log,
struct xlog_ticket *ticket);
STATIC void
xlog_ungrant_log_space(
struct xlog *log,
struct xlog_ticket *ticket);
STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog);
......@@ -478,73 +463,6 @@ xfs_log_reserve(
return error;
}
/*
* NOTES:
*
* 1. currblock field gets updated at startup and after in-core logs
* marked as with WANT_SYNC.
*/
/*
* This routine is called when a user of a log manager ticket is done with
* the reservation. If the ticket was ever used, then a commit record for
* the associated transaction is written out as a log operation header with
* no data. The flag XLOG_TIC_INITED is set when the first write occurs with
* a given ticket. If the ticket was one with a permanent reservation, then
* a few operations are done differently. Permanent reservation tickets by
* default don't release the reservation. They just commit the current
* transaction with the belief that the reservation is still needed. A flag
* must be passed in before permanent reservations are actually released.
* When these type of tickets are not released, they need to be set into
* the inited state again. By doing this, a start record will be written
* out when the next write occurs.
*/
xfs_lsn_t
xfs_log_done(
struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
bool regrant)
{
struct xlog *log = mp->m_log;
xfs_lsn_t lsn = 0;
if (XLOG_FORCED_SHUTDOWN(log) ||
/*
* If nothing was ever written, don't write out commit record.
* If we get an error, just continue and give back the log ticket.
*/
(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
(xlog_commit_record(log, ticket, iclog, &lsn)))) {
lsn = (xfs_lsn_t) -1;
regrant = false;
}
if (!regrant) {
trace_xfs_log_done_nonperm(log, ticket);
/*
* Release ticket if not permanent reservation or a specific
* request has been made to release a permanent reservation.
*/
xlog_ungrant_log_space(log, ticket);
} else {
trace_xfs_log_done_perm(log, ticket);
xlog_regrant_reserve_log_space(log, ticket);
/* If this ticket was a permanent reservation and we aren't
* trying to release it, reset the inited flags; so next time
* we write, a start record will be written out.
*/
ticket->t_flags |= XLOG_TIC_INITED;
}
xfs_log_ticket_put(ticket);
return lsn;
}
static bool
__xlog_state_release_iclog(
struct xlog *log,
......@@ -869,32 +787,44 @@ xlog_wait_on_iclog(
}
/*
* Final log writes as part of unmount.
*
* Mark the filesystem clean as unmount happens. Note that during relocation
* this routine needs to be executed as part of source-bag while the
* deallocation must not be done until source-end.
* Write out an unmount record using the ticket provided. We have to account for
* the data space used in the unmount ticket as this write is not done from a
* transaction context that has already done the accounting for us.
*/
/* Actually write the unmount record to disk. */
static void
xfs_log_write_unmount_record(
struct xfs_mount *mp)
static int
xlog_write_unmount_record(
struct xlog *log,
struct xlog_ticket *ticket,
xfs_lsn_t *lsn,
uint flags)
{
/* the data section must be 32 bit size aligned */
struct xfs_unmount_log_format magic = {
struct xfs_unmount_log_format ulf = {
.magic = XLOG_UNMOUNT_TYPE,
};
struct xfs_log_iovec reg = {
.i_addr = &magic,
.i_len = sizeof(magic),
.i_addr = &ulf,
.i_len = sizeof(ulf),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
struct xlog *log = mp->m_log;
/* account for space used by record data */
ticket->t_curr_res -= sizeof(ulf);
return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
}
/*
* Mark the filesystem clean by writing an unmount record to the head of the
* log.
*/
static void
xlog_unmount_write(
struct xlog *log)
{
struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
xfs_lsn_t lsn;
......@@ -905,23 +835,7 @@ xfs_log_write_unmount_record(
if (error)
goto out_err;
/*
* If we think the summary counters are bad, clear the unmount header
* flag in the unmount record so that the summary counters will be
* recalculated during log recovery at next mount. Refer to
* xlog_check_unmount_rec for more details.
*/
if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
xfs_alert(mp, "%s: will fix summary counters at next mount",
__func__);
flags &= ~XLOG_UNMOUNT_TRANS;
}
/* remove inited flag, and account for space used */
tic->t_flags = 0;
tic->t_curr_res -= sizeof(magic);
error = xlog_write(log, &vec, tic, &lsn, NULL, flags);
error = xlog_write_unmount_record(log, tic, &lsn, flags);
/*
* At this point, we're umounting anyway, so there's no point in
* transitioning log state to IOERROR. Just continue...
......@@ -943,8 +857,7 @@ xfs_log_write_unmount_record(
if (tic) {
trace_xfs_log_umount_write(log, tic);
xlog_ungrant_log_space(log, tic);
xfs_log_ticket_put(tic);
xfs_log_ticket_ungrant(log, tic);
}
}
......@@ -987,8 +900,22 @@ xfs_log_unmount_write(
if (XLOG_FORCED_SHUTDOWN(log))
return;
/*
* If we think the summary counters are bad, avoid writing the unmount
* record to force log recovery at next mount, after which the summary
* counters will be recalculated. Refer to xlog_check_unmount_rec for
* more details.
*/
if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
xfs_alert(mp, "%s: will fix summary counters at next mount",
__func__);
return;
}
xfs_log_unmount_verify_iclog(log);
xfs_log_write_unmount_record(mp);
xlog_unmount_write(log);
}
/*
......@@ -1515,20 +1442,17 @@ xlog_alloc_log(
return ERR_PTR(error);
} /* xlog_alloc_log */
/*
* Write out the commit record of a transaction associated with the given
* ticket. Return the lsn of the commit record.
* ticket to close off a running log write. Return the lsn of the commit record.
*/
STATIC int
int
xlog_commit_record(
struct xlog *log,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
xfs_lsn_t *commitlsnp)
xfs_lsn_t *lsn)
{
struct xfs_mount *mp = log->l_mp;
int error;
struct xfs_log_iovec reg = {
.i_addr = NULL,
.i_len = 0,
......@@ -1538,12 +1462,15 @@ xlog_commit_record(
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
int error;
ASSERT_ALWAYS(iclog);
error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
XLOG_COMMIT_TRANS);
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
false);
if (error)
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
return error;
}
......@@ -1761,7 +1688,15 @@ xlog_write_iclog(
iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
iclog->ic_bio.bi_end_io = xlog_bio_end_io;
iclog->ic_bio.bi_private = iclog;
iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
/*
* We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
* IOs coming immediately after this one. This prevents the block layer
* writeback throttle from throttling log writes behind background
* metadata writeback and causing priority inversions.
*/
iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
REQ_IDLE | REQ_FUA;
if (need_flush)
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
......@@ -1981,7 +1916,7 @@ xlog_dealloc_log(
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
} /* xlog_dealloc_log */
}
/*
* Update counters atomically now that memcpy is done.
......@@ -2118,23 +2053,21 @@ xlog_print_trans(
}
/*
* Calculate the potential space needed by the log vector. Each region gets
* its own xlog_op_header_t and may need to be double word aligned.
* Calculate the potential space needed by the log vector. We may need a start
* record, and each region gets its own struct xlog_op_header and may need to be
* double word aligned.
*/
static int
xlog_write_calc_vec_length(
struct xlog_ticket *ticket,
struct xfs_log_vec *log_vector)
struct xfs_log_vec *log_vector,
bool need_start_rec)
{
struct xfs_log_vec *lv;
int headers = 0;
int headers = need_start_rec ? 1 : 0;
int len = 0;
int i;
/* acct for start rec of xact */
if (ticket->t_flags & XLOG_TIC_INITED)
headers++;
for (lv = log_vector; lv; lv = lv->lv_next) {
/* we don't write ordered log vectors */
if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
......@@ -2156,27 +2089,16 @@ xlog_write_calc_vec_length(
return len;
}
/*
* If first write for transaction, insert start record We can't be trying to
* commit if we are inited. We can't have any "partial_copy" if we are inited.
*/
static int
static void
xlog_write_start_rec(
struct xlog_op_header *ophdr,
struct xlog_ticket *ticket)
{
if (!(ticket->t_flags & XLOG_TIC_INITED))
return 0;
ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
ophdr->oh_clientid = ticket->t_clientid;
ophdr->oh_len = 0;
ophdr->oh_flags = XLOG_START_TRANS;
ophdr->oh_res2 = 0;
ticket->t_flags &= ~XLOG_TIC_INITED;
return sizeof(struct xlog_op_header);
}
static xlog_op_header_t *
......@@ -2365,13 +2287,14 @@ xlog_write(
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
uint flags)
uint flags,
bool need_start_rec)
{
struct xlog_in_core *iclog = NULL;
struct xfs_log_iovec *vecp;
struct xfs_log_vec *lv;
struct xfs_log_vec *lv = log_vector;
struct xfs_log_iovec *vecp = lv->lv_iovecp;
int index = 0;
int len;
int index;
int partial_copy = 0;
int partial_copy_len = 0;
int contwr = 0;
......@@ -2379,25 +2302,13 @@ xlog_write(
int data_cnt = 0;
int error = 0;
*start_lsn = 0;
len = xlog_write_calc_vec_length(ticket, log_vector);
/*
* Region headers and bytes are already accounted for.
* We only need to take into account start records and
* split regions in this function.
* If this is a commit or unmount transaction, we don't need a start
* record to be written. We do, however, have to account for the
* commit or unmount header that gets written. Hence we always have
* to account for an extra xlog_op_header here.
*/
if (ticket->t_flags & XLOG_TIC_INITED)
ticket->t_curr_res -= sizeof(xlog_op_header_t);
/*
* Commit record headers need to be accounted for. These
* come in as separate writes so are easy to detect.
*/
if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
ticket->t_curr_res -= sizeof(xlog_op_header_t);
ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
......@@ -2405,9 +2316,8 @@ xlog_write(
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
index = 0;
lv = log_vector;
vecp = lv->lv_iovecp;
len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
*start_lsn = 0;
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
......@@ -2431,7 +2341,6 @@ xlog_write(
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
int start_rec_copy;
int copy_len;
int copy_off;
bool ordered = false;
......@@ -2447,11 +2356,15 @@ xlog_write(
ASSERT(reg->i_len % sizeof(int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
start_rec_copy = xlog_write_start_rec(ptr, ticket);
if (start_rec_copy) {
record_cnt++;
/*
* Before we start formatting log vectors, we need to
* write a start record. Only do this for the first
* iclog we write to.
*/
if (need_start_rec) {
xlog_write_start_rec(ptr, ticket);
xlog_write_adv_cnt(&ptr, &len, &log_offset,
start_rec_copy);
sizeof(struct xlog_op_header));
}
ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
......@@ -2483,8 +2396,13 @@ xlog_write(
xlog_write_adv_cnt(&ptr, &len, &log_offset,
copy_len);
}
copy_len += start_rec_copy + sizeof(xlog_op_header_t);
copy_len += sizeof(struct xlog_op_header);
record_cnt++;
if (need_start_rec) {
copy_len += sizeof(struct xlog_op_header);
record_cnt++;
need_start_rec = false;
}
data_cnt += contwr ? copy_len : 0;
error = xlog_write_copy_finish(log, iclog, flags,
......@@ -2541,14 +2459,6 @@ xlog_write(
return error;
}
/*****************************************************************************
*
* State Machine functions
*
*****************************************************************************
*/
static void
xlog_state_activate_iclog(
struct xlog_in_core *iclog,
......@@ -2909,7 +2819,7 @@ xlog_state_done_syncing(
*/
wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
xlog_state_do_callback(log); /* also cleans log */
xlog_state_do_callback(log);
}
/*
......@@ -3029,21 +2939,21 @@ xlog_state_get_iclog_space(
*logoffsetp = log_offset;
return 0;
} /* xlog_state_get_iclog_space */
}
/* The first cnt-1 times through here we don't need to
* move the grant write head because the permanent
* reservation has reserved cnt times the unit amount.
* Release part of current permanent unit reservation and
* reset current reservation to be one units worth. Also
* move grant reservation head forward.
/*
* The first cnt-1 times a ticket goes through here we don't need to move the
* grant write head because the permanent reservation has reserved cnt times the
* unit amount. Release part of current permanent unit reservation and reset
* current reservation to be one units worth. Also move grant reservation head
* forward.
*/
STATIC void
xlog_regrant_reserve_log_space(
void
xfs_log_ticket_regrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
trace_xfs_log_regrant_reserve_enter(log, ticket);
trace_xfs_log_ticket_regrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
......@@ -3055,21 +2965,20 @@ xlog_regrant_reserve_log_space(
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
trace_xfs_log_regrant_reserve_sub(log, ticket);
trace_xfs_log_ticket_regrant_sub(log, ticket);
/* just return if we still have some of the pre-reserved space */
if (ticket->t_cnt > 0)
return;
if (!ticket->t_cnt) {
xlog_grant_add_space(log, &log->l_reserve_head.grant,
ticket->t_unit_res);
trace_xfs_log_regrant_reserve_exit(log, ticket);
trace_xfs_log_ticket_regrant_exit(log, ticket);
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
} /* xlog_regrant_reserve_log_space */
}
xfs_log_ticket_put(ticket);
}
/*
* Give back the space left from a reservation.
......@@ -3085,18 +2994,19 @@ xlog_regrant_reserve_log_space(
* space, the count will stay at zero and the only space remaining will be
* in the current reservation field.
*/
STATIC void
xlog_ungrant_log_space(
void
xfs_log_ticket_ungrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
int bytes;
trace_xfs_log_ticket_ungrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
trace_xfs_log_ungrant_enter(log, ticket);
trace_xfs_log_ungrant_sub(log, ticket);
trace_xfs_log_ticket_ungrant_sub(log, ticket);
/*
* If this is a permanent reservation ticket, we may be able to free
......@@ -3111,18 +3021,15 @@ xlog_ungrant_log_space(
xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
trace_xfs_log_ungrant_exit(log, ticket);
trace_xfs_log_ticket_ungrant_exit(log, ticket);
xfs_log_space_wake(log->l_mp);
xfs_log_ticket_put(ticket);
}
/*
* Mark the current iclog in the ring as WANT_SYNC and move the current iclog
* pointer to the next iclog in the ring.
*
* When called from xlog_state_get_iclog_space(), the exact size of the iclog
* has not yet been determined, all we know is that we have run out of space in
* the current iclog.
* This routine will mark the current iclog in the ring as WANT_SYNC and move
* the current iclog pointer to the next iclog in the ring.
*/
STATIC void
xlog_state_switch_iclogs(
......@@ -3167,7 +3074,7 @@ xlog_state_switch_iclogs(
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
} /* xlog_state_switch_iclogs */
}
/*
* Write out all data in the in-core log as of this exact moment in time.
......@@ -3374,13 +3281,6 @@ xfs_log_force_lsn(
return ret;
}
/*****************************************************************************
*
* TICKET functions
*
*****************************************************************************
*/
/*
* Free a used ticket when its refcount falls to zero.
*/
......@@ -3529,7 +3429,6 @@ xlog_ticket_alloc(
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
tic->t_clientid = client;
tic->t_flags = XLOG_TIC_INITED;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
......@@ -3538,13 +3437,6 @@ xlog_ticket_alloc(
return tic;
}
/******************************************************************************
*
* Log debug routines
*
******************************************************************************
*/
#if defined(DEBUG)
/*
* Make sure that the destination ptr is within the valid data region of
......@@ -3630,7 +3522,7 @@ xlog_verify_tail_lsn(
if (blocks < BTOBB(iclog->ic_offset) + 1)
xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
}
} /* xlog_verify_tail_lsn */
}
/*
* Perform a number of checks on the iclog before writing to disk.
......@@ -3733,7 +3625,7 @@ xlog_verify_iclog(
}
ptr += sizeof(xlog_op_header_t) + op_len;
}
} /* xlog_verify_iclog */
}
#endif
/*
......
......@@ -105,10 +105,6 @@ struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
bool regrant);
int xfs_log_force(struct xfs_mount *mp, uint flags);
int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
int *log_forced);
......
......@@ -668,6 +668,11 @@ xlog_cil_push_work(
push_seq = cil->xc_push_seq;
ASSERT(push_seq <= ctx->sequence);
/*
* Wake up any background push waiters now this context is being pushed.
*/
wake_up_all(&ctx->push_wait);
/*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
......@@ -744,6 +749,7 @@ xlog_cil_push_work(
*/
INIT_LIST_HEAD(&new_ctx->committing);
INIT_LIST_HEAD(&new_ctx->busy_extents);
init_waitqueue_head(&new_ctx->push_wait);
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
......@@ -801,7 +807,7 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);
if (error)
goto out_abort_free_ticket;
......@@ -839,10 +845,11 @@ xlog_cil_push_work(
}
spin_unlock(&cil->xc_push_lock);
/* xfs_log_done always frees the ticket on error. */
commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
if (commit_lsn == -1)
goto out_abort;
error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
if (error)
goto out_abort_free_ticket;
xfs_log_ticket_ungrant(log, tic);
spin_lock(&commit_iclog->ic_callback_lock);
if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
......@@ -875,7 +882,7 @@ xlog_cil_push_work(
return;
out_abort_free_ticket:
xfs_log_ticket_put(tic);
xfs_log_ticket_ungrant(log, tic);
out_abort:
ASSERT(XLOG_FORCED_SHUTDOWN(log));
xlog_cil_committed(ctx);
......@@ -890,7 +897,7 @@ xlog_cil_push_work(
*/
static void
xlog_cil_push_background(
struct xlog *log)
struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
......@@ -904,14 +911,36 @@ xlog_cil_push_background(
* don't do a background push if we haven't used up all the
* space available yet.
*/
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
up_read(&cil->xc_ctx_lock);
return;
}
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
/*
* Drop the context lock now, we can't hold that if we need to sleep
* because we are over the blocking threshold. The push_lock is still
* held, so blocking threshold sleep/wakeup is still correctly
* serialised here.
*/
up_read(&cil->xc_ctx_lock);
/*
* If we are well over the space limit, throttle the work that is being
* done until the push work on this context has begun.
*/
if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
ASSERT(cil->xc_ctx->space_used < log->l_logsize);
xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
return;
}
spin_unlock(&cil->xc_push_lock);
}
......@@ -1007,7 +1036,10 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = xc_commit_lsn;
xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (regrant && !XLOG_FORCED_SHUTDOWN(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
else
xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
......@@ -1028,9 +1060,9 @@ xfs_log_commit_cil(
if (lip->li_ops->iop_committing)
lip->li_ops->iop_committing(lip, xc_commit_lsn);
}
xlog_cil_push_background(log);
up_read(&cil->xc_ctx_lock);
/* xlog_cil_push_background() releases cil->xc_ctx_lock */
xlog_cil_push_background(log);
}
/*
......@@ -1189,6 +1221,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
init_waitqueue_head(&ctx->push_wait);
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
......
......@@ -51,13 +51,11 @@ enum xlog_iclog_state {
};
/*
* Flags to log ticket
* Log ticket flags
*/
#define XLOG_TIC_INITED 0x1 /* has been initialized */
#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
#define XLOG_TIC_FLAGS \
{ XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
/*
......@@ -242,6 +240,7 @@ struct xfs_cil_ctx {
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
wait_queue_head_t push_wait; /* background push throttle */
struct work_struct discard_endio_work;
};
......@@ -318,13 +317,53 @@ struct xfs_cil {
* tries to keep 25% of the log free, so we need to keep below that limit or we
* risk running out of free log space to start any new transactions.
*
* In order to keep background CIL push efficient, we will set a lower
* threshold at which background pushing is attempted without blocking current
* transaction commits. A separate, higher bound defines when CIL pushes are
* enforced to ensure we stay within our maximum checkpoint size bounds.
* threshold, yet give us plenty of space for aggregation on large logs.
* In order to keep background CIL push efficient, we only need to ensure the
* CIL is large enough to maintain sufficient in-memory relogging to avoid
* repeated physical writes of frequently modified metadata. If we allow the CIL
* to grow to a substantial fraction of the log, then we may be pinning hundreds
* of megabytes of metadata in memory until the CIL flushes. This can cause
* issues when we are running low on memory - pinned memory cannot be reclaimed,
* and the CIL consumes a lot of memory. Hence we need to set an upper physical
* size limit for the CIL that limits the maximum amount of memory pinned by the
* CIL but does not limit performance by reducing relogging efficiency
* significantly.
*
* As such, the CIL push threshold ends up being the smaller of two thresholds:
* - a threshold large enough that it allows CIL to be pushed and progress to be
* made without excessive blocking of incoming transaction commits. This is
* defined to be 12.5% of the log space - half the 25% push threshold of the
* AIL.
* - small enough that it doesn't pin excessive amounts of memory but maintains
* close to peak relogging efficiency. This is defined to be 16x the iclog
* buffer window (32MB) as measurements have shown this to be roughly the
* point of diminishing performance increases under highly concurrent
* modification workloads.
*
* To prevent the CIL from overflowing upper commit size bounds, we introduce a
* new threshold at which we block committing transactions until the background
* CIL commit commences and switches to a new context. While this is not a hard
* limit, it forces the process committing a transaction to the CIL to block and
* yeild the CPU, giving the CIL push work a chance to be scheduled and start
* work. This prevents a process running lots of transactions from overfilling
* the CIL because it is not yielding the CPU. We set the blocking limit at
* twice the background push space threshold so we keep in line with the AIL
* push thresholds.
*
* Note: this is not a -hard- limit as blocking is applied after the transaction
* is inserted into the CIL and the push has been triggered. It is largely a
* throttling mechanism that allows the CIL push to be scheduled and run. A hard
* limit will be difficult to implement without introducing global serialisation
* in the CIL commit fast path, and it's not at all clear that we actually need
* such hard limits given the ~7 years we've run without a hard limit before
* finding the first situation where a checkpoint size overflow actually
* occurred. Hence the simple throttle, and an ASSERT check to tell us that
* we've overrun the max size.
*/
#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
#define XLOG_CIL_SPACE_LIMIT(log) \
min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
(XLOG_CIL_SPACE_LIMIT(log) * 2)
/*
* ticket grant locks, queues and accounting have their own cachlines
......@@ -439,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int
xlog_write(
struct xlog *log,
struct xfs_log_vec *log_vector,
struct xlog_ticket *tic,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
uint flags);
int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog, uint flags,
bool need_start_rec);
int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
struct xlog_in_core **iclog, xfs_lsn_t *lsn);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
......
......@@ -167,6 +167,7 @@ typedef struct xfs_mount {
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
struct ratelimit_state m_flush_inodes_ratelimit;
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
......
......@@ -121,12 +121,11 @@ xfs_qm_dqpurge(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
int error = -EAGAIN;
xfs_dqlock(dqp);
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
xfs_dqunlock(dqp);
return -EAGAIN;
}
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
goto out_unlock;
dqp->dq_flags |= XFS_DQ_FREEING;
......@@ -139,7 +138,6 @@ xfs_qm_dqpurge(
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
int error;
/*
* We don't care about getting disk errors here. We need
......@@ -149,6 +147,8 @@ xfs_qm_dqpurge(
if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
} else if (error == -EAGAIN) {
goto out_unlock;
}
xfs_dqflock(dqp);
}
......@@ -174,6 +174,10 @@ xfs_qm_dqpurge(
xfs_qm_dqdestroy(dqp);
return 0;
out_unlock:
xfs_dqunlock(dqp);
return error;
}
/*
......
......@@ -528,6 +528,9 @@ xfs_flush_inodes(
{
struct super_block *sb = mp->m_super;
if (!__ratelimit(&mp->m_flush_inodes_ratelimit))
return;
if (down_read_trylock(&sb->s_umount)) {
sync_inodes_sb(sb);
up_read(&sb->s_umount);
......@@ -1366,6 +1369,17 @@ xfs_fc_fill_super(
if (error)
goto out_free_names;
/*
* Cap the number of invocations of xfs_flush_inodes to 16 for every
* quarter of a second. The magic numbers here were determined by
* observation neither to cause stalls in writeback when there are a
* lot of IO threads and the fs is near ENOSPC, nor cause any fstest
* regressions. YMMV.
*/
ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16);
ratelimit_set_flags(&mp->m_flush_inodes_ratelimit,
RATELIMIT_MSG_ON_RELEASE);
error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
......@@ -1861,7 +1875,8 @@ xfs_init_zones(void)
xfs_ili_zone = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
SLAB_MEM_SPREAD, NULL);
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
......
......@@ -176,7 +176,6 @@ xfs_symlink(
return -ENAMETOOLONG;
ASSERT(pathlen > 0);
udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
/*
......
......@@ -1001,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
DEFINE_EVENT(xfs_loggrant_class, name, \
TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
TP_ARGS(log, tic))
DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
......@@ -1011,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
......
......@@ -9,6 +9,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_log_priv.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
......@@ -150,6 +151,7 @@ xfs_trans_reserve(
uint blocks,
uint rtextents)
{
struct xfs_mount *mp = tp->t_mountp;
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
......@@ -162,7 +164,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
if (error != 0) {
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
......@@ -191,9 +193,9 @@ xfs_trans_reserve(
if (tp->t_ticket != NULL) {
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
error = xfs_log_regrant(mp, tp->t_ticket);
} else {
error = xfs_log_reserve(tp->t_mountp,
error = xfs_log_reserve(mp,
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
......@@ -213,7 +215,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
error = xfs_mod_frextents(mp, -((int64_t)rtextents));
if (error) {
error = -ENOSPC;
goto undo_log;
......@@ -229,7 +231,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
......@@ -237,7 +239,7 @@ xfs_trans_reserve(
undo_blocks:
if (blocks > 0) {
xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
......@@ -1004,9 +1006,10 @@ __xfs_trans_commit(
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (commit_lsn == -1 && !error)
error = -EIO;
if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log))
xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
else
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
......@@ -1065,7 +1068,7 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
xfs_log_done(mp, tp->t_ticket, NULL, false);
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
......
......@@ -109,17 +109,25 @@ xfs_ail_next(
* We need the AIL lock in order to get a coherent read of the lsn of the last
* item in the AIL.
*/
static xfs_lsn_t
__xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
struct xfs_log_item *lip = xfs_ail_min(ailp);
if (lip)
return lip->li_lsn;
return 0;
}
xfs_lsn_t
xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
xfs_lsn_t lsn = 0;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
spin_lock(&ailp->ail_lock);
lip = xfs_ail_min(ailp);
if (lip)
lsn = lip->li_lsn;
lsn = __xfs_ail_min_lsn(ailp);
spin_unlock(&ailp->ail_lock);
return lsn;
......@@ -681,6 +689,28 @@ xfs_ail_push_all_sync(
finish_wait(&ailp->ail_empty, &wait);
}
void
xfs_ail_update_finish(
struct xfs_ail *ailp,
xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
{
struct xfs_mount *mp = ailp->ail_mount;
/* if the tail lsn hasn't changed, don't do updates or wakeups. */
if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
spin_unlock(&ailp->ail_lock);
return;
}
if (!XFS_FORCED_SHUTDOWN(mp))
xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
spin_unlock(&ailp->ail_lock);
xfs_log_space_wake(mp);
}
/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
......@@ -712,7 +742,7 @@ xfs_trans_ail_update_bulk(
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
struct xfs_log_item *mlip;
int mlip_changed = 0;
xfs_lsn_t tail_lsn = 0;
int i;
LIST_HEAD(tmp);
......@@ -727,9 +757,10 @@ xfs_trans_ail_update_bulk(
continue;
trace_xfs_ail_move(lip, lip->li_lsn, lsn);
if (mlip == lip && !tail_lsn)
tail_lsn = lip->li_lsn;
xfs_ail_delete(ailp, lip);
if (mlip == lip)
mlip_changed = 1;
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
......@@ -740,23 +771,23 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
xlog_assign_tail_lsn_locked(ailp->ail_mount);
spin_unlock(&ailp->ail_lock);
xfs_log_space_wake(ailp->ail_mount);
} else {
spin_unlock(&ailp->ail_lock);
}
xfs_ail_update_finish(ailp, tail_lsn);
}
bool
/*
* Delete one log item from the AIL.
*
* If this item was at the tail of the AIL, return the LSN of the log item so
* that we can use it to check if the LSN of the tail of the log has moved
* when finishing up the AIL delete process in xfs_ail_update_finish().
*/
xfs_lsn_t
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
xfs_lsn_t lsn = lip->li_lsn;
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
......@@ -764,7 +795,9 @@ xfs_ail_delete_one(
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
return mlip == lip;
if (mlip == lip)
return lsn;
return 0;
}
/**
......@@ -792,10 +825,10 @@ void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->ail_lock)
int shutdown_type)
{
struct xfs_mount *mp = ailp->ail_mount;
bool mlip_changed;
xfs_lsn_t tail_lsn;
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
......@@ -808,17 +841,8 @@ xfs_trans_ail_delete(
return;
}
mlip_changed = xfs_ail_delete_one(ailp, lip);
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(mp))
xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
}
spin_unlock(&ailp->ail_lock);
if (mlip_changed)
xfs_log_space_wake(ailp->ail_mount);
tail_lsn = xfs_ail_delete_one(ailp, lip);
xfs_ail_update_finish(ailp, tail_lsn);
}
int
......
......@@ -91,9 +91,11 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
__releases(ailp->ail_lock);
void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->ail_lock);
int shutdown_type);
static inline void
xfs_trans_ail_remove(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment