Commit bbfeb614 authored by Dave Chinner

Merge branch 'xfs-4.8-buf-fixes' into for-next

parents f6371617 9c7504aa
...@@ -79,6 +79,47 @@ xfs_buf_vmap_len(
return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}
/*
* Bump the I/O in flight count on the buftarg if we haven't yet done so for
* this buffer. The count is incremented once per buffer (per hold cycle)
* because the corresponding decrement is deferred to buffer release. Buffers
* can undergo I/O multiple times in a hold-release cycle and per buffer I/O
tracking adds unnecessary overhead. This is used for synchronization purposes
* with unmount (see xfs_wait_buftarg()), so all we really need is a count of
* in-flight buffers.
*
* Buffers that are never released (e.g., superblock, iclog buffers) must set
* the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
* never reaches zero and unmount hangs indefinitely.
*/
static inline void
xfs_buf_ioacct_inc(
struct xfs_buf *bp)
{
if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
return;
ASSERT(bp->b_flags & XBF_ASYNC);
bp->b_flags |= _XBF_IN_FLIGHT;
percpu_counter_inc(&bp->b_target->bt_io_count);
}
/*
* Clear the in-flight state on a buffer about to be released to the LRU or
* freed and unaccount from the buftarg.
*/
static inline void
xfs_buf_ioacct_dec(
struct xfs_buf *bp)
{
if (!(bp->b_flags & _XBF_IN_FLIGHT))
return;
ASSERT(bp->b_flags & XBF_ASYNC);
bp->b_flags &= ~_XBF_IN_FLIGHT;
percpu_counter_dec(&bp->b_target->bt_io_count);
}
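
The helpers above implement a simple pattern: a per-buffer flag gates a shared counter so the counter moves at most once per hold/release cycle, no matter how many I/Os the buffer sees in between. The sketch below is a minimal user-space analogue of that idea, not the kernel code: the buf and target types are hypothetical, a C11 atomic stands in for the percpu counter, and the caller is assumed to serialize flag updates the way the buffer locks do in XFS.

    #include <stdatomic.h>

    #define BUF_NO_IOACCT (1u << 0) /* opt out of accounting (long-lived buffers) */
    #define BUF_IN_FLIGHT (1u << 1) /* already counted for this hold cycle */

    struct target { atomic_long io_count; };              /* stand-in for bt_io_count */
    struct buf { unsigned int flags; struct target *t; }; /* flag updates serialized by caller */

    /* Count the buffer at most once between acquisition and release. */
    void ioacct_inc(struct buf *bp)
    {
        if (bp->flags & (BUF_NO_IOACCT | BUF_IN_FLIGHT))
            return;
        bp->flags |= BUF_IN_FLIGHT;
        atomic_fetch_add(&bp->t->io_count, 1);
    }

    /* Undo the accounting exactly once, no matter how many I/Os were issued. */
    void ioacct_dec(struct buf *bp)
    {
        if (!(bp->flags & BUF_IN_FLIGHT))
            return;
        bp->flags &= ~BUF_IN_FLIGHT;
        atomic_fetch_sub(&bp->t->io_count, 1);
    }

Submission paths would call ioacct_inc() and release paths ioacct_dec(); resubmitting the same held buffer leaves io_count untouched, which is why the kernel helpers test _XBF_IN_FLIGHT before touching bt_io_count.
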
/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
...@@ -102,6 +143,14 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
/*
* Once the buffer is marked stale and unlocked, a subsequent lookup
* could reset b_flags. There is no guarantee that the buffer is
* unaccounted (released to LRU) before that occurs. Drop in-flight
* status now to preserve accounting consistency.
*/
xfs_buf_ioacct_dec(bp);
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
...@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
- bp = _xfs_buf_alloc(target, &map, 1, 0);
/* flags might contain irrelevant bits, pass only what we care about */
bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
if (unlikely(bp == NULL))
goto fail;
...@@ -866,63 +916,85 @@ xfs_buf_hold(
}
/*
* Release a hold on the specified buffer. If the hold count is 1, the buffer is
* placed on LRU or freed (depending on b_lru_ref).
*/
void
xfs_buf_rele(
xfs_buf_t *bp)
{
struct xfs_perag *pag = bp->b_pag;
bool release;
bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
if (!pag) {
ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
- if (atomic_dec_and_test(&bp->b_hold))
if (atomic_dec_and_test(&bp->b_hold)) {
xfs_buf_ioacct_dec(bp);
xfs_buf_free(bp);
}
return;
}
ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
ASSERT(atomic_read(&bp->b_hold) > 0);
- if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
spin_lock(&bp->b_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
* and it holds the only reference. This is racy because we
* haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
* ensures the decrement occurs only once per-buf.
*/
if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
xfs_buf_ioacct_dec(bp);
goto out_unlock;
}
/* the last reference has been dropped ... */
xfs_buf_ioacct_dec(bp);
if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
/*
* If the buffer is added to the LRU take a new reference to the
* buffer for the LRU and clear the (now stale) dispose list
* state flag
*/
if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
- spin_unlock(&bp->b_lock);
spin_unlock(&pag->pag_buf_lock);
} else {
/*
* most of the time buffers will already be removed from the
* LRU, so optimise that case by checking for the
* XFS_BSTATE_DISPOSE flag indicating the last list the buffer
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
- spin_unlock(&bp->b_lock);
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
- xfs_buf_free(bp);
freebuf = true;
}
- }
out_unlock:
spin_unlock(&bp->b_lock);
if (freebuf)
xfs_buf_free(bp);
}
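
The reworked xfs_buf_rele() above takes b_lock unconditionally, records its decision in release and freebuf, and leaves the actual xfs_buf_free() until every lock has been dropped. A generic sketch of that shape, with a hypothetical refcounted object and a pthread mutex standing in for the spinlocks, might look like:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct obj {
        pthread_mutex_t lock;
        int             holds;   /* reference count, protected by lock */
    };

    void obj_put(struct obj *o)
    {
        bool freeobj = false;

        pthread_mutex_lock(&o->lock);
        if (--o->holds == 0) {
            /* last reference: unhook from shared structures here, under the lock */
            freeobj = true;
        }
        pthread_mutex_unlock(&o->lock);

        /* destructive teardown happens only after the lock has been dropped */
        if (freeobj) {
            pthread_mutex_destroy(&o->lock);
            free(o);
        }
    }

Holding the lock across the whole body lets the accounting and LRU decisions be made consistently against concurrent lookups, while the freebuf flag keeps the teardown itself outside the locked region.
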
...@@ -1341,6 +1413,7 @@ xfs_buf_submit(
* xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
xfs_buf_ioacct_inc(bp);
_xfs_buf_ioapply(bp);
/*
...@@ -1526,13 +1599,19 @@ xfs_wait_buftarg(
int loop = 0;
/*
* First wait on the buftarg I/O count for all in-flight buffers to be
* released. This is critical as new buffers do not make the LRU until
* they are released.
*
* Next, flush the buffer workqueue to ensure all completion processing
* has finished. Just waiting on buffer locks is not sufficient for
* async IO as the reference count held over IO is not released until
* after the buffer lock is dropped. Hence we need to ensure here that
* all reference counts have been dropped before we start walking the
* LRU list.
*/
while (percpu_counter_sum(&btp->bt_io_count))
delay(100);
drain_workqueue(btp->bt_mount->m_buf_workqueue);
/* loop until there is nothing left on the lru list. */
...@@ -1629,6 +1708,8 @@ xfs_free_buftarg(
struct xfs_buftarg *btp)
{
unregister_shrinker(&btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
if (mp->m_flags & XFS_MOUNT_BARRIER)
...@@ -1693,6 +1774,9 @@ xfs_alloc_buftarg(
if (list_lru_init(&btp->bt_lru))
goto error;
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto error;
btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
...@@ -1834,7 +1918,7 @@ xfs_buf_delwri_submit_buffers(
* side. We need to move the buffer onto the io_list
* at this point so the caller can still access it.
*/
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
bp->b_flags |= XBF_WRITE | XBF_ASYNC;
if (wait_list) {
xfs_buf_hold(bp);
......
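
Taken together, the xfs_buf.c hunks give each buftarg a bt_io_count percpu counter: created in xfs_alloc_buftarg(), bumped in xfs_buf_submit(), dropped on release or stale, drained in xfs_wait_buftarg() and destroyed in xfs_free_buftarg(). A condensed user-space analogue of that lifecycle, using a single C11 atomic and nanosleep() in place of percpu_counter and delay() (names are illustrative), is:

    #include <stdatomic.h>
    #include <time.h>

    struct buftarg { atomic_long io_count; };  /* stand-in for bt_io_count */

    void buftarg_init(struct buftarg *t) { atomic_init(&t->io_count, 0); }
    void io_submit(struct buftarg *t)    { atomic_fetch_add(&t->io_count, 1); }
    void io_complete(struct buftarg *t)  { atomic_fetch_sub(&t->io_count, 1); }

    /* Unmount-style quiesce: poll until every in-flight buffer has been released. */
    void buftarg_wait(struct buftarg *t)
    {
        const struct timespec ts = { 0, 100 * 1000 * 1000 };  /* back off between polls */

        while (atomic_load(&t->io_count) != 0)
            nanosleep(&ts, NULL);
    }

The kernel side uses percpu_counter_sum() rather than a cheaper approximate read because unmount must see the count reach exactly zero; the analogue above sidesteps that question by using one shared atomic.
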
...@@ -43,6 +43,7 @@ typedef enum {
#define XBF_READ (1 << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
...@@ -62,6 +63,7 @@ typedef enum {
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
#define _XBF_COMPOUND (1 << 23)/* compound buffer */
#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */
typedef unsigned int xfs_buf_flags_t;
...@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }
{ _XBF_COMPOUND, "COMPOUND" }, \
{ _XBF_IN_FLIGHT, "IN_FLIGHT" }
/*
...@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
/* LRU control structures */
struct shrinker bt_shrinker;
struct list_lru bt_lru;
struct percpu_counter bt_io_count;
} xfs_buftarg_t;
struct xfs_buf;
......
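
The XFS_BUF_FLAGS table that gains the IN_FLIGHT entry above pairs each flag with a printable name for the buffer tracepoints, so a new flag only shows up in trace output once it has a table entry. A small user-space sketch of the same flags-to-name decoding (hypothetical helper, not the kernel's __print_flags() machinery):

    #include <stddef.h>
    #include <stdio.h>

    #define XBF_NO_IOACCT  (1u << 3)
    #define _XBF_IN_FLIGHT (1u << 25)

    static const struct { unsigned int bit; const char *name; } buf_flag_names[] = {
        { XBF_NO_IOACCT,  "NO_IOACCT" },
        { _XBF_IN_FLIGHT, "IN_FLIGHT" },
    };

    /* Print a "NO_IOACCT|IN_FLIGHT"-style decode of a flags word. */
    void print_buf_flags(unsigned int flags)
    {
        const char *sep = "";

        for (size_t i = 0; i < sizeof(buf_flag_names) / sizeof(buf_flag_names[0]); i++) {
            if (flags & buf_flag_names[i].bit) {
                printf("%s%s", sep, buf_flag_names[i].name);
                sep = "|";
            }
        }
        printf("\n");
    }

Calling print_buf_flags(XBF_NO_IOACCT | _XBF_IN_FLIGHT) would print NO_IOACCT|IN_FLIGHT, which is roughly the visibility the extended table provides for b_flags in trace events.
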
...@@ -1081,6 +1081,8 @@ xfs_buf_iodone_callback_error(
trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
ASSERT(bp->b_iodone != NULL);
cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
/*
* If the write was asynchronous then no one will be looking for the
* error. If this is the first failure of this type, clear the error
...@@ -1088,12 +1090,11 @@ xfs_buf_iodone_callback_error(
* async write failure at least once, but we also need to set the buffer
* up to behave correctly now for repeated failures.
*/
if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
bp->b_last_error != bp->b_error) {
- bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
- XBF_DONE | XBF_WRITE_FAIL);
bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
bp->b_last_error = bp->b_error;
- bp->b_retries = 0;
if (cfg->retry_timeout && !bp->b_first_retry_time)
bp->b_first_retry_time = jiffies;
xfs_buf_ioerror(bp, 0);
...@@ -1105,7 +1106,6 @@ xfs_buf_iodone_callback_error(
* Repeated failure on an async write. Take action according to the
* error configuration we have been set up to use.
*/
- cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
++bp->b_retries > cfg->max_retries)
......
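
The xfs_buf_iodone_callback_error() hunks move the error-configuration lookup ahead of the first-failure branch so the retry clock is only started when a retry timeout is actually configured. Stripped of the XFS types, the bookkeeping amounts to something like the following sketch (hypothetical names, time(NULL) in place of jiffies, and a simplified notion of "first failure"):

    #include <stdbool.h>
    #include <time.h>

    struct err_cfg {
        int    max_retries;       /* negative means "retry forever" */
        time_t retry_timeout;     /* 0 means no time-based limit */
    };

    struct buf_err {
        int    last_error;        /* errno of the previous failure, 0 if none */
        int    retries;           /* failures seen since the last success */
        time_t first_retry_time;  /* when the current failure streak started */
    };

    /* Returns true if the write should simply be resubmitted. */
    bool handle_async_write_error(struct buf_err *b, const struct err_cfg *cfg, int error)
    {
        if (b->last_error != error) {
            /* first failure of this type: remember it and start the retry clock,
             * but only if a timeout is configured and only once per streak */
            b->last_error = error;
            if (cfg->retry_timeout && !b->first_retry_time)
                b->first_retry_time = time(NULL);
            return true;
        }

        /* repeated failure: give up once the configured retry budget is used up */
        if (cfg->max_retries >= 0 && ++b->retries > cfg->max_retries)
            return false;

        return true;
    }

A true return corresponds to rewriting the buffer; a false return corresponds to the permanent-error path the rest of the function (not shown in this hunk) takes once the retry budget is exhausted.
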
...@@ -1415,7 +1415,7 @@ xlog_alloc_log(
*/
error = -ENOMEM;
bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
- BTOBB(log->l_iclog_size), 0);
BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
if (!bp)
goto out_free_log;
...@@ -1454,7 +1454,8 @@ xlog_alloc_log(
prev_iclog = iclog;
bp = xfs_buf_get_uncached(mp->m_logdev_targp,
- BTOBB(log->l_iclog_size), 0);
BTOBB(log->l_iclog_size),
XBF_NO_IOACCT);
if (!bp)
goto out_free_iclog;
......
...@@ -272,13 +272,15 @@ xfs_readsb(
buf_ops = NULL;
/*
* Allocate a (locked) buffer to hold the superblock. This will be kept
* around at all times to optimize access to the superblock. Therefore,
* set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
* elevated.
*/
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0, &bp, buf_ops);
BTOBB(sector_size), XBF_NO_IOACCT, &bp,
buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
......
...@@ -634,6 +634,9 @@ xfs_error_get_cfg(
{
struct xfs_error_cfg *cfg;
if (error < 0)
error = -error;
switch (error) {
case EIO:
cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
......
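
The xfs_error_get_cfg() change normalizes the sign of the incoming error because kernel callers typically carry failures as negative errno values (such as -EIO), while the switch labels use the positive constants. A standalone illustration of that convention (illustrative lookup only, not the XFS table):

    #include <errno.h>
    #include <stdio.h>

    /* Map an error to a config slot regardless of whether the caller passed EIO or -EIO. */
    const char *error_class_name(int error)
    {
        if (error < 0)
            error = -error;     /* normalize -EIO to EIO before matching */

        switch (error) {
        case EIO:     return "EIO";
        case ENOSPC:  return "ENOSPC";
        case ENODEV:  return "ENODEV";
        default:      return "default";
        }
    }

    int main(void)
    {
        printf("%s %s\n", error_class_name(-EIO), error_class_name(EIO)); /* EIO EIO */
        return 0;
    }

Without the sign flip, a caller passing a negative error such as bp->b_error would always fall through to the default configuration.
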