Commit b1c5ebb2 authored by Dave Chinner, committed by Dave Chinner

xfs: allocate log vector buffers outside CIL context lock

One of the problems we currently have with delayed logging is that
under serious memory pressure we can deadlock memory reclaim. This
occurs when memory reclaim (such as run by kswapd) is reclaiming XFS
inodes and issues a log force to unpin inodes that are dirty in the
CIL.

The CIL is pushed, but this will only occur once it gets the CIL
context lock to ensure that all committing transactions are complete
and no new transactions start being committed to the CIL while the
push switches to a new context.

The deadlock occurs when the CIL context lock is held by a
committing process that is doing memory allocation for log vector
buffers, and that allocation is then blocked on memory reclaim
making progress. Memory reclaim, however, is blocked waiting for
a log force to make progress, and so we effectively deadlock at this
point.

To solve this problem, we have to move the CIL log vector buffer
allocation outside of the context lock so that memory reclaim can
always make progress when it needs to force the log. The problem
with doing this is that a CIL push can take place while we are
determining if we need to allocate a new log vector buffer for
an item and hence the current log vector may go away without
warning. That means we cannot rely on the existing log vector being
present when we finally grab the context lock and so we must have a
replacement buffer ready to go at all times.
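
The commit-path changes themselves are not visible in the hunks shown
below, so here is a rough sketch, in plain C, of the pre-allocation step
described above. It is illustrative only, not the literal kernel code;
the transaction item walk and the buffer sizing are simplified
assumptions.

/*
 * Sketch: before taking the CIL context lock, walk the items in the
 * committing transaction and make sure each one has a shadow log vector
 * buffer large enough for this modification. Allocation here may block
 * on memory reclaim, but no CIL lock is held, so reclaim can still
 * force the log and make progress.
 */
static void
xlog_cil_alloc_shadow_bufs(
	struct xlog		*log,
	struct xfs_trans	*tp)
{
	struct xfs_log_item	*lip;

	list_for_each_entry(lip, &tp->t_items, li_trans) {
		int	niovecs = 0;
		int	nbytes = 0;

		/* each log item type reports how much space it will format */
		lip->li_ops->iop_size(lip, &niovecs, &nbytes);
		nbytes += niovecs * sizeof(struct xlog_op_header);

		/* replace the shadow buffer if it is missing or too small */
		if (!lip->li_lv_shadow ||
		    nbytes > lip->li_lv_shadow->lv_size) {
			kmem_free(lip->li_lv_shadow);
			lip->li_lv_shadow = kmem_alloc_large(
					sizeof(struct xfs_log_vec) + nbytes,
					KM_SLEEP | KM_NOFS);
			lip->li_lv_shadow->lv_size = nbytes;
			lip->li_lv_shadow->lv_item = lip;
		}
	}
}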

To ensure this, introduce a "shadow log vector" buffer that is
always guaranteed to be present when we gain the CIL context lock
and format the item. This shadow buffer may or may not be used
during the formatting, but if the log item does not have an existing
log vector buffer or that buffer is too small for the new
modifications, we swap it for the new shadow buffer and format
the modifications into that new log vector buffer.
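
Once the context lock is held, formatting only has to decide whether to
keep the current log vector or swap in the pre-allocated shadow; it never
allocates. The helper below is a hypothetical sketch of that decision
(the function name is invented), condensing the description above:

/*
 * Sketch: choose the buffer an item will be formatted into. The shadow
 * buffer was allocated before the CIL context lock was taken, so no
 * memory allocation is needed at this point.
 */
static void
xlog_cil_pick_format_buffer(
	struct xfs_log_item	*lip)
{
	struct xfs_log_vec	*lv = lip->li_lv;
	struct xfs_log_vec	*shadow = lip->li_lv_shadow;

	if (!lv) {
		/* first modification in this checkpoint: consume the shadow */
		lv = shadow;
	} else if (shadow->lv_size > lv->lv_size) {
		/* existing vector too small for the new changes: swap it in */
		lv = shadow;
	}
	/* otherwise the existing log vector is still big enough; reuse it */

	lip->li_lv = lv;
	/* the item's ->iop_format() then writes its dirty regions into lv */
}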

The result of this is that for any object we modify more than once
in a given CIL checkpoint, we double the memory required
to track dirty regions in the log. For single modifications, we consume
the shadow log vector we allocate on commit, and that gets
consumed by the checkpoint. However, if we make multiple
modifications, then the second transaction commit will allocate a
shadow log vector and hence we will end up with double the memory
usage as only one of the log vectors is consumed by the CIL
checkpoint. The remaining shadow vector will be freed when the log
item is freed.

This can probably be optimised in future - access to the shadow log
vector is serialised by the object lock (as opposed to the active
log vector, which is controlled by the CIL context lock) and so we
can probably free the shadow log vector from some objects when the log
item is marked clean on removal from the AIL.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
parent 160ae76f
@@ -949,6 +949,7 @@ xfs_buf_item_free(
 	xfs_buf_log_item_t	*bip)
 {
 	xfs_buf_item_free_format(bip);
+	kmem_free(bip->bli_item.li_lv_shadow);
 	kmem_zone_free(xfs_buf_item_zone, bip);
 }
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
 {
 	ASSERT(list_empty(&dqp->q_lru));
+	kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
 	mutex_destroy(&dqp->q_qlock);
 	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
 	spin_lock(&ailp->xa_lock);
 	xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+	kmem_free(qfs->qql_item.li_lv_shadow);
+	kmem_free(lip->li_lv_shadow);
 	kmem_free(qfs);
 	kmem_free(qfe);
 	return (xfs_lsn_t)-1;
@@ -40,6 +40,7 @@ void
 xfs_efi_item_free(
 	struct xfs_efi_log_item	*efip)
 {
+	kmem_free(efip->efi_item.li_lv_shadow);
 	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
 		kmem_free(efip);
 	else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 STATIC void
 xfs_efd_item_free(struct xfs_efd_log_item *efdp)
 {
+	kmem_free(efdp->efd_item.li_lv_shadow);
 	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
 		kmem_free(efdp);
 	else
@@ -651,6 +651,7 @@ void
 xfs_inode_item_destroy(
 	xfs_inode_t	*ip)
 {
+	kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
 	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
 }
(A further diff in this commit is collapsed and not shown here.)
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
 	/* delayed logging */
 	struct list_head		li_cil;		/* CIL pointers */
 	struct xfs_log_vec		*li_lv;		/* active log vector */
+	struct xfs_log_vec		*li_lv_shadow;	/* standby vector */
 	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;