Commit 155cd433 authored by Dave Chinner's avatar Dave Chinner

Merge branch 'xfs-4.9-log-recovery-fixes' into for-next

parents a1f45e66 5cd9cee9
......@@ -258,7 +258,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
char userdata, /* are we allocating data? */
int datatype, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
......@@ -269,6 +269,7 @@ xfs_alloc_compute_diff(
xfs_extlen_t newlen1=0; /* length with newbno1 */
xfs_extlen_t newlen2=0; /* length with newbno2 */
xfs_agblock_t wantend; /* end of target extent */
bool userdata = xfs_alloc_is_userdata(datatype);
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
......@@ -924,7 +925,7 @@ xfs_alloc_find_best_extent(
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment,
args->userdata, *sbnoa,
args->datatype, *sbnoa,
*slena, &new);
/*
......@@ -1108,7 +1109,7 @@ xfs_alloc_ag_vextent_near(
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment, args->userdata, ltbnoa,
args->alignment, args->datatype, ltbnoa,
ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
......@@ -1261,7 +1262,7 @@ xfs_alloc_ag_vextent_near(
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment, args->userdata, ltbnoa,
args->alignment, args->datatype, ltbnoa,
ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
......@@ -1278,7 +1279,7 @@ xfs_alloc_ag_vextent_near(
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
args->alignment, args->userdata, gtbnoa,
args->alignment, args->datatype, gtbnoa,
gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
......@@ -1338,7 +1339,7 @@ xfs_alloc_ag_vextent_near(
}
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
args->userdata, ltbnoa, ltlena, &ltnew);
args->datatype, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
......@@ -1617,9 +1618,9 @@ xfs_alloc_ag_vextent_small(
goto error0;
if (fbno != NULLAGBLOCK) {
xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
args->userdata);
xfs_alloc_allow_busy_reuse(args->datatype));
if (args->userdata) {
if (xfs_alloc_is_userdata(args->datatype)) {
xfs_buf_t *bp;
bp = xfs_btree_get_bufs(args->mp, args->tp,
......@@ -2099,7 +2100,7 @@ xfs_alloc_fix_freelist(
* somewhere else if we are not being asked to try harder at this
* point
*/
if (pag->pagf_metadata && args->userdata &&
if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
goto out_agbp_relse;
......@@ -2675,7 +2676,7 @@ xfs_alloc_vextent(
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
(mp->m_flags & XFS_MOUNT_32BITINODES)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
......@@ -2808,7 +2809,7 @@ xfs_alloc_vextent(
#endif
/* Zero the extent if we were asked to do so */
if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
error = xfs_zero_extent(args->ip, args->fsbno, args->len);
if (error)
goto error0;
......
......@@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg {
xfs_extlen_t len; /* output: actual size of extent */
xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
xfs_alloctype_t otype; /* original allocation type */
int datatype; /* mask defining data type treatment */
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
struct xfs_owner_info oinfo; /* owner of blocks being allocated */
enum xfs_ag_resv_type resv; /* block reservation to use */
} xfs_alloc_arg_t;
/*
* Defines for userdata
* Defines for datatype
*/
#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
static inline bool
xfs_alloc_is_userdata(int datatype)
{
return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
}
static inline bool
xfs_alloc_allow_busy_reuse(int datatype)
{
return (datatype & XFS_ALLOC_NOBUSY) == 0;
}
/* freespace limit calculations */
#define XFS_ALLOC_AGFL_RESERVE 4
......
......@@ -3348,7 +3348,8 @@ xfs_bmap_adjacent(
mp = ap->ip->i_mount;
nullfb = *ap->firstblock == NULLFSBLOCK;
rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
rt = XFS_IS_REALTIME_INODE(ap->ip) &&
xfs_alloc_is_userdata(ap->datatype);
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
/*
* If allocating at eof, and there's a previous real block,
......@@ -3624,7 +3625,7 @@ xfs_bmap_btalloc(
{
xfs_mount_t *mp; /* mount point structure */
xfs_alloctype_t atype = 0; /* type for allocation routines */
xfs_extlen_t align; /* minimum allocation alignment */
xfs_extlen_t align = 0; /* minimum allocation alignment */
xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
xfs_agnumber_t ag;
xfs_alloc_arg_t args;
......@@ -3647,7 +3648,8 @@ xfs_bmap_btalloc(
else if (mp->m_dalign)
stripe_align = mp->m_dalign;
align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
if (xfs_alloc_is_userdata(ap->datatype))
align = xfs_get_extsz_hint(ap->ip);
if (unlikely(align)) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 0, ap->eof, 0, ap->conv,
......@@ -3660,7 +3662,8 @@ xfs_bmap_btalloc(
nullfb = *ap->firstblock == NULLFSBLOCK;
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
if (nullfb) {
if (ap->userdata && xfs_inode_is_filestream(ap->ip)) {
if (xfs_alloc_is_userdata(ap->datatype) &&
xfs_inode_is_filestream(ap->ip)) {
ag = xfs_filestream_lookup_ag(ap->ip);
ag = (ag != NULLAGNUMBER) ? ag : 0;
ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0);
......@@ -3700,7 +3703,8 @@ xfs_bmap_btalloc(
* enough for the request. If one isn't found, then adjust
* the minimum allocation size to the largest space found.
*/
if (ap->userdata && xfs_inode_is_filestream(ap->ip))
if (xfs_alloc_is_userdata(ap->datatype) &&
xfs_inode_is_filestream(ap->ip))
error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
else
error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
......@@ -3784,8 +3788,8 @@ xfs_bmap_btalloc(
args.minleft = ap->minleft;
args.wasdel = ap->wasdel;
args.resv = XFS_AG_RESV_NONE;
args.userdata = ap->userdata;
if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
args.datatype = ap->datatype;
if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
args.ip = ap->ip;
error = xfs_alloc_vextent(&args);
......@@ -3879,7 +3883,8 @@ STATIC int
xfs_bmap_alloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
if (XFS_IS_REALTIME_INODE(ap->ip) &&
xfs_alloc_is_userdata(ap->datatype))
return xfs_bmap_rtalloc(ap);
return xfs_bmap_btalloc(ap);
}
......@@ -4204,15 +4209,21 @@ xfs_bmapi_allocate(
}
/*
* Indicate if this is the first user data in the file, or just any
* user data. And if it is userdata, indicate whether it needs to
* be initialised to zero during allocation.
* Set the data type being allocated. For the data fork, the first data
* in the file is treated differently to all other allocations. For the
* attribute fork, we only need to ensure the allocated range is not on
* the busy list.
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->userdata = (bma->offset == 0) ?
XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
bma->datatype = XFS_ALLOC_NOBUSY;
if (whichfork == XFS_DATA_FORK) {
if (bma->offset == 0)
bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
else
bma->datatype |= XFS_ALLOC_USERDATA;
}
if (bma->flags & XFS_BMAPI_ZERO)
bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
}
bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
......@@ -4482,7 +4493,7 @@ xfs_bmapi_write(
bma.tp = tp;
bma.ip = ip;
bma.total = total;
bma.userdata = 0;
bma.datatype = 0;
bma.dfops = dfops;
bma.firstblock = firstblock;
......
......@@ -54,7 +54,7 @@ struct xfs_bmalloca {
bool wasdel; /* replacing a delayed allocation */
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
char userdata;/* userdata mask */
int datatype;/* data type being allocated */
int flags;
};
......
......@@ -182,7 +182,7 @@ xfs_bmap_rtalloc(
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
/* Zero the extent if we were asked to do so */
if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {
error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
if (error)
return error;
......
......@@ -384,7 +384,7 @@ xfs_extent_busy_trim(
* If this is a metadata allocation, try to reuse the busy
* extent instead of trimming the allocation.
*/
if (!args->userdata &&
if (!xfs_alloc_is_userdata(args->datatype) &&
!(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
if (!xfs_extent_busy_update_extent(args->mp, args->pag,
busyp, fbno, flen,
......
......@@ -371,7 +371,8 @@ xfs_filestream_new_ag(
struct xfs_mount *mp = ip->i_mount;
xfs_extlen_t minlen = ap->length;
xfs_agnumber_t startag = 0;
int flags, err = 0;
int flags = 0;
int err = 0;
struct xfs_mru_cache_elem *mru;
*agp = NULLAGNUMBER;
......@@ -387,8 +388,10 @@ xfs_filestream_new_ag(
startag = (item->ag + 1) % mp->m_sb.sb_agcount;
}
flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
(ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0);
if (xfs_alloc_is_userdata(ap->datatype))
flags |= XFS_PICK_USERDATA;
if (ap->dfops->dop_low)
flags |= XFS_PICK_LOWSPACE;
err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
......
......@@ -413,7 +413,8 @@ struct xlog {
/* log record crc error injection factor */
uint32_t l_badcrc_factor;
#endif
/* log recovery lsn tracking (for buffer submission */
xfs_lsn_t l_recovery_lsn;
};
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
......
......@@ -44,6 +44,7 @@
#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_rmap_item.h"
#include "xfs_buf_item.h"
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
......@@ -381,6 +382,15 @@ xlog_recover_iodone(
SHUTDOWN_META_IO_ERROR);
}
}
/*
* On v5 supers, a bli could be attached to update the metadata LSN.
* Clean it up.
*/
if (bp->b_fspriv)
xfs_buf_item_relse(bp);
ASSERT(bp->b_fspriv == NULL);
bp->b_iodone = NULL;
xfs_buf_ioend(bp);
}
......@@ -2360,12 +2370,14 @@ static void
xlog_recover_validate_buf_type(
struct xfs_mount *mp,
struct xfs_buf *bp,
xfs_buf_log_format_t *buf_f)
xfs_buf_log_format_t *buf_f,
xfs_lsn_t current_lsn)
{
struct xfs_da_blkinfo *info = bp->b_addr;
__uint32_t magic32;
__uint16_t magic16;
__uint16_t magicda;
char *warnmsg = NULL;
/*
* We can only do post recovery validation on items on CRC enabled
......@@ -2404,31 +2416,27 @@ xlog_recover_validate_buf_type(
bp->b_ops = &xfs_rmapbt_buf_ops;
break;
default:
xfs_warn(mp, "Bad btree block magic!");
ASSERT(0);
warnmsg = "Bad btree block magic!";
break;
}
break;
case XFS_BLFT_AGF_BUF:
if (magic32 != XFS_AGF_MAGIC) {
xfs_warn(mp, "Bad AGF block magic!");
ASSERT(0);
warnmsg = "Bad AGF block magic!";
break;
}
bp->b_ops = &xfs_agf_buf_ops;
break;
case XFS_BLFT_AGFL_BUF:
if (magic32 != XFS_AGFL_MAGIC) {
xfs_warn(mp, "Bad AGFL block magic!");
ASSERT(0);
warnmsg = "Bad AGFL block magic!";
break;
}
bp->b_ops = &xfs_agfl_buf_ops;
break;
case XFS_BLFT_AGI_BUF:
if (magic32 != XFS_AGI_MAGIC) {
xfs_warn(mp, "Bad AGI block magic!");
ASSERT(0);
warnmsg = "Bad AGI block magic!";
break;
}
bp->b_ops = &xfs_agi_buf_ops;
......@@ -2438,8 +2446,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_GDQUOT_BUF:
#ifdef CONFIG_XFS_QUOTA
if (magic16 != XFS_DQUOT_MAGIC) {
xfs_warn(mp, "Bad DQUOT block magic!");
ASSERT(0);
warnmsg = "Bad DQUOT block magic!";
break;
}
bp->b_ops = &xfs_dquot_buf_ops;
......@@ -2451,16 +2458,14 @@ xlog_recover_validate_buf_type(
break;
case XFS_BLFT_DINO_BUF:
if (magic16 != XFS_DINODE_MAGIC) {
xfs_warn(mp, "Bad INODE block magic!");
ASSERT(0);
warnmsg = "Bad INODE block magic!";
break;
}
bp->b_ops = &xfs_inode_buf_ops;
break;
case XFS_BLFT_SYMLINK_BUF:
if (magic32 != XFS_SYMLINK_MAGIC) {
xfs_warn(mp, "Bad symlink block magic!");
ASSERT(0);
warnmsg = "Bad symlink block magic!";
break;
}
bp->b_ops = &xfs_symlink_buf_ops;
......@@ -2468,8 +2473,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_BLOCK_BUF:
if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
magic32 != XFS_DIR3_BLOCK_MAGIC) {
xfs_warn(mp, "Bad dir block magic!");
ASSERT(0);
warnmsg = "Bad dir block magic!";
break;
}
bp->b_ops = &xfs_dir3_block_buf_ops;
......@@ -2477,8 +2481,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_DATA_BUF:
if (magic32 != XFS_DIR2_DATA_MAGIC &&
magic32 != XFS_DIR3_DATA_MAGIC) {
xfs_warn(mp, "Bad dir data magic!");
ASSERT(0);
warnmsg = "Bad dir data magic!";
break;
}
bp->b_ops = &xfs_dir3_data_buf_ops;
......@@ -2486,8 +2489,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_FREE_BUF:
if (magic32 != XFS_DIR2_FREE_MAGIC &&
magic32 != XFS_DIR3_FREE_MAGIC) {
xfs_warn(mp, "Bad dir3 free magic!");
ASSERT(0);
warnmsg = "Bad dir3 free magic!";
break;
}
bp->b_ops = &xfs_dir3_free_buf_ops;
......@@ -2495,8 +2497,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_LEAF1_BUF:
if (magicda != XFS_DIR2_LEAF1_MAGIC &&
magicda != XFS_DIR3_LEAF1_MAGIC) {
xfs_warn(mp, "Bad dir leaf1 magic!");
ASSERT(0);
warnmsg = "Bad dir leaf1 magic!";
break;
}
bp->b_ops = &xfs_dir3_leaf1_buf_ops;
......@@ -2504,8 +2505,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DIR_LEAFN_BUF:
if (magicda != XFS_DIR2_LEAFN_MAGIC &&
magicda != XFS_DIR3_LEAFN_MAGIC) {
xfs_warn(mp, "Bad dir leafn magic!");
ASSERT(0);
warnmsg = "Bad dir leafn magic!";
break;
}
bp->b_ops = &xfs_dir3_leafn_buf_ops;
......@@ -2513,8 +2513,7 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_DA_NODE_BUF:
if (magicda != XFS_DA_NODE_MAGIC &&
magicda != XFS_DA3_NODE_MAGIC) {
xfs_warn(mp, "Bad da node magic!");
ASSERT(0);
warnmsg = "Bad da node magic!";
break;
}
bp->b_ops = &xfs_da3_node_buf_ops;
......@@ -2522,24 +2521,21 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_ATTR_LEAF_BUF:
if (magicda != XFS_ATTR_LEAF_MAGIC &&
magicda != XFS_ATTR3_LEAF_MAGIC) {
xfs_warn(mp, "Bad attr leaf magic!");
ASSERT(0);
warnmsg = "Bad attr leaf magic!";
break;
}
bp->b_ops = &xfs_attr3_leaf_buf_ops;
break;
case XFS_BLFT_ATTR_RMT_BUF:
if (magic32 != XFS_ATTR3_RMT_MAGIC) {
xfs_warn(mp, "Bad attr remote magic!");
ASSERT(0);
warnmsg = "Bad attr remote magic!";
break;
}
bp->b_ops = &xfs_attr3_rmt_buf_ops;
break;
case XFS_BLFT_SB_BUF:
if (magic32 != XFS_SB_MAGIC) {
xfs_warn(mp, "Bad SB block magic!");
ASSERT(0);
warnmsg = "Bad SB block magic!";
break;
}
bp->b_ops = &xfs_sb_buf_ops;
......@@ -2556,6 +2552,40 @@ xlog_recover_validate_buf_type(
xfs_blft_from_flags(buf_f));
break;
}
/*
* Nothing else to do in the case of a NULL current LSN as this means
* the buffer is more recent than the change in the log and will be
* skipped.
*/
if (current_lsn == NULLCOMMITLSN)
return;
if (warnmsg) {
xfs_warn(mp, warnmsg);
ASSERT(0);
}
/*
* We must update the metadata LSN of the buffer as it is written out to
* ensure that older transactions never replay over this one and corrupt
* the buffer. This can occur if log recovery is interrupted at some
* point after the current transaction completes, at which point a
* subsequent mount starts recovery from the beginning.
*
* Write verifiers update the metadata LSN from log items attached to
* the buffer. Therefore, initialize a bli purely to carry the LSN to
* the verifier. We'll clean it up in our ->iodone() callback.
*/
if (bp->b_ops) {
struct xfs_buf_log_item *bip;
ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_item_init(bp, mp);
bip = bp->b_fspriv;
bip->bli_item.li_lsn = current_lsn;
}
}
/*
......@@ -2569,7 +2599,8 @@ xlog_recover_do_reg_buffer(
struct xfs_mount *mp,
xlog_recover_item_t *item,
struct xfs_buf *bp,
xfs_buf_log_format_t *buf_f)
xfs_buf_log_format_t *buf_f,
xfs_lsn_t current_lsn)
{
int i;
int bit;
......@@ -2642,7 +2673,7 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
xlog_recover_validate_buf_type(mp, bp, buf_f);
xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
}
/*
......@@ -2685,7 +2716,7 @@ xlog_recover_do_dquot_buffer(
if (log->l_quotaoffs_flag & type)
return false;
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
return true;
}
......@@ -2773,7 +2804,8 @@ xlog_recover_buffer_pass2(
*/
lsn = xlog_recover_get_buf_lsn(mp, bp);
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
xlog_recover_validate_buf_type(mp, bp, buf_f);
trace_xfs_log_recover_buf_skip(log, buf_f);
xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
goto out_release;
}
......@@ -2789,7 +2821,7 @@ xlog_recover_buffer_pass2(
if (!dirty)
goto out_release;
} else {
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
}
/*
......@@ -3846,14 +3878,13 @@ STATIC int
xlog_recover_commit_trans(
struct xlog *log,
struct xlog_recover *trans,
int pass)
int pass,
struct list_head *buffer_list)
{
int error = 0;
int error2;
int items_queued = 0;
struct xlog_recover_item *item;
struct xlog_recover_item *next;
LIST_HEAD (buffer_list);
LIST_HEAD (ra_list);
LIST_HEAD (done_list);
......@@ -3876,7 +3907,7 @@ xlog_recover_commit_trans(
items_queued++;
if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
error = xlog_recover_items_pass2(log, trans,
&buffer_list, &ra_list);
buffer_list, &ra_list);
list_splice_tail_init(&ra_list, &done_list);
items_queued = 0;
}
......@@ -3894,15 +3925,14 @@ xlog_recover_commit_trans(
if (!list_empty(&ra_list)) {
if (!error)
error = xlog_recover_items_pass2(log, trans,
&buffer_list, &ra_list);
buffer_list, &ra_list);
list_splice_tail_init(&ra_list, &done_list);
}
if (!list_empty(&done_list))
list_splice_init(&done_list, &trans->r_itemq);
error2 = xfs_buf_delwri_submit(&buffer_list);
return error ? error : error2;
return error;
}
STATIC void
......@@ -4085,7 +4115,8 @@ xlog_recovery_process_trans(
char *dp,
unsigned int len,
unsigned int flags,
int pass)
int pass,
struct list_head *buffer_list)
{
int error = 0;
bool freeit = false;
......@@ -4109,7 +4140,8 @@ xlog_recovery_process_trans(
error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
break;
case XLOG_COMMIT_TRANS:
error = xlog_recover_commit_trans(log, trans, pass);
error = xlog_recover_commit_trans(log, trans, pass,
buffer_list);
/* success or fail, we are now done with this transaction. */
freeit = true;
break;
......@@ -4191,10 +4223,12 @@ xlog_recover_process_ophdr(
struct xlog_op_header *ohead,
char *dp,
char *end,
int pass)
int pass,
struct list_head *buffer_list)
{
struct xlog_recover *trans;
unsigned int len;
int error;
/* Do we understand who wrote this op? */
if (ohead->oh_clientid != XFS_TRANSACTION &&
......@@ -4221,8 +4255,39 @@ xlog_recover_process_ophdr(
return 0;
}
/*
* The recovered buffer queue is drained only once we know that all
* recovery items for the current LSN have been processed. This is
* required because:
*
* - Buffer write submission updates the metadata LSN of the buffer.
* - Log recovery skips items with a metadata LSN >= the current LSN of
* the recovery item.
* - Separate recovery items against the same metadata buffer can share
* a current LSN. I.e., consider that the LSN of a recovery item is
* defined as the starting LSN of the first record in which its
* transaction appears, that a record can hold multiple transactions,
* and/or that a transaction can span multiple records.
*
* In other words, we are allowed to submit a buffer from log recovery
* once per current LSN. Otherwise, we may incorrectly skip recovery
* items and cause corruption.
*
* We don't know up front whether buffers are updated multiple times per
* LSN. Therefore, track the current LSN of each commit log record as it
* is processed and drain the queue when it changes. Use commit records
* because they are ordered correctly by the logging code.
*/
if (log->l_recovery_lsn != trans->r_lsn &&
ohead->oh_flags & XLOG_COMMIT_TRANS) {
error = xfs_buf_delwri_submit(buffer_list);
if (error)
return error;
log->l_recovery_lsn = trans->r_lsn;
}
return xlog_recovery_process_trans(log, trans, dp, len,
ohead->oh_flags, pass);
ohead->oh_flags, pass, buffer_list);
}
/*
......@@ -4240,7 +4305,8 @@ xlog_recover_process_data(
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
char *dp,
int pass)
int pass,
struct list_head *buffer_list)
{
struct xlog_op_header *ohead;
char *end;
......@@ -4254,6 +4320,7 @@ xlog_recover_process_data(
if (xlog_header_check_recover(log->l_mp, rhead))
return -EIO;
trace_xfs_log_recover_record(log, rhead, pass);
while ((dp < end) && num_logops) {
ohead = (struct xlog_op_header *)dp;
......@@ -4262,7 +4329,7 @@ xlog_recover_process_data(
/* errors will abort recovery */
error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
dp, end, pass);
dp, end, pass, buffer_list);
if (error)
return error;
......@@ -4685,7 +4752,8 @@ xlog_recover_process(
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
char *dp,
int pass)
int pass,
struct list_head *buffer_list)
{
int error;
__le32 crc;
......@@ -4732,7 +4800,8 @@ xlog_recover_process(
if (error)
return error;
return xlog_recover_process_data(log, rhash, rhead, dp, pass);
return xlog_recover_process_data(log, rhash, rhead, dp, pass,
buffer_list);
}
STATIC int
......@@ -4793,9 +4862,11 @@ xlog_do_recovery_pass(
char *offset;
xfs_buf_t *hbp, *dbp;
int error = 0, h_size, h_len;
int error2 = 0;
int bblks, split_bblks;
int hblks, split_hblks, wrapped_hblks;
struct hlist_head rhash[XLOG_RHASH_SIZE];
LIST_HEAD (buffer_list);
ASSERT(head_blk != tail_blk);
rhead_blk = 0;
......@@ -4981,7 +5052,7 @@ xlog_do_recovery_pass(
}
error = xlog_recover_process(log, rhash, rhead, offset,
pass);
pass, &buffer_list);
if (error)
goto bread_err2;
......@@ -5012,7 +5083,8 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
error = xlog_recover_process(log, rhash, rhead, offset, pass);
error = xlog_recover_process(log, rhash, rhead, offset, pass,
&buffer_list);
if (error)
goto bread_err2;
......@@ -5025,10 +5097,17 @@ xlog_do_recovery_pass(
bread_err1:
xlog_put_bp(hbp);
/*
* Submit buffers that have been added from the last record processed,
* regardless of error status.
*/
if (!list_empty(&buffer_list))
error2 = xfs_buf_delwri_submit(&buffer_list);
if (error && first_bad)
*first_bad = rhead_blk;
return error;
return error ? error : error2;
}
/*
......
......@@ -933,6 +933,20 @@ xfs_mountfs(
goto out_rtunmount;
}
/*
* Now the log is fully replayed, we can transition to full read-only
* mode for read-only mounts. This will sync all the metadata and clean
* the log so that the recovery we just performed does not have to be
* replayed again on the next mount.
*
* We use the same quiesce mechanism as the rw->ro remount, as they are
* semantically identical operations.
*/
if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
XFS_MOUNT_RDONLY) {
xfs_quiesce_attr(mp);
}
/*
* Complete the quota initialisation, post-log-replay component.
*/
......
......@@ -1137,7 +1137,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
* Note: xfs_log_quiesce() stops background log work - the callers must ensure
* it is started again when appropriate.
*/
static void
void
xfs_quiesce_attr(
struct xfs_mount *mp)
{
......
......@@ -61,6 +61,7 @@ struct xfs_mount;
struct xfs_buftarg;
struct block_device;
extern void xfs_quiesce_attr(struct xfs_mount *mp);
extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
......
......@@ -1624,7 +1624,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__field(char, wasdel)
__field(char, wasfromfl)
__field(int, resv)
__field(char, userdata)
__field(int, datatype)
__field(xfs_fsblock_t, firstblock)
),
TP_fast_assign(
......@@ -1645,13 +1645,13 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->wasdel = args->wasdel;
__entry->wasfromfl = args->wasfromfl;
__entry->resv = args->resv;
__entry->userdata = args->userdata;
__entry->datatype = args->datatype;
__entry->firstblock = args->firstblock;
),
TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
"prod %u minleft %u total %u alignment %u minalignslop %u "
"len %u type %s otype %s wasdel %d wasfromfl %d resv %d "
"userdata %d firstblock 0x%llx",
"datatype 0x%x firstblock 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
......@@ -1669,7 +1669,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->wasdel,
__entry->wasfromfl,
__entry->resv,
__entry->userdata,
__entry->datatype,
(unsigned long long)__entry->firstblock)
)
......@@ -1985,6 +1985,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
TRACE_EVENT(xfs_log_recover_record,
TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
TP_ARGS(log, rhead, pass),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_lsn_t, lsn)
__field(int, len)
__field(int, num_logops)
__field(int, pass)
),
TP_fast_assign(
__entry->dev = log->l_mp->m_super->s_dev;
__entry->lsn = be64_to_cpu(rhead->h_lsn);
__entry->len = be32_to_cpu(rhead->h_len);
__entry->num_logops = be32_to_cpu(rhead->h_num_logops);
__entry->pass = pass;
),
TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->lsn, __entry->len, __entry->num_logops,
__entry->pass)
)
DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
TP_PROTO(struct xlog *log, struct xlog_recover *trans,
struct xlog_recover_item *item, int pass),
......@@ -1993,6 +2016,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
__field(dev_t, dev)
__field(unsigned long, item)
__field(xlog_tid_t, tid)
__field(xfs_lsn_t, lsn)
__field(int, type)
__field(int, pass)
__field(int, count)
......@@ -2002,15 +2026,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
__entry->dev = log->l_mp->m_super->s_dev;
__entry->item = (unsigned long)item;
__entry->tid = trans->r_log_tid;
__entry->lsn = trans->r_lsn;
__entry->type = ITEM_TYPE(item);
__entry->pass = pass;
__entry->count = item->ri_cnt;
__entry->total = item->ri_total;
),
TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
"item region count/total %d/%d",
TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, "
"item type %s item region count/total %d/%d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->tid,
__entry->lsn,
__entry->pass,
(void *)__entry->item,
__print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
......@@ -2069,6 +2095,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment