Commit 0d19d9e1 authored by Linus Torvalds

Merge tag 'ext4_for_linus-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Various ext4 bug fixes and cleanups. The fixes are mostly in the
  fstrim and mballoc code paths.

  Also enable dioread_nolock in the case where the block size is less
  than the page size (dioread_nolock has been default in the bs == ps
  case for quite some time)"

* tag 'ext4_for_linus-6.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix inconsistent between segment fstrim and full fstrim
  ext4: fallback to complex scan if aligned scan doesn't work
  ext4: convert ext4_da_do_write_end() to take a folio
  ext4: allow for the last group to be marked as trimmed
  ext4: move ext4_check_bdev_write_error() into nojournal mode
  jbd2: abort journal when detecting metadata writeback error of fs dev
  jbd2: remove unused 'JBD2_CHECKPOINT_IO_ERROR' and 'j_atomic_flags'
  jbd2: replace journal state flag by checking errseq
  jbd2: add errseq to detect client fs's bdev writeback error
  ext4: improving calculation of 'fe_{len|start}' in mb_find_extent()
  ext4: clarify handling of unwritten bh in __ext4_block_zero_page_range()
  ext4: treat end of range as exclusive in ext4_zero_range()
  ext4: enable dioread_nolock as default for bs < ps case
  ext4: delete redundant calculations in ext4_mb_get_buddy_page_lock()
  ext4: reduce unnecessary memory allocation in alloc_flex_gd()
  ext4: avoid online resizing failures due to oversized flex bg
  ext4: remove unnecessary check from alloc_flex_gd()
  ext4: unify the type of flexbg_size to unsigned int
parents 6bd593bc 68da4c44
......@@ -235,8 +235,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
might_sleep();
ext4_check_bdev_write_error(sb);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err) {
......@@ -244,7 +242,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle, err);
return err;
}
}
} else
ext4_check_bdev_write_error(sb);
if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
......
......@@ -4523,7 +4523,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* Round up offset. This is not fallocate, we need to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
* range.
* range. Here, start and partial_begin are inclusive, end and
* partial_end are exclusive.
*/
start = round_up(offset, 1 << blkbits);
end = round_down((offset + len), 1 << blkbits);
......@@ -4609,7 +4610,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* disk in case of crash before zeroing trans is committed.
*/
if (ext4_should_journal_data(inode)) {
ret = filemap_write_and_wait_range(mapping, start, end);
ret = filemap_write_and_wait_range(mapping, start,
end - 1);
if (ret) {
filemap_invalidate_unlock(mapping);
goto out_mutex;
......
......@@ -2947,7 +2947,7 @@ static int ext4_da_should_update_i_disksize(struct folio *folio,
static int ext4_da_do_write_end(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page)
struct folio *folio)
{
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
......@@ -2958,12 +2958,13 @@ static int ext4_da_do_write_end(struct address_space *mapping,
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
* flag, which is all that's needed to trigger page writeback.
*/
copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL);
copied = block_write_end(NULL, mapping, pos, len, copied,
&folio->page, NULL);
new_i_size = pos + copied;
/*
* It's important to update i_size while still holding page lock,
* because page writeout could otherwise come in and zero beyond
* It's important to update i_size while still holding folio lock,
* because folio writeout could otherwise come in and zero beyond
* i_size.
*
* Since we are holding inode lock, we are sure i_disksize <=
......@@ -2981,14 +2982,14 @@ static int ext4_da_do_write_end(struct address_space *mapping,
i_size_write(inode, new_i_size);
end = (new_i_size - 1) & (PAGE_SIZE - 1);
if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) {
if (copied && ext4_da_should_update_i_disksize(folio, end)) {
ext4_update_i_disksize(inode, new_i_size);
disksize_changed = true;
}
}
unlock_page(page);
put_page(page);
folio_unlock(folio);
folio_put(folio);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
......@@ -3027,10 +3028,10 @@ static int ext4_da_write_end(struct file *file,
return ext4_write_inline_data_end(inode, pos, len, copied,
folio);
if (unlikely(copied < len) && !PageUptodate(page))
if (unlikely(copied < len) && !folio_test_uptodate(folio))
copied = 0;
return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page);
return ext4_da_do_write_end(mapping, pos, len, copied, folio);
}
/*
......@@ -3630,6 +3631,12 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
/*
* Here we can't skip an unwritten buffer even though it usually reads zero
* because it might have data in pagecache (eg, if called from ext4_zero_range,
* ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
* racing writeback can come later and flush the stale pagecache to disk.
*/
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
......
......@@ -1456,9 +1456,8 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
return 0;
}
block++;
pnum = block / blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, gfp);
/* blocks_per_page == 1, hence we need another page for the buddy */
page = find_or_create_page(inode->i_mapping, block + 1, gfp);
if (!page)
return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
......@@ -1958,8 +1957,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
int max, order;
int max, order, next;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
......@@ -1977,16 +1975,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
/* find actual order */
order = mb_find_order_for_block(e4b, block);
block = block >> order;
ex->fe_len = 1 << order;
ex->fe_start = block << order;
ex->fe_len = (1 << order) - (block & ((1 << order) - 1));
ex->fe_start = block;
ex->fe_group = e4b->bd_group;
/* calc difference from given start */
next = next - ex->fe_start;
ex->fe_len -= next;
ex->fe_start += next;
block = block >> order;
while (needed > ex->fe_len &&
mb_find_buddy(e4b, order, &max)) {
......@@ -2895,14 +2889,19 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_groups_scanned++;
if (cr == CR_POWER2_ALIGNED)
ext4_mb_simple_scan_group(ac, &e4b);
else if ((cr == CR_GOAL_LEN_FAST ||
cr == CR_BEST_AVAIL_LEN) &&
sbi->s_stripe &&
else {
bool is_stripe_aligned = sbi->s_stripe &&
!(ac->ac_g_ex.fe_len %
EXT4_B2C(sbi, sbi->s_stripe)))
EXT4_B2C(sbi, sbi->s_stripe));
if ((cr == CR_GOAL_LEN_FAST ||
cr == CR_BEST_AVAIL_LEN) &&
is_stripe_aligned)
ext4_mb_scan_aligned(ac, &e4b);
else
if (ac->ac_status == AC_STATUS_CONTINUE)
ext4_mb_complex_scan_group(ac, &e4b);
}
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
......@@ -6735,11 +6734,16 @@ __acquires(bitlock)
static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
ext4_group_t grp)
{
if (grp < ext4_get_groups_count(sb))
return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
ext4_group_first_block_no(sb, grp) - 1) >>
EXT4_CLUSTER_BITS(sb);
unsigned long nr_clusters_in_group;
if (grp < (ext4_get_groups_count(sb) - 1))
nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
else
nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
ext4_group_first_block_no(sb, grp))
>> EXT4_CLUSTER_BITS(sb);
return nr_clusters_in_group - 1;
}
static bool ext4_trim_interrupted(void)
......@@ -6753,13 +6757,15 @@ static int ext4_try_to_trim_range(struct super_block *sb,
__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
ext4_grpblk_t next, count, free_count;
ext4_grpblk_t next, count, free_count, last, origin_start;
bool set_trimmed = false;
void *bitmap;
last = ext4_last_grp_cluster(sb, e4b->bd_group);
bitmap = e4b->bd_bitmap;
if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
if (start == 0 && max >= last)
set_trimmed = true;
origin_start = start;
start = max(e4b->bd_info->bb_first_free, start);
count = 0;
free_count = 0;
......@@ -6768,7 +6774,10 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
start = mb_find_next_zero_bit(bitmap, max + 1, start);
if (start > max)
break;
next = mb_find_next_bit(bitmap, max + 1, start);
next = mb_find_next_bit(bitmap, last + 1, start);
if (origin_start == 0 && next >= last)
set_trimmed = true;
if ((next - start) >= minblocks) {
int ret = ext4_trim_extent(sb, start, next - start, e4b);
......
......@@ -218,35 +218,53 @@ struct ext4_new_flex_group_data {
in the flex group */
__u16 *bg_flags; /* block group flags of groups
in @groups */
ext4_group_t resize_bg; /* number of allocated
new_group_data */
ext4_group_t count; /* number of groups in @groups
*/
};
/*
* Avoid memory allocation failures caused by adding too many groups at once.
*/
#define MAX_RESIZE_BG 16384
/*
* alloc_flex_gd() allocates an ext4_new_flex_group_data with size of
* @flexbg_size.
*
* Returns NULL on failure otherwise address of the allocated structure.
*/
static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size,
ext4_group_t o_group, ext4_group_t n_group)
{
ext4_group_t last_group;
struct ext4_new_flex_group_data *flex_gd;
flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
if (flex_gd == NULL)
goto out3;
if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
goto out2;
flex_gd->count = flexbg_size;
if (unlikely(flexbg_size > MAX_RESIZE_BG))
flex_gd->resize_bg = MAX_RESIZE_BG;
else
flex_gd->resize_bg = flexbg_size;
flex_gd->groups = kmalloc_array(flexbg_size,
/* Avoid allocating large 'groups' array if not needed */
last_group = o_group | (flex_gd->resize_bg - 1);
if (n_group <= last_group)
flex_gd->resize_bg = 1 << fls(n_group - o_group + 1);
else if (n_group - last_group < flex_gd->resize_bg)
flex_gd->resize_bg = 1 << max(fls(last_group - o_group + 1),
fls(n_group - last_group));
flex_gd->groups = kmalloc_array(flex_gd->resize_bg,
sizeof(struct ext4_new_group_data),
GFP_NOFS);
if (flex_gd->groups == NULL)
goto out2;
flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16),
flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16),
GFP_NOFS);
if (flex_gd->bg_flags == NULL)
goto out1;
......@@ -283,7 +301,7 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
*/
static int ext4_alloc_group_tables(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
int flexbg_size)
unsigned int flexbg_size)
{
struct ext4_new_group_data *group_data = flex_gd->groups;
ext4_fsblk_t start_blk;
......@@ -384,12 +402,12 @@ static int ext4_alloc_group_tables(struct super_block *sb,
group = group_data[0].group;
printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
"%d groups, flexbg size is %d:\n", flex_gd->count,
"%u groups, flexbg size is %u:\n", flex_gd->count,
flexbg_size);
for (i = 0; i < flex_gd->count; i++) {
ext4_debug(
"adding %s group %u: %u blocks (%d free, %d mdata blocks)\n",
"adding %s group %u: %u blocks (%u free, %u mdata blocks)\n",
ext4_bg_has_super(sb, group + i) ? "normal" :
"no-super", group + i,
group_data[i].blocks_count,
......@@ -1605,8 +1623,7 @@ static int ext4_flex_group_add(struct super_block *sb,
static int ext4_setup_next_flex_gd(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
ext4_fsblk_t n_blocks_count,
unsigned long flexbg_size)
ext4_fsblk_t n_blocks_count)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
......@@ -1630,7 +1647,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
BUG_ON(last);
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
last_group = group | (flexbg_size - 1);
last_group = group | (flex_gd->resize_bg - 1);
if (last_group > n_group)
last_group = n_group;
......@@ -1990,8 +2007,9 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
ext4_fsblk_t o_blocks_count;
ext4_fsblk_t n_blocks_count_retry = 0;
unsigned long last_update_time = 0;
int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
int err = 0;
int meta_bg;
unsigned int flexbg_size = ext4_flex_bg_size(sbi);
/* See if the device is actually as big as what was requested */
bh = ext4_sb_bread(sb, n_blocks_count - 1, 0);
......@@ -2123,7 +2141,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
if (err)
goto out;
flex_gd = alloc_flex_gd(flexbg_size);
flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group);
if (flex_gd == NULL) {
err = -ENOMEM;
goto out;
......@@ -2132,8 +2150,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
/* Add flex groups. Note that a regular group is a
* flex group with 1 group.
*/
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
flexbg_size)) {
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) {
if (time_is_before_jiffies(last_update_time + HZ * 10)) {
if (last_update_time)
ext4_msg(sb, KERN_INFO,
......
......@@ -2793,15 +2793,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
return -EINVAL;
}
if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) {
int blocksize =
BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (blocksize < PAGE_SIZE)
ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an "
"experimental mount option 'dioread_nolock' "
"for blocksize < PAGE_SIZE");
}
err = ext4_check_test_dummy_encryption(fc, sb);
if (err)
return err;
......@@ -4410,7 +4401,7 @@ static void ext4_set_def_opts(struct super_block *sb,
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
if (sb->s_blocksize == PAGE_SIZE)
if (sb->s_blocksize <= PAGE_SIZE)
set_opt(sb, DIOREAD_NOLOCK);
}
......
......@@ -556,7 +556,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
struct buffer_head *bh = jh2bh(jh);
JBUFFER_TRACE(jh, "entry");
......@@ -569,16 +568,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
JBUFFER_TRACE(jh, "removing from transaction");
/*
* If we have failed to write the buffer out to disk, the filesystem
* may become inconsistent. We cannot abort the journal here since
* we hold j_list_lock and we have to be careful about races with
* jbd2_journal_destroy(). So mark the writeback IO error in the
* journal here and we abort the journal later from a better context.
*/
if (buffer_write_io_error(bh))
set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags);
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
percpu_counter_dec(&journal->j_checkpoint_jh_count);
......
......@@ -1534,6 +1534,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
journal->j_total_len = len;
jbd2_init_fs_dev_write_error(journal);
err = journal_load_superblock(journal);
if (err)
......@@ -1861,7 +1862,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
if (is_journal_aborted(journal))
return -EIO;
if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) {
if (jbd2_check_fs_dev_write_error(journal)) {
jbd2_journal_abort(journal, -EIO);
return -EIO;
}
......@@ -2159,12 +2160,12 @@ int jbd2_journal_destroy(journal_t *journal)
/*
* OK, all checkpoint transactions have been checked, now check the
* write out io error flag and abort the journal if some buffer failed
* to write back to the original location, otherwise the filesystem
* may become inconsistent.
* writeback errseq of fs dev and abort the journal if some buffer
* failed to write back to the original location, otherwise the
* filesystem may become inconsistent.
*/
if (!is_journal_aborted(journal) &&
test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags))
jbd2_check_fs_dev_write_error(journal))
jbd2_journal_abort(journal, -EIO);
if (journal->j_sb_buffer) {
......
......@@ -289,8 +289,6 @@ int jbd2_journal_recover(journal_t *journal)
journal_superblock_t * sb;
struct recovery_info info;
errseq_t wb_err;
struct address_space *mapping;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
......@@ -308,9 +306,6 @@ int jbd2_journal_recover(journal_t *journal)
return 0;
}
wb_err = 0;
mapping = journal->j_fs_dev->bd_inode->i_mapping;
errseq_check_and_advance(&mapping->wb_err, &wb_err);
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
......@@ -334,7 +329,7 @@ int jbd2_journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
err2 = jbd2_check_fs_dev_write_error(journal);
if (!err)
err = err2;
/* Make sure all replayed data is on permanent storage */
......
......@@ -1231,11 +1231,25 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
struct journal_head *jh;
journal_t *journal;
int rc;
if (is_handle_aborted(handle))
return -EROFS;
journal = handle->h_transaction->t_journal;
if (jbd2_check_fs_dev_write_error(journal)) {
/*
* If the fs dev has writeback errors, it may have failed
* to async write out metadata buffers in the background.
* In this case, we could read old data from disk and write
* it out again, which may lead to on-disk filesystem
* inconsistency. Aborting the journal prevents this.
*/
jbd2_journal_abort(journal, -EIO);
return -EIO;
}
if (jbd2_write_access_granted(handle, bh, false))
return 0;
......
......@@ -755,11 +755,6 @@ struct journal_s
*/
unsigned long j_flags;
/**
* @j_atomic_flags: Atomic journaling state flags.
*/
unsigned long j_atomic_flags;
/**
* @j_errno:
*
......@@ -998,6 +993,13 @@ struct journal_s
*/
struct block_device *j_fs_dev;
/**
* @j_fs_dev_wb_err:
*
* Records the errseq of the client fs's backing block device.
*/
errseq_t j_fs_dev_wb_err;
/**
* @j_total_len: Total maximum capacity of the journal region on disk.
*/
......@@ -1399,12 +1401,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT)
#define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \
JBD2_JOURNAL_FLUSH_ZEROOUT)
/*
* Journal atomic flag definitions
*/
#define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing
* buffer back to disk */
/*
* Function declarations for the journaling transaction and buffer
* management
......@@ -1698,6 +1694,25 @@ static inline void jbd2_journal_abort_handle(handle_t *handle)
handle->h_aborted = 1;
}
/*
 * jbd2_init_fs_dev_write_error - record the fs device's current wb_err.
 * @journal: journal whose j_fs_dev_wb_err field receives the sample.
 *
 * Stores the present wb_err value of the client filesystem's block-device
 * mapping in journal->j_fs_dev_wb_err.  That saved value is the baseline
 * against which later asynchronous metadata write errors on the client
 * filesystem's device can be detected.
 */
static inline void jbd2_init_fs_dev_write_error(journal_t *journal)
{
	struct address_space *bdev_mapping;

	bdev_mapping = journal->j_fs_dev->bd_inode->i_mapping;
	errseq_check_and_advance(&bdev_mapping->wb_err,
				 &journal->j_fs_dev_wb_err);
}
/*
 * jbd2_check_fs_dev_write_error - test the fs device for a writeback error.
 * @journal: journal holding the previously saved j_fs_dev_wb_err value.
 *
 * Passes the wb_err of the client filesystem's block-device mapping and the
 * value saved in journal->j_fs_dev_wb_err to errseq_check().  The saved
 * value is read once via READ_ONCE() and is not modified here.
 *
 * Return: the result of errseq_check(); presumably 0 when nothing new has
 * been recorded since the saved sample and an error code otherwise (confirm
 * against the errseq documentation).
 */
static inline int jbd2_check_fs_dev_write_error(journal_t *journal)
{
	struct address_space *bdev_mapping;

	bdev_mapping = journal->j_fs_dev->bd_inode->i_mapping;
	return errseq_check(&bdev_mapping->wb_err,
			    READ_ONCE(journal->j_fs_dev_wb_err));
}
#endif /* __KERNEL__ */
/* Comparison functions for transaction IDs: perform comparisons using
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment