Commit 51ed42a8 authored by Linus Torvalds

Merge tag 'ext4_for_linus-6.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Many cleanups and bug fixes in ext4, especially for the fast commit
  feature.

  Also some performance improvements; in particular, improving IOPS and
  throughput on fast devices running Async Direct I/O by up to 20% by
  optimizing jbd2_transaction_committed()"

* tag 'ext4_for_linus-6.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (40 commits)
  ext4: make sure the first directory block is not a hole
  ext4: check dot and dotdot of dx_root before making dir indexed
  ext4: sanity check for NULL pointer after ext4_force_shutdown
  jbd2: increase maximum transaction size
  jbd2: drop pointless shrinker batch initialization
  jbd2: avoid infinite transaction commit loop
  jbd2: precompute number of transaction descriptor blocks
  jbd2: make jbd2_journal_get_max_txn_bufs() internal
  jbd2: avoid mount failed when commit block is partial submitted
  ext4: avoid writing unitialized memory to disk in EA inodes
  ext4: don't track ranges in fast_commit if inode has inlined data
  ext4: fix possible tid_t sequence overflows
  ext4: use ext4_update_inode_fsync_trans() helper in inode creation
  ext4: add missing MODULE_DESCRIPTION()
  jbd2: add missing MODULE_DESCRIPTION()
  ext4: use memtostr_pad() for s_volume_name
  jbd2: speed up jbd2_transaction_committed()
  ext4: make ext4_da_map_blocks() buffer_head unaware
  ext4: make ext4_insert_delayed_block() insert multi-blocks
  ext4: factor out a helper to check the cluster allocation state
  ...
parents dddebdec f9ca5159
@@ -2184,6 +2184,8 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to)
 	struct buffer_head *bh, *head;

 	bh = head = folio_buffers(folio);
+	if (!bh)
+		return;
 	blocksize = bh->b_size;
 	block_start = 0;
...
@@ -72,7 +72,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
 {
 	struct ext4_system_zone *new_entry, *entry;
 	struct rb_node **n = &system_blks->root.rb_node, *node;
-	struct rb_node *parent = NULL, *new_node = NULL;
+	struct rb_node *parent = NULL, *new_node;

 	while (*n) {
 		parent = *n;
...
@@ -1347,7 +1347,7 @@ struct ext4_super_block {
 /*60*/	__le32	s_feature_incompat;	/* incompatible feature set */
 	__le32	s_feature_ro_compat;	/* readonly-compatible feature set */
 /*68*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
-/*78*/	char	s_volume_name[EXT4_LABEL_MAX];	/* volume name */
+/*78*/	char	s_volume_name[EXT4_LABEL_MAX] __nonstring;	/* volume name */
 /*88*/	char	s_last_mounted[64] __nonstring;	/* directory where last mounted */
 /*C8*/	__le32	s_algorithm_usage_bitmap; /* For compression */
 	/*
...
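The __nonstring annotation added above matches how the label is actually stored: s_volume_name is a fixed 16-byte field that is not guaranteed to be NUL-terminated, and the attribute (GCC 8+, wrapped in compiler_attributes.h) tells the compiler that string functions may legitimately fill the whole array. A minimal user-space sketch of what the annotation buys; the struct and helper here are illustrative, not kernel code:

#include <string.h>

#define __nonstring __attribute__((__nonstring__))	/* as in compiler_attributes.h */

struct label {
	char name[16] __nonstring;	/* fixed-width, may lack a trailing NUL */
};

static void set_label(struct label *l, const char *src)
{
	/* With __nonstring, GCC does not warn that the NUL terminator may
	 * be dropped when all 16 bytes of the destination are used. */
	strncpy(l->name, src, sizeof(l->name));
}

int main(void)
{
	struct label l;

	set_label(&l, "a-very-long-volume-name");	/* keeps 16 bytes, no NUL */
	return 0;
}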
@@ -310,6 +310,8 @@ void ext4_es_find_extent_range(struct inode *inode,
 			       ext4_lblk_t lblk, ext4_lblk_t end,
 			       struct extent_status *es)
 {
+	es->es_lblk = es->es_len = es->es_pblk = 0;
+
 	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 		return;
@@ -2052,34 +2054,49 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
 }

 /*
- * ext4_es_insert_delayed_block - adds a delayed block to the extents status
- *                                tree, adding a pending reservation where
- *                                needed
+ * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
+ *                                 status tree, adding a pending reservation
+ *                                 where needed
  *
  * @inode - file containing the newly added block
- * @lblk - logical block to be added
- * @allocated - indicates whether a physical cluster has been allocated for
- *              the logical cluster that contains the block
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ * @lclu_allocated/end_allocated - indicates whether a physical cluster has
+ *                                 been allocated for the logical cluster
+ *                                 that contains the start/end block. Note that
+ *                                 end_allocated should always be set to false
+ *                                 if the start and the end block are in the
+ *                                 same cluster
  */
-void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
-				  bool allocated)
+void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+				   ext4_lblk_t len, bool lclu_allocated,
+				   bool end_allocated)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct extent_status newes;
+	ext4_lblk_t end = lblk + len - 1;
 	int err1 = 0, err2 = 0, err3 = 0;
 	struct extent_status *es1 = NULL;
 	struct extent_status *es2 = NULL;
-	struct pending_reservation *pr = NULL;
+	struct pending_reservation *pr1 = NULL;
+	struct pending_reservation *pr2 = NULL;

 	if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
 		return;

-	es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
-		 lblk, inode->i_ino);
+	es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
+		 lblk, len, inode->i_ino);
+	if (!len)
+		return;
+
+	WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
+		     end_allocated);

 	newes.es_lblk = lblk;
-	newes.es_len = 1;
+	newes.es_len = len;
 	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
-	trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+	trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+					    end_allocated);

 	ext4_es_insert_extent_check(inode, &newes);
@@ -2088,11 +2105,15 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
 		es1 = __es_alloc_extent(true);
 	if ((err1 || err2) && !es2)
 		es2 = __es_alloc_extent(true);
-	if ((err1 || err2 || err3) && allocated && !pr)
-		pr = __alloc_pending(true);
+	if (err1 || err2 || err3) {
+		if (lclu_allocated && !pr1)
+			pr1 = __alloc_pending(true);
+		if (end_allocated && !pr2)
+			pr2 = __alloc_pending(true);
+	}
 	write_lock(&EXT4_I(inode)->i_es_lock);

-	err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+	err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
 	if (err1 != 0)
 		goto error;
 	/* Free preallocated extent if it didn't get used. */
@@ -2112,13 +2133,22 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
 		es2 = NULL;
 	}

-	if (allocated) {
-		err3 = __insert_pending(inode, lblk, &pr);
+	if (lclu_allocated) {
+		err3 = __insert_pending(inode, lblk, &pr1);
 		if (err3 != 0)
 			goto error;
-		if (pr) {
-			__free_pending(pr);
-			pr = NULL;
+		if (pr1) {
+			__free_pending(pr1);
+			pr1 = NULL;
+		}
+	}
+	if (end_allocated) {
+		err3 = __insert_pending(inode, end, &pr2);
+		if (err3 != 0)
+			goto error;
+		if (pr2) {
+			__free_pending(pr2);
+			pr2 = NULL;
 		}
 	}
 error:
...
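The new lclu_allocated/end_allocated pair only matters on bigalloc filesystems, where several logical blocks share one allocation cluster and EXT4_B2C() (essentially blk >> s_cluster_bits) maps a block to its cluster. A small stand-alone sketch of the boundary case the WARN_ON_ONCE() above guards against; the names are illustrative:

#include <stdio.h>

/* Hypothetical stand-alone version of EXT4_B2C(). */
static unsigned b2c(unsigned blk, unsigned cluster_bits)
{
	return blk >> cluster_bits;
}

int main(void)
{
	unsigned cluster_bits = 4;	/* bigalloc: 16 blocks per cluster */
	unsigned lblk = 14, len = 6;	/* delayed extent [14, 20) */
	unsigned end = lblk + len - 1;

	/* Start and end fall into different clusters, so each end may need
	 * its own pending reservation; end_allocated is only meaningful in
	 * this case. */
	printf("start cluster %u, end cluster %u\n",
	       b2c(lblk, cluster_bits), b2c(end, cluster_bits));
	return 0;
}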
@@ -249,8 +249,9 @@ extern void ext4_exit_pending(void);
 extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
 extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
 extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
-					 bool allocated);
+extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+					  ext4_lblk_t len, bool lclu_allocated,
+					  bool end_allocated);
 extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
 					ext4_lblk_t len);
 extern void ext4_clear_inode_es(struct inode *inode);
...
@@ -353,7 +353,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
 		read_unlock(&sbi->s_journal->j_state_lock);
 	}
 	spin_lock(&sbi->s_fc_lock);
-	if (sbi->s_fc_ineligible_tid < tid)
+	if (tid_gt(tid, sbi->s_fc_ineligible_tid))
 		sbi->s_fc_ineligible_tid = tid;
 	spin_unlock(&sbi->s_fc_lock);
 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
@@ -649,6 +649,12 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 		return;

+	if (ext4_has_inline_data(inode)) {
+		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
+					handle);
+		return;
+	}
+
 	args.start = start;
 	args.end = end;
@@ -1207,7 +1213,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 	if (ret == -EALREADY) {
 		/* There was an ongoing commit, check if we need to restart */
 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
-		    commit_tid > journal->j_commit_sequence)
+		    tid_gt(commit_tid, journal->j_commit_sequence))
 			goto restart_fc;
 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
 					commit_tid);
@@ -1282,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 		list_del_init(&iter->i_fc_list);
 		ext4_clear_inode_state(&iter->vfs_inode,
 				       EXT4_STATE_FC_COMMITTING);
-		if (iter->i_sync_tid <= tid)
+		if (tid_geq(tid, iter->i_sync_tid))
 			ext4_fc_reset_inode(&iter->vfs_inode);
 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
 		smp_mb();
@@ -1313,7 +1319,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
 			 &sbi->s_fc_q[FC_Q_MAIN]);

-	if (tid >= sbi->s_fc_ineligible_tid) {
+	if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
 		sbi->s_fc_ineligible_tid = 0;
 		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 	}
...
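The switch from plain < and >= to tid_gt()/tid_geq() in the hunks above is the "fix possible tid_t sequence overflows" work: transaction IDs are 32-bit counters that wrap on long-running filesystems, so ordering has to be decided by signed difference rather than raw comparison. The jbd2 helpers in include/linux/jbd2.h boil down to:

typedef unsigned int tid_t;

static inline int tid_gt(tid_t x, tid_t y)
{
	int difference = (x - y);
	return (difference > 0);
}

static inline int tid_geq(tid_t x, tid_t y)
{
	int difference = (x - y);
	return (difference >= 0);
}

With this, tid 0x00000003 correctly compares as newer than 0xfffffff0 even though it is numerically smaller, because the unsigned subtraction wraps to a small positive signed value.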
@@ -1336,10 +1336,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
 		}
 	}

-	if (ext4_handle_valid(handle)) {
-		ei->i_sync_tid = handle->h_transaction->t_tid;
-		ei->i_datasync_tid = handle->h_transaction->t_tid;
-	}
+	ext4_update_inode_fsync_trans(handle, inode, 1);

 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
...
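ext4_update_inode_fsync_trans() is the existing helper in fs/ext4/ext4_jbd2.h that the open-coded assignments are folded into; it records the handle's transaction ID so a later fsync()/fdatasync() knows which transaction to wait for. Roughly (paraphrased; the exact guards may differ between versions):

/* Paraphrase of ext4_update_inode_fsync_trans() from fs/ext4/ext4_jbd2.h. */
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
						 struct inode *inode,
						 int datasync)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) {
		ei->i_sync_tid = handle->h_transaction->t_tid;
		if (datasync)
			ei->i_datasync_tid = handle->h_transaction->t_tid;
	}
}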
@@ -1410,7 +1410,11 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
 			hinfo->hash = EXT4_DIRENT_HASH(de);
 			hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
 		} else {
-			ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+			err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+			if (err) {
+				ret = err;
+				goto out;
+			}
 		}
 		if ((hinfo->hash < start_hash) ||
 		    ((hinfo->hash == start_hash) &&
...
@@ -279,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = {
 kunit_test_suites(&ext4_inode_test_suite);

+MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding");
 MODULE_LICENSE("GPL v2");
...
@@ -1151,7 +1151,7 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label)
 	BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);

 	lock_buffer(sbi->s_sbh);
-	strscpy_pad(label, sbi->s_es->s_volume_name);
+	memtostr_pad(label, sbi->s_es->s_volume_name);
 	unlock_buffer(sbi->s_sbh);
 	if (copy_to_user(user_label, label, sizeof(label)))
...
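strscpy_pad() stops at the first NUL and expects a NUL-terminated source, which a fully used 16-byte label is not; memtostr_pad() in linux/string.h is built for exactly this shape: copy from a fixed, possibly unterminated array into a destination that must come out NUL-terminated and zero-padded. A user-space approximation of its behaviour; the macro below is a simplified stand-in, not the kernel implementation:

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for memtostr_pad(): both arguments must be arrays
 * so sizeof() sees their real bounds, as the kernel macro enforces. */
#define memtostr_pad(dest, src)					\
	do {							\
		size_t n = strnlen(src, sizeof(src));		\
		if (n >= sizeof(dest))				\
			n = sizeof(dest) - 1;			\
		memset(dest, 0, sizeof(dest));			\
		memcpy(dest, src, n);				\
	} while (0)

int main(void)
{
	char s_volume_name[16] = "0123456789abcdef";	/* no room for a NUL */
	char label[17];

	memtostr_pad(label, s_volume_name);
	printf("%s\n", label);	/* all 16 label bytes, now NUL-terminated */
	return 0;
}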
@@ -151,10 +151,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 		return bh;
 	}

-	if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
+	/* The first directory block must not be a hole. */
+	if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) {
 		ext4_error_inode(inode, func, line, block,
-				 "Directory hole found for htree %s block",
-				 (type == INDEX) ? "index" : "leaf");
+				 "Directory hole found for htree %s block %u",
+				 (type == INDEX) ? "index" : "leaf", block);
 		return ERR_PTR(-EFSCORRUPTED);
 	}
 	if (!bh)
@@ -2172,6 +2173,52 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
 	return err ? err : err2;
 }

+static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
+{
+	struct fake_dirent *fde;
+	const char *error_msg;
+	unsigned int rlen;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
+	char *blockend = (char *)root + dir->i_sb->s_blocksize;
+
+	fde = &root->dot;
+	if (unlikely(fde->name_len != 1)) {
+		error_msg = "invalid name_len for '.'";
+		goto corrupted;
+	}
+	if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
+		error_msg = "invalid name for '.'";
+		goto corrupted;
+	}
+	rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
+	if (unlikely((char *)fde + rlen >= blockend)) {
+		error_msg = "invalid rec_len for '.'";
+		goto corrupted;
+	}
+
+	fde = &root->dotdot;
+	if (unlikely(fde->name_len != 2)) {
+		error_msg = "invalid name_len for '..'";
+		goto corrupted;
+	}
+	if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
+		error_msg = "invalid name for '..'";
+		goto corrupted;
+	}
+	rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
+	if (unlikely((char *)fde + rlen >= blockend)) {
+		error_msg = "invalid rec_len for '..'";
+		goto corrupted;
+	}
+
+	return true;
+
+corrupted:
+	EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended",
+			 error_msg);
+	return false;
+}
+
 /*
  * This converts a one block unindexed directory to a 3 block indexed
  * directory, and adds the dentry to the indexed directory.
@@ -2206,17 +2253,17 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
 		brelse(bh);
 		return retval;
 	}

 	root = (struct dx_root *) bh->b_data;
+	if (!ext4_check_dx_root(dir, root)) {
+		brelse(bh);
+		return -EFSCORRUPTED;
+	}

 	/* The 0th block becomes the root, move the dirents out */
 	fde = &root->dotdot;
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
 		ext4_rec_len_from_disk(fde->rec_len, blocksize));
-	if ((char *) de >= (((char *) root) + blocksize)) {
-		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
-		brelse(bh);
-		return -EFSCORRUPTED;
-	}
 	len = ((char *) root) + (blocksize - csum_size) - (char *) de;

 	/* Allocate new block for the 0th block's dirents */
@@ -3038,10 +3085,7 @@ bool ext4_empty_dir(struct inode *inode)
 		EXT4_ERROR_INODE(inode, "invalid size");
 		return false;
 	}
-	/* The first directory block must not be a hole,
-	 * so treat it as DIRENT_HTREE
-	 */
-	bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
+	bh = ext4_read_dirblock(inode, 0, EITHER);
 	if (IS_ERR(bh))
 		return false;
@@ -3483,10 +3527,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
 	struct ext4_dir_entry_2 *de;
 	unsigned int offset;

-	/* The first directory block must not be a hole, so
-	 * treat it as DIRENT_HTREE
-	 */
-	bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
+	bh = ext4_read_dirblock(inode, 0, EITHER);
 	if (IS_ERR(bh)) {
 		*retval = PTR_ERR(bh);
 		return NULL;
...
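ext4_check_dx_root() validates the fixed preamble of the directory's block 0 before make_indexed_dir() trusts its rec_len fields; previously a crafted '..' rec_len could push de past the end of the block. For reference, the layout it checks is roughly this (abridged from the dx_root definition in fs/ext4/namei.c):

/* Abridged from fs/ext4/namei.c: the htree root preamble that
 * ext4_check_dx_root() validates before the directory is converted
 * to an indexed one. */
struct fake_dirent {
	__le32 inode;
	__le16 rec_len;
	u8 name_len;
	u8 file_type;
};

struct dx_root {
	struct fake_dirent dot;		/* "." entry, name_len must be 1 */
	char dot_name[4];
	struct fake_dirent dotdot;	/* ".." entry, name_len must be 2 */
	char dotdot_name[4];
	struct dx_root_info {
		__le32 reserved_zero;
		u8 hash_version;
		u8 info_length;		/* 8 */
		u8 indirect_levels;
		u8 unused_flags;
	} info;
	struct dx_entry entries[];
};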
@@ -1327,6 +1327,9 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_group_desc_free(sbi);
 	ext4_flex_groups_free(sbi);
+
+	WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) &&
+		     percpu_counter_sum(&sbi->s_dirtyclusters_counter));
 	ext4_percpu_param_destroy(sbi);
 #ifdef CONFIG_QUOTA
 	for (int i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -1457,7 +1460,8 @@ static void ext4_destroy_inode(struct inode *inode)
 		dump_stack();
 	}

-	if (EXT4_I(inode)->i_reserved_data_blocks)
+	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
+	    WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
 		ext4_msg(inode->i_sb, KERN_ERR,
 			 "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
 			 inode->i_ino, EXT4_I(inode),
...
@@ -1433,6 +1433,12 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
 			goto out;

 		memcpy(bh->b_data, buf, csize);
+		/*
+		 * Zero out block tail to avoid writing uninitialized memory
+		 * to disk.
+		 */
+		if (csize < blocksize)
+			memset(bh->b_data + csize, 0, blocksize - csize);
 		set_buffer_uptodate(bh);
 		ext4_handle_dirty_metadata(handle, ea_inode, bh);
...
@@ -353,7 +353,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	struct buffer_head *descriptor;
 	struct buffer_head **wbuf = journal->j_wbuf;
 	int bufs;
-	int flags;
+	int escape;
 	int err;
 	unsigned long long blocknr;
 	ktime_t start_time;
@@ -660,10 +660,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 */
 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 		JBUFFER_TRACE(jh, "ph3: write metadata");
-		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
+		escape = jbd2_journal_write_metadata_buffer(commit_transaction,
 						jh, &wbuf[bufs], blocknr);
-		if (flags < 0) {
-			jbd2_journal_abort(journal, flags);
+		if (escape < 0) {
+			jbd2_journal_abort(journal, escape);
 			continue;
 		}
 		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
@@ -672,7 +672,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		   buffer */
 		tag_flag = 0;
-		if (flags & 1)
+		if (escape)
 			tag_flag |= JBD2_FLAG_ESCAPE;
 		if (!first_tag)
 			tag_flag |= JBD2_FLAG_SAME_UUID;
@@ -766,7 +766,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (first_block < journal->j_tail)
 			freed += journal->j_last - journal->j_first;
 		/* Update tail only if we free significant amount of space */
-		if (freed < jbd2_journal_get_max_txn_bufs(journal))
+		if (freed < journal->j_max_transaction_buffers)
 			update_tail = 0;
 	}
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
@@ -1107,7 +1107,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction->t_state = T_COMMIT_CALLBACK;
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
-	journal->j_commit_sequence = commit_transaction->t_tid;
+	WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
 	journal->j_committing_transaction = NULL;
 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
...
@@ -220,19 +220,12 @@ static int kjournald2(void *arg)
 		 * so we don't sleep
 		 */
 		DEFINE_WAIT(wait);
-		int should_sleep = 1;

 		prepare_to_wait(&journal->j_wait_commit, &wait,
 				TASK_INTERRUPTIBLE);
-		if (journal->j_commit_sequence != journal->j_commit_request)
-			should_sleep = 0;
 		transaction = journal->j_running_transaction;
-		if (transaction && time_after_eq(jiffies,
-						transaction->t_expires))
-			should_sleep = 0;
-		if (journal->j_flags & JBD2_UNMOUNT)
-			should_sleep = 0;
-		if (should_sleep) {
+		if (transaction == NULL ||
+		    time_before(jiffies, transaction->t_expires)) {
 			write_unlock(&journal->j_state_lock);
 			schedule();
 			write_lock(&journal->j_state_lock);
@@ -316,11 +309,8 @@ static void journal_kill_thread(journal_t *journal)
  *
  * Return value:
  *  <0: Error
- * >=0: Finished OK
- *
- * On success:
- * Bit 0 set == escape performed on the data
- * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+ * =0: Finished OK without escape
+ * =1: Finished OK with escape
  */
 int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
@@ -328,7 +318,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 				  struct buffer_head **bh_out,
 				  sector_t blocknr)
 {
-	int need_copy_out = 0;
 	int done_copy_out = 0;
 	int do_escape = 0;
 	char *mapped_data;
@@ -355,7 +344,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	atomic_set(&new_bh->b_count, 1);

 	spin_lock(&jh_in->b_state_lock);
-repeat:
 	/*
 	 * If a new transaction has already done a buffer copy-out, then
 	 * we use that version of the data for the commit.
@@ -365,8 +353,8 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 		new_folio = virt_to_folio(jh_in->b_frozen_data);
 		new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
 	} else {
-		new_folio = jh2bh(jh_in)->b_folio;
-		new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
+		new_folio = bh_in->b_folio;
+		new_offset = offset_in_folio(new_folio, bh_in->b_data);
 	}

 	mapped_data = kmap_local_folio(new_folio, new_offset);
@@ -383,54 +371,52 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	/*
 	 * Check for escaping
 	 */
-	if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
-		need_copy_out = 1;
+	if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER))
 		do_escape = 1;
-	}
 	kunmap_local(mapped_data);

 	/*
 	 * Do we need to do a data copy?
 	 */
-	if (need_copy_out && !done_copy_out) {
+	if (do_escape && !done_copy_out) {
 		char *tmp;

 		spin_unlock(&jh_in->b_state_lock);
 		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
 		if (!tmp) {
 			brelse(new_bh);
+			free_buffer_head(new_bh);
 			return -ENOMEM;
 		}
 		spin_lock(&jh_in->b_state_lock);
 		if (jh_in->b_frozen_data) {
 			jbd2_free(tmp, bh_in->b_size);
-			goto repeat;
+			goto copy_done;
 		}

 		jh_in->b_frozen_data = tmp;
 		memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
-		new_folio = virt_to_folio(tmp);
-		new_offset = offset_in_folio(new_folio, tmp);
-		done_copy_out = 1;

 		/*
 		 * This isn't strictly necessary, as we're using frozen
 		 * data for the escaping, but it keeps consistency with
 		 * b_frozen_data usage.
 		 */
 		jh_in->b_frozen_triggers = jh_in->b_triggers;
+copy_done:
+		new_folio = virt_to_folio(jh_in->b_frozen_data);
+		new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
+		done_copy_out = 1;
 	}

 	/*
 	 * Did we need to do an escaping?  Now we've done all the
 	 * copying, we can finally do so.
+	 * b_frozen_data is from jbd2_alloc() which always provides an
+	 * address from the direct kernel mapping.
 	 */
-	if (do_escape) {
-		mapped_data = kmap_local_folio(new_folio, new_offset);
-		*((unsigned int *)mapped_data) = 0;
-		kunmap_local(mapped_data);
-	}
+	if (do_escape)
+		*((unsigned int *)jh_in->b_frozen_data) = 0;

 	folio_set_bh(new_bh, new_folio, new_offset);
 	new_bh->b_size = bh_in->b_size;
@@ -454,7 +440,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	set_buffer_shadow(bh_in);
 	spin_unlock(&jh_in->b_state_lock);

-	return do_escape | (done_copy_out << 1);
+	return do_escape;
 }
 /*
@@ -789,17 +775,7 @@ EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
 /* Return 1 when transaction with given tid has already committed. */
 int jbd2_transaction_committed(journal_t *journal, tid_t tid)
 {
-	int ret = 1;
-
-	read_lock(&journal->j_state_lock);
-	if (journal->j_running_transaction &&
-	    journal->j_running_transaction->t_tid == tid)
-		ret = 0;
-	if (journal->j_committing_transaction &&
-	    journal->j_committing_transaction->t_tid == tid)
-		ret = 0;
-	read_unlock(&journal->j_state_lock);
-	return ret;
+	return tid_geq(READ_ONCE(journal->j_commit_sequence), tid);
 }
 EXPORT_SYMBOL(jbd2_transaction_committed);
@@ -1451,6 +1427,48 @@ static int journal_revoke_records_per_block(journal_t *journal)
 	return space / record_size;
 }

+static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
+{
+	return (journal->j_total_len - journal->j_fc_wbufsize) / 3;
+}
+
+/*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+	int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+	int tags_per_block;
+
+	/* Subtract UUID */
+	tag_space -= 16;
+	if (jbd2_journal_has_csum_v2or3(journal))
+		tag_space -= sizeof(struct jbd2_journal_block_tail);
+	/* Commit code leaves a slack space of 16 bytes at the end of block */
+	tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+	/*
+	 * Revoke descriptors are accounted separately so we need to reserve
+	 * space for commit block and normal transaction descriptor blocks.
+	 */
+	return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal),
+				tags_per_block);
+}
+
+/*
+ * Initialize number of blocks each transaction reserves for its bookkeeping
+ * and maximum number of blocks a transaction can use. This needs to be called
+ * after the journal size and the fastcommit area size are initialized.
+ */
+static void jbd2_journal_init_transaction_limits(journal_t *journal)
+{
+	journal->j_revoke_records_per_block =
+		journal_revoke_records_per_block(journal);
+	journal->j_transaction_overhead_buffers =
+		jbd2_descriptor_blocks_per_trans(journal);
+	journal->j_max_transaction_buffers =
+		jbd2_journal_get_max_txn_bufs(journal);
+}
+
 /*
  * Load the on-disk journal superblock and read the key fields into the
  * journal_t.
@@ -1492,8 +1510,8 @@ static int journal_load_superblock(journal_t *journal)
 	if (jbd2_journal_has_csum_v2or3(journal))
 		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
 						   sizeof(sb->s_uuid));
-	journal->j_revoke_records_per_block =
-				journal_revoke_records_per_block(journal);
+	/* After journal features are set, we can compute transaction limits */
+	jbd2_journal_init_transaction_limits(journal);

 	if (jbd2_has_feature_fast_commit(journal)) {
 		journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
@@ -1599,7 +1617,6 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
 	journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
-	journal->j_shrinker->batch = journal->j_max_transaction_buffers;
 	journal->j_shrinker->private_data = journal;

 	shrinker_register(journal->j_shrinker);
@@ -1743,8 +1760,6 @@ static int journal_reset(journal_t *journal)
 	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
 	journal->j_commit_request = journal->j_commit_sequence;

-	journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
-
 	/*
 	 * Now that journal recovery is done, turn fast commits off here. This
 	 * way, if fast commit was enabled before the crash but if now FS has
@@ -2285,8 +2300,6 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
 	journal->j_fc_first = journal->j_last + 1;
 	journal->j_fc_off = 0;
 	journal->j_free = journal->j_last - journal->j_first;
-	journal->j_max_transaction_buffers =
-				jbd2_journal_get_max_txn_bufs(journal);

 	return 0;
 }
@@ -2374,8 +2387,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
 	sb->s_feature_ro_compat |= cpu_to_be32(ro);
 	sb->s_feature_incompat |= cpu_to_be32(incompat);
 	unlock_buffer(journal->j_sb_buffer);
-	journal->j_revoke_records_per_block =
-				journal_revoke_records_per_block(journal);
+	jbd2_journal_init_transaction_limits(journal);

 	return 1;
 #undef COMPAT_FEATURE_ON
@@ -2406,8 +2418,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
 	sb->s_feature_compat    &= ~cpu_to_be32(compat);
 	sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
 	sb->s_feature_incompat  &= ~cpu_to_be32(incompat);
-	journal->j_revoke_records_per_block =
-				journal_revoke_records_per_block(journal);
+	jbd2_journal_init_transaction_limits(journal);
 }
 EXPORT_SYMBOL(jbd2_journal_clear_features);
...
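Taken together with the WRITE_ONCE() in the commit path, the one-line jbd2_transaction_committed() above is the locking change behind the "up to 20%" async direct I/O numbers in the merge message: completion paths no longer serialize on j_state_lock just to ask whether a tid has committed, they do an ordered, lock-free read of a monotonically advancing counter. A stand-alone sketch of the pattern, using C11 atomics in place of READ_ONCE/WRITE_ONCE and hypothetical names:

#include <stdatomic.h>
#include <stdbool.h>

typedef unsigned int tid_t;
static _Atomic tid_t commit_sequence;	/* stands in for j_commit_sequence */

static inline int tid_geq(tid_t x, tid_t y)
{
	return (int)(x - y) >= 0;	/* wrap-safe, as in include/linux/jbd2.h */
}

/* Writer (commit path): publish the tid once the transaction is durable. */
static void publish_commit(tid_t tid)
{
	atomic_store_explicit(&commit_sequence, tid, memory_order_relaxed);
}

/* Reader (I/O completion path): no lock needed for a monotonic counter. */
static bool transaction_committed(tid_t tid)
{
	return tid_geq(atomic_load_explicit(&commit_sequence,
					    memory_order_relaxed), tid);
}

int main(void)
{
	publish_commit(42);
	return !transaction_committed(7);	/* exits 0: tid 7 has committed */
}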
@@ -19,6 +19,7 @@
 #include <linux/errno.h>
 #include <linux/crc32.h>
 #include <linux/blkdev.h>
+#include <linux/string_choices.h>
 #endif

 /*
@@ -374,7 +375,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 			be32_to_cpu(journal->j_superblock->s_sequence);
 		jbd2_debug(1,
 			  "JBD2: ignoring %d transaction%s from the journal.\n",
-			  dropped, (dropped == 1) ? "" : "s");
+			  dropped, str_plural(dropped));
 #endif
 	journal->j_transaction_sequence = ++info.end_transaction;
 	journal->j_head = info.head_block;
@@ -443,6 +444,27 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
 	return provided == cpu_to_be32(calculated);
 }

+static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf)
+{
+	struct commit_header *h;
+	__be32 provided;
+	__u32 calculated;
+	void *tmpbuf;
+
+	tmpbuf = kzalloc(j->j_blocksize, GFP_KERNEL);
+	if (!tmpbuf)
+		return false;
+
+	memcpy(tmpbuf, buf, sizeof(struct commit_header));
+	h = tmpbuf;
+	provided = h->h_chksum[0];
+	h->h_chksum[0] = 0;
+	calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize);
+	kfree(tmpbuf);
+
+	return provided == cpu_to_be32(calculated);
+}
+
 static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
 				      journal_block_tag3_t *tag3,
 				      void *buf, __u32 sequence)
@@ -810,6 +832,13 @@ static int do_one_pass(journal_t *journal,
 			if (pass == PASS_SCAN &&
 			    !jbd2_commit_block_csum_verify(journal,
 							   bh->b_data)) {
+				if (jbd2_commit_block_csum_verify_partial(
+								journal,
+								bh->b_data)) {
+					pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
+						  next_commit_ID, next_log_block);
+					goto chksum_ok;
+				}
 			chksum_error:
 				if (commit_time < last_trans_commit_time)
 					goto ignore_crc_mismatch;
@@ -824,6 +853,7 @@ static int do_one_pass(journal_t *journal,
 				}
 			}
 			if (pass == PASS_SCAN) {
+			chksum_ok:
 				last_trans_commit_time = commit_time;
 				head_block = next_log_block;
 			}
@@ -843,6 +873,7 @@ static int do_one_pass(journal_t *journal,
 					  next_log_block);
 				need_check_commit_time = true;
 			}
+
 			/* If we aren't in the REVOKE pass, then we can
 			 * just skip over this block. */
 			if (pass != PASS_REVOKE) {
...
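jbd2_commit_block_csum_verify_partial() recognizes a torn commit-block write: the commit block is written as a header followed by zeroes, and its checksum covers the whole journal block, so if a crash leaves only the leading sectors on disk, re-checksumming the surviving header against a zeroed tail reproduces the stored value; recovery then accepts the block as a commit block instead of failing the mount with a checksum error. A self-contained demonstration of the idea; the bitwise CRC32C below is a stand-in for jbd2_chksum(), and unlike jbd2 the stored checksum is kept outside the header for simplicity:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Plain bitwise CRC32C (Castagnoli), reflected polynomial 0x82F63B78. */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return ~crc;
}

int main(void)
{
	enum { BLOCK = 4096, HDR = 32 };	/* HDR stands in for sizeof(struct commit_header) */
	static uint8_t disk[BLOCK], check[BLOCK];

	/* "Write" a commit block: header plus zero tail, checksummed whole. */
	memset(disk, 0, BLOCK);
	memcpy(disk, "commit-header...", 16);
	uint32_t stored = crc32c(0, disk, BLOCK);

	/* Torn write: only the first 512 bytes reach disk, tail is junk. */
	memset(disk + 512, 0xab, BLOCK - 512);

	/* Full-block verification fails ... */
	printf("full block ok: %d\n", crc32c(0, disk, BLOCK) == stored);

	/* ... but header plus zeroed tail reproduces the stored checksum. */
	memset(check, 0, BLOCK);
	memcpy(check, disk, HDR);
	printf("partial ok:    %d\n", crc32c(0, check, BLOCK) == stored);
	return 0;
}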
@@ -62,28 +62,6 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
 	kmem_cache_free(transaction_cache, transaction);
 }

-/*
- * Base amount of descriptor blocks we reserve for each transaction.
- */
-static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
-{
-	int tag_space = journal->j_blocksize - sizeof(journal_header_t);
-	int tags_per_block;
-
-	/* Subtract UUID */
-	tag_space -= 16;
-	if (jbd2_journal_has_csum_v2or3(journal))
-		tag_space -= sizeof(struct jbd2_journal_block_tail);
-	/* Commit code leaves a slack space of 16 bytes at the end of block */
-	tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
-	/*
-	 * Revoke descriptors are accounted separately so we need to reserve
-	 * space for commit block and normal transaction descriptor blocks.
-	 */
-	return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
-				tags_per_block);
-}
-
 /*
  * jbd2_get_transaction: obtain a new transaction_t object.
  *
@@ -109,7 +87,7 @@ static void jbd2_get_transaction(journal_t *journal,
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	atomic_set(&transaction->t_updates, 0);
 	atomic_set(&transaction->t_outstanding_credits,
-		   jbd2_descriptor_blocks_per_trans(journal) +
+		   journal->j_transaction_overhead_buffers +
 		   atomic_read(&journal->j_reserved_credits));
 	atomic_set(&transaction->t_outstanding_revokes, 0);
 	atomic_set(&transaction->t_handle_count, 0);
@@ -213,6 +191,13 @@ static void sub_reserved_credits(journal_t *journal, int blocks)
 	wake_up(&journal->j_wait_reserved);
 }

+/* Maximum number of blocks for user transaction payload */
+static int jbd2_max_user_trans_buffers(journal_t *journal)
+{
+	return journal->j_max_transaction_buffers -
+	       journal->j_transaction_overhead_buffers;
+}
+
 /*
  * Wait until we can add credits for handle to the running transaction. Called
  * with j_state_lock held for reading. Returns 0 if handle joined the running
@@ -262,12 +247,12 @@ __must_hold(&journal->j_state_lock)
 	 * big to fit this handle? Wait until reserved credits are freed.
 	 */
 	if (atomic_read(&journal->j_reserved_credits) + total >
-	    journal->j_max_transaction_buffers) {
+	    jbd2_max_user_trans_buffers(journal)) {
 		read_unlock(&journal->j_state_lock);
 		jbd2_might_wait_for_commit(journal);
 		wait_event(journal->j_wait_reserved,
 			   atomic_read(&journal->j_reserved_credits) + total <=
-			   journal->j_max_transaction_buffers);
+			   jbd2_max_user_trans_buffers(journal));
 		__acquire(&journal->j_state_lock); /* fake out sparse */
 		return 1;
 	}
@@ -307,14 +292,14 @@ __must_hold(&journal->j_state_lock)
 	needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
 	/* We allow at most half of a transaction to be reserved */
-	if (needed > journal->j_max_transaction_buffers / 2) {
+	if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
 		sub_reserved_credits(journal, rsv_blocks);
 		atomic_sub(total, &t->t_outstanding_credits);
 		read_unlock(&journal->j_state_lock);
 		jbd2_might_wait_for_commit(journal);
 		wait_event(journal->j_wait_reserved,
 			   atomic_read(&journal->j_reserved_credits) + rsv_blocks
-			   <= journal->j_max_transaction_buffers / 2);
+			   <= jbd2_max_user_trans_buffers(journal) / 2);
 		__acquire(&journal->j_state_lock); /* fake out sparse */
 		return 1;
 	}
@@ -344,12 +329,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 	 * size and limit the number of total credits to not exceed maximum
 	 * transaction size per operation.
 	 */
-	if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
-	    (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
+	if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
+	    rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
 		printk(KERN_ERR "JBD2: %s wants too many credits "
 		       "credits:%d rsv_credits:%d max:%d\n",
 		       current->comm, blocks, rsv_blocks,
-		       journal->j_max_transaction_buffers);
+		       jbd2_max_user_trans_buffers(journal));
 		WARN_ON(1);
 		return -ENOSPC;
 	}
...
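These transaction.c changes consume the limits that fs/jbd2/journal.c now precomputes: j_max_transaction_buffers is the whole per-transaction budget, j_transaction_overhead_buffers the descriptor and commit-block overhead, and user credits are checked against the difference, while the budget itself grows from a quarter to a third of the journal. Back-of-the-envelope numbers under stated assumptions (4KiB journal blocks, a 32768-block journal, no fast-commit area, 16-byte block tags and a 4-byte block tail as with csum v3; real values depend on enabled features):

#include <stdio.h>

int main(void)
{
	int total_len = 32768, fc_wbufsize = 0, blocksize = 4096;
	int header = 12, uuid = 16, tail = 4, slack = 16, tag = 16;

	int max_txn = (total_len - fc_wbufsize) / 3;	/* was "/ 4" */
	int tags_per_block = (blocksize - header - uuid - tail - slack) / tag;
	int overhead = 1 /* commit block */
		+ (max_txn + tags_per_block - 1) / tags_per_block;

	printf("max transaction buffers: %d\n", max_txn);		/* 10922 */
	printf("descriptor overhead:     %d\n", overhead);		/* 1 + 44 = 45 */
	printf("max user payload:        %d\n", max_txn - overhead);	/* 10877 */
	return 0;
}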
@@ -1085,6 +1085,13 @@ struct journal_s
 	 */
 	int			j_revoke_records_per_block;

+	/**
+	 * @j_transaction_overhead_buffers:
+	 *
+	 * Number of blocks each transaction needs for its own bookkeeping
+	 */
+	int			j_transaction_overhead_buffers;
+
 	/**
 	 * @j_commit_interval:
 	 *
@@ -1660,11 +1667,6 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
 int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
 int jbd2_fc_release_bufs(journal_t *journal);

-static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
-{
-	return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
-}
-
 /*
  * is_journal_abort
  *
...
@@ -1246,14 +1246,15 @@ TRACE_EVENT(ext4_da_update_reserve_space,
 );

 TRACE_EVENT(ext4_da_reserve_space,
-	TP_PROTO(struct inode *inode),
+	TP_PROTO(struct inode *inode, int nr_resv),

-	TP_ARGS(inode),
+	TP_ARGS(inode, nr_resv),

 	TP_STRUCT__entry(
 		__field( dev_t,	dev )
 		__field( ino_t,	ino )
 		__field( __u64,	i_blocks )
+		__field( int,	reserve_blocks )
 		__field( int,	reserved_data_blocks )
 		__field( __u16,  mode )
 	),
@@ -1262,16 +1263,17 @@ TRACE_EVENT(ext4_da_reserve_space,
 		__entry->dev = inode->i_sb->s_dev;
 		__entry->ino = inode->i_ino;
 		__entry->i_blocks = inode->i_blocks;
+		__entry->reserve_blocks = nr_resv;
 		__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
 		__entry->mode = inode->i_mode;
 	),

-	TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
+	TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d"
 		  "reserved_data_blocks %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->mode, __entry->i_blocks,
-		  __entry->reserved_data_blocks)
+		  __entry->reserve_blocks, __entry->reserved_data_blocks)
 );

 TRACE_EVENT(ext4_da_release_space,
@@ -2478,11 +2480,11 @@ TRACE_EVENT(ext4_es_shrink,
 		  __entry->scan_time, __entry->nr_skipped, __entry->retried)
 );

-TRACE_EVENT(ext4_es_insert_delayed_block,
+TRACE_EVENT(ext4_es_insert_delayed_extent,

 	TP_PROTO(struct inode *inode, struct extent_status *es,
-		 bool allocated),
+		 bool lclu_allocated, bool end_allocated),

-	TP_ARGS(inode, es, allocated),
+	TP_ARGS(inode, es, lclu_allocated, end_allocated),

 	TP_STRUCT__entry(
 		__field( dev_t,	dev )
@@ -2491,7 +2493,8 @@ TRACE_EVENT(ext4_es_insert_delayed_block,
 		__field( ext4_lblk_t,	len )
 		__field( ext4_fsblk_t,	pblk )
 		__field( char, status )
-		__field( bool,	allocated )
+		__field( bool,	lclu_allocated )
+		__field( bool,	end_allocated )
 	),

 	TP_fast_assign(
@@ -2501,16 +2504,17 @@ TRACE_EVENT(ext4_es_insert_delayed_block,
 		__entry->len = es->es_len;
 		__entry->pblk = ext4_es_show_pblock(es);
 		__entry->status = ext4_es_status(es);
-		__entry->allocated = allocated;
+		__entry->lclu_allocated = lclu_allocated;
+		__entry->end_allocated = end_allocated;
 	),

 	TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
-		  "allocated %d",
+		  "allocated %d %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino,
 		  __entry->lblk, __entry->len,
 		  __entry->pblk, show_extent_status(__entry->status),
-		  __entry->allocated)
+		  __entry->lclu_allocated, __entry->end_allocated)
 );

 /* fsmap traces */
...