Commit 296c355c authored by Theodore Ts'o's avatar Theodore Ts'o

ext4: Use tracepoints for mb_history trace file

The /proc/fs/ext4/<dev>/mb_history was maintained manually, and had a
number of problems: it required a largish amount of memory to be
allocated for each ext4 filesystem, and the s_mb_history_lock
introduced a CPU contention problem.  

By ripping out the mb_history code and replacing it with ftrace
tracepoints, and we get more functionality: timestamps, event
filtering, the ability to correlate mballoc history with other ext4
tracepoints, etc.
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent 90576c0b
...@@ -1113,7 +1113,6 @@ Table 1-12: Files in /proc/fs/ext4/<devname> ...@@ -1113,7 +1113,6 @@ Table 1-12: Files in /proc/fs/ext4/<devname>
.............................................................................. ..............................................................................
File Content File Content
mb_groups details of multiblock allocator buddy cache of free blocks mb_groups details of multiblock allocator buddy cache of free blocks
mb_history multiblock allocation history
.............................................................................. ..............................................................................
......
...@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t; ...@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */ /* data type for block group number */
typedef unsigned int ext4_group_t; typedef unsigned int ext4_group_t;
/*
* Flags used in mballoc's allocation_context flags field.
*
* Also used to show what's going on for debugging purposes when the
* flag field is exported via the traceport interface
*/
/* prefer goal again. length */ /* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001 #define EXT4_MB_HINT_MERGE 0x0001
...@@ -971,14 +977,6 @@ struct ext4_sb_info { ...@@ -971,14 +977,6 @@ struct ext4_sb_info {
unsigned long s_mb_last_group; unsigned long s_mb_last_group;
unsigned long s_mb_last_start; unsigned long s_mb_last_start;
/* history to debug policy */
struct ext4_mb_history *s_mb_history;
int s_mb_history_cur;
int s_mb_history_max;
int s_mb_history_num;
spinlock_t s_mb_history_lock;
int s_mb_history_filter;
/* stats for buddy allocator */ /* stats for buddy allocator */
spinlock_t s_mb_pa_lock; spinlock_t s_mb_pa_lock;
atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
......
This diff is collapsed.
...@@ -52,18 +52,8 @@ extern u8 mb_enable_debug; ...@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
#define mb_debug(n, fmt, a...) #define mb_debug(n, fmt, a...)
#endif #endif
/*
* with EXT4_MB_HISTORY mballoc stores last N allocations in memory
* and you can monitor it in /proc/fs/ext4/<dev>/mb_history
*/
#define EXT4_MB_HISTORY
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
#define EXT4_MB_HISTORY_FREE 8 /* free */
#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
EXT4_MB_HISTORY_PREALLOC)
/* /*
* How long mballoc can look for a best extent (in found extents) * How long mballoc can look for a best extent (in found extents)
...@@ -217,22 +207,6 @@ struct ext4_allocation_context { ...@@ -217,22 +207,6 @@ struct ext4_allocation_context {
#define AC_STATUS_FOUND 2 #define AC_STATUS_FOUND 2
#define AC_STATUS_BREAK 3 #define AC_STATUS_BREAK 3
struct ext4_mb_history {
struct ext4_free_extent orig; /* orig allocation */
struct ext4_free_extent goal; /* goal allocation */
struct ext4_free_extent result; /* result allocation */
unsigned pid;
unsigned ino;
__u16 found; /* how many extents have been found */
__u16 groups; /* how many groups have been scanned */
__u16 tail; /* what tail broke some buddy */
__u16 buddy; /* buddy the tail ^^^ broke */
__u16 flags;
__u8 cr:3; /* which phase the result extent was found at */
__u8 op:4;
__u8 merged:1;
};
struct ext4_buddy { struct ext4_buddy {
struct page *bd_buddy_page; struct page *bd_buddy_page;
void *bd_buddy; void *bd_buddy;
...@@ -247,13 +221,6 @@ struct ext4_buddy { ...@@ -247,13 +221,6 @@ struct ext4_buddy {
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
#ifndef EXT4_MB_HISTORY
static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
{
return;
}
#endif
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
......
...@@ -50,13 +50,6 @@ ...@@ -50,13 +50,6 @@
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include <trace/events/ext4.h> #include <trace/events/ext4.h>
static int default_mb_history_length = 1000;
module_param_named(default_mb_history_length, default_mb_history_length,
int, 0644);
MODULE_PARM_DESC(default_mb_history_length,
"Default number of entries saved for mb_history");
struct proc_dir_entry *ext4_proc_root; struct proc_dir_entry *ext4_proc_root;
static struct kset *ext4_kset; static struct kset *ext4_kset;
...@@ -1079,7 +1072,7 @@ enum { ...@@ -1079,7 +1072,7 @@ enum {
Opt_journal_update, Opt_journal_dev, Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
...@@ -1126,7 +1119,6 @@ static const match_table_t tokens = { ...@@ -1126,7 +1119,6 @@ static const match_table_t tokens = {
{Opt_data_writeback, "data=writeback"}, {Opt_data_writeback, "data=writeback"},
{Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_abort, "data_err=abort"},
{Opt_data_err_ignore, "data_err=ignore"}, {Opt_data_err_ignore, "data_err=ignore"},
{Opt_mb_history_length, "mb_history_length=%u"},
{Opt_offusrjquota, "usrjquota="}, {Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"}, {Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="}, {Opt_offgrpjquota, "grpjquota="},
...@@ -1367,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb, ...@@ -1367,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_data_err_ignore: case Opt_data_err_ignore:
clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
break; break;
case Opt_mb_history_length:
if (match_int(&args[0], &option))
return 0;
if (option < 0)
return 0;
sbi->s_mb_history_max = option;
break;
#ifdef CONFIG_QUOTA #ifdef CONFIG_QUOTA
case Opt_usrjquota: case Opt_usrjquota:
qtype = USRQUOTA; qtype = USRQUOTA;
...@@ -2435,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -2435,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
sbi->s_mb_history_max = default_mb_history_length;
set_opt(sbi->s_mount_opt, BARRIER); set_opt(sbi->s_mount_opt, BARRIER);
......
...@@ -743,6 +743,169 @@ TRACE_EVENT(ext4_alloc_da_blocks, ...@@ -743,6 +743,169 @@ TRACE_EVENT(ext4_alloc_da_blocks,
__entry->data_blocks, __entry->meta_blocks) __entry->data_blocks, __entry->meta_blocks)
); );
TRACE_EVENT(ext4_mballoc_alloc,
TP_PROTO(struct ext4_allocation_context *ac),
TP_ARGS(ac),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u16, found )
__field( __u16, groups )
__field( __u16, buddy )
__field( __u16, flags )
__field( __u16, tail )
__field( __u8, cr )
__field( __u32, orig_logical )
__field( int, orig_start )
__field( __u32, orig_group )
__field( int, orig_len )
__field( __u32, goal_logical )
__field( int, goal_start )
__field( __u32, goal_group )
__field( int, goal_len )
__field( __u32, result_logical )
__field( int, result_start )
__field( __u32, result_group )
__field( int, result_len )
),
TP_fast_assign(
__entry->dev = ac->ac_inode->i_sb->s_dev;
__entry->ino = ac->ac_inode->i_ino;
__entry->found = ac->ac_found;
__entry->flags = ac->ac_flags;
__entry->groups = ac->ac_groups_scanned;
__entry->buddy = ac->ac_buddy;
__entry->tail = ac->ac_tail;
__entry->cr = ac->ac_criteria;
__entry->orig_logical = ac->ac_o_ex.fe_logical;
__entry->orig_start = ac->ac_o_ex.fe_start;
__entry->orig_group = ac->ac_o_ex.fe_group;
__entry->orig_len = ac->ac_o_ex.fe_len;
__entry->goal_logical = ac->ac_g_ex.fe_logical;
__entry->goal_start = ac->ac_g_ex.fe_start;
__entry->goal_group = ac->ac_g_ex.fe_group;
__entry->goal_len = ac->ac_g_ex.fe_len;
__entry->result_logical = ac->ac_f_ex.fe_logical;
__entry->result_start = ac->ac_f_ex.fe_start;
__entry->result_group = ac->ac_f_ex.fe_group;
__entry->result_len = ac->ac_f_ex.fe_len;
),
TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
"result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
"tail %u broken %u",
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
__entry->orig_group, __entry->orig_start,
__entry->orig_len, __entry->orig_logical,
__entry->goal_group, __entry->goal_start,
__entry->goal_len, __entry->goal_logical,
__entry->result_group, __entry->result_start,
__entry->result_len, __entry->result_logical,
__entry->found, __entry->groups, __entry->cr,
__entry->flags, __entry->tail,
__entry->buddy ? 1 << __entry->buddy : 0)
);
TRACE_EVENT(ext4_mballoc_prealloc,
TP_PROTO(struct ext4_allocation_context *ac),
TP_ARGS(ac),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u32, orig_logical )
__field( int, orig_start )
__field( __u32, orig_group )
__field( int, orig_len )
__field( __u32, result_logical )
__field( int, result_start )
__field( __u32, result_group )
__field( int, result_len )
),
TP_fast_assign(
__entry->dev = ac->ac_inode->i_sb->s_dev;
__entry->ino = ac->ac_inode->i_ino;
__entry->orig_logical = ac->ac_o_ex.fe_logical;
__entry->orig_start = ac->ac_o_ex.fe_start;
__entry->orig_group = ac->ac_o_ex.fe_group;
__entry->orig_len = ac->ac_o_ex.fe_len;
__entry->result_logical = ac->ac_b_ex.fe_logical;
__entry->result_start = ac->ac_b_ex.fe_start;
__entry->result_group = ac->ac_b_ex.fe_group;
__entry->result_len = ac->ac_b_ex.fe_len;
),
TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
__entry->orig_group, __entry->orig_start,
__entry->orig_len, __entry->orig_logical,
__entry->result_group, __entry->result_start,
__entry->result_len, __entry->result_logical)
);
TRACE_EVENT(ext4_mballoc_discard,
TP_PROTO(struct ext4_allocation_context *ac),
TP_ARGS(ac),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u32, result_logical )
__field( int, result_start )
__field( __u32, result_group )
__field( int, result_len )
),
TP_fast_assign(
__entry->dev = ac->ac_inode->i_sb->s_dev;
__entry->ino = ac->ac_inode->i_ino;
__entry->result_logical = ac->ac_b_ex.fe_logical;
__entry->result_start = ac->ac_b_ex.fe_start;
__entry->result_group = ac->ac_b_ex.fe_group;
__entry->result_len = ac->ac_b_ex.fe_len;
),
TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
__entry->result_group, __entry->result_start,
__entry->result_len, __entry->result_logical)
);
TRACE_EVENT(ext4_mballoc_free,
TP_PROTO(struct ext4_allocation_context *ac),
TP_ARGS(ac),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u32, result_logical )
__field( int, result_start )
__field( __u32, result_group )
__field( int, result_len )
),
TP_fast_assign(
__entry->dev = ac->ac_inode->i_sb->s_dev;
__entry->ino = ac->ac_inode->i_ino;
__entry->result_logical = ac->ac_b_ex.fe_logical;
__entry->result_start = ac->ac_b_ex.fe_start;
__entry->result_group = ac->ac_b_ex.fe_group;
__entry->result_len = ac->ac_b_ex.fe_len;
),
TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
__entry->result_group, __entry->result_start,
__entry->result_len, __entry->result_logical)
);
#endif /* _TRACE_EXT4_H */ #endif /* _TRACE_EXT4_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment