Commit 93d33a48 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] laptop mode

From: Bart Samwel <bart@samwel.tk>

Adds /proc/sys/vm/laptop_mode: a special knob which says "this is a laptop".
In this mode the kernel will attempt to avoid spinning disks up.

Algorithm: the idea is to hold dirty data in memory for a long time, but to
flush everything which has been accumulated if the disk happens to spin up
for other reasons.

- Whenever a disk request completes (read or write), schedule a timer a few
  seconds hence.  If the timer was already pending, reset it to a few seconds
  hence.

- When the timer expires, write back the whole world.  We use
  sync_filesystems() for this because it will force ext3 journal commits as
  well.

- In balance_dirty_pages(), kick off background writeback when we hit the
  high threshold (dirty_ratio), not when we hit the low threshold.  This has
  the effect of causing "lumpy" writeback which is something I spent a year
  fixing, but in laptop mode, it is desirable.

- In try_to_free_pages(), only kick pdflush if the VM is getting into
  distress: we want to keep scanning for clean pages, deferring writeback.

- In page reclaim, avoid writing back the odd random dirty page off the
  LRU: only start I/O if the scanning is working harder.

The effect is to perform a sync() a few seconds after all I/O has ceased.

The value which was written into /proc/sys/vm/laptop_mode determines, in
seconds, the delay between the final I/O and the flush.
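
For example, writing a 5 into the new knob arms a five-second flush delay.
A minimal userspace sketch (a hypothetical helper, not part of this patch;
error handling elided):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/vm/laptop_mode", "w");

            if (f) {
                    fprintf(f, "5\n"); /* sync fires 5s after the last I/O */
                    fclose(f);
            }
            return 0;
    }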

Additionally, the patch adds tools which help answer the question "why the
heck does my disk spin up all the time?".  The user may set
/proc/sys/vm/block_dump to a non-zero value and the kernel will print out
information which will identify the process which is performing disk reads or
which is dirtying pagecache.

The user should probably disable syslogd before setting block_dump.
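
With block_dump enabled (e.g. by writing a 1 to /proc/sys/vm/block_dump),
the kernel log will contain lines like the following, a hypothetical sample
whose format is taken from the printk calls in this patch:

    bash(2276): READ block 102648 on hda3
    bash(2276): dirtied file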
Parent: 77fe0a19
@@ -27,6 +27,7 @@
 #include <linux/completion.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
+#include <linux/writeback.h>
 
 /*
  * for max sense size
@@ -2471,6 +2472,16 @@ int submit_bio(int rw, struct bio *bio)
                mod_page_state(pgpgout, count);
        else
                mod_page_state(pgpgin, count);
+
+       if (unlikely(block_dump)) {
+               char b[BDEVNAME_SIZE];
+               printk("%s(%d): %s block %Lu on %s\n",
+                       current->comm, current->pid,
+                       (rw & WRITE) ? "WRITE" : "READ",
+                       (unsigned long long)bio->bi_sector,
+                       bdevname(bio->bi_bdev, b));
+       }
+
        generic_make_request(bio);
        return 1;
 }
@@ -2754,6 +2765,9 @@ void end_that_request_last(struct request *req)
        struct gendisk *disk = req->rq_disk;
        struct completion *waiting = req->waiting;
 
+       if (unlikely(laptop_mode))
+               laptop_io_completion();
+
        if (disk && blk_fs_request(req)) {
                unsigned long duration = jiffies - req->start_time;
                switch (rq_data_dir(req)) {
@@ -274,6 +274,8 @@ static void do_sync(unsigned long wait)
        sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
        if (!wait)
                printk("Emergency Sync complete\n");
+       if (unlikely(laptop_mode))
+               laptop_sync_completion();
 }
 
 asmlinkage long sys_sync(void)
@@ -75,6 +75,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
        if ((inode->i_state & flags) == flags)
                return;
 
+       if (unlikely(block_dump))
+               printk("%s(%d): dirtied file\n", current->comm, current->pid);
+
        spin_lock(&inode_lock);
        if ((inode->i_state & flags) != flags) {
                const int was_dirty = inode->i_state & I_DIRTY;
@@ -159,6 +159,8 @@ enum
        VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
        VM_MIN_FREE_KBYTES=21,  /* Minimum free kilobytes to maintain */
        VM_MAX_MAP_COUNT=22,    /* int: Maximum number of mmaps/address-space */
+       VM_LAPTOP_MODE=23,      /* vm laptop mode */
+       VM_BLOCK_DUMP=24,       /* block dump mode */
 };
@@ -72,12 +72,16 @@ static inline void wait_on_inode(struct inode *inode)
  * mm/page-writeback.c
  */
 int wakeup_bdflush(long nr_pages);
+void laptop_io_completion(void);
+void laptop_sync_completion(void);
 
-/* These 5 are exported to sysctl. */
+/* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
 extern int dirty_writeback_centisecs;
 extern int dirty_expire_centisecs;
+extern int block_dump;
+extern int laptop_mode;
 
 struct ctl_table;
 struct file;
@@ -744,6 +744,26 @@ static ctl_table vm_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+       {
+               .ctl_name       = VM_LAPTOP_MODE,
+               .procname       = "laptop_mode",
+               .data           = &laptop_mode,
+               .maxlen         = sizeof(laptop_mode),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+       },
+       {
+               .ctl_name       = VM_BLOCK_DUMP,
+               .procname       = "block_dump",
+               .data           = &block_dump,
+               .maxlen         = sizeof(block_dump),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+       },
        { .ctl_name = 0 }
 };
@@ -28,6 +28,7 @@
 #include <linux/smp.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/syscalls.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -81,6 +82,16 @@ int dirty_writeback_centisecs = 5 * 100;
  */
 int dirty_expire_centisecs = 30 * 100;
 
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode".
+ */
+int laptop_mode;
+
 /* End of sysctl-exported parameters */
@@ -195,7 +206,19 @@ static void balance_dirty_pages(struct address_space *mapping)
        if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
                dirty_exceeded = 0;
 
-       if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
+       if (writeback_in_progress(bdi))
+               return;         /* pdflush is already working this queue */
+
+       /*
+        * In laptop mode, we wait until hitting the higher threshold before
+        * starting background writeout, and then write out all the way down
+        * to the lower threshold.  So slow writers cause minimal disk activity.
+        *
+        * In normal mode, we start background writeout at the lower
+        * background_thresh, to keep the amount of dirty memory low.
+        */
+       if ((laptop_mode && pages_written) ||
+           (!laptop_mode && (nr_reclaimable > background_thresh)))
                pdflush_operation(background_writeout, 0);
 }
@@ -289,7 +312,13 @@ int wakeup_bdflush(long nr_pages)
        return pdflush_operation(background_writeout, nr_pages);
 }
 
-static struct timer_list wb_timer;
+static void wb_timer_fn(unsigned long unused);
+static void laptop_timer_fn(unsigned long unused);
+
+static struct timer_list wb_timer =
+                       TIMER_INITIALIZER(wb_timer_fn, 0, 0);
+static struct timer_list laptop_mode_wb_timer =
+                       TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
 
 /*
  * Periodic writeback of "old" data.
@@ -368,7 +397,36 @@ static void wb_timer_fn(unsigned long unused)
 {
        if (pdflush_operation(wb_kupdate, 0) < 0)
                mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
 }
 
+static void laptop_flush(unsigned long unused)
+{
+       sys_sync();
+}
+
+static void laptop_timer_fn(unsigned long unused)
+{
+       pdflush_operation(laptop_flush, 0);
+}
+
+/*
+ * We've spun up the disk and we're in laptop mode: schedule writeback
+ * of all dirty data a few seconds from now.  If the flush is already scheduled
+ * then push it back - the user is still using the disk.
+ */
+void laptop_io_completion(void)
+{
+       mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+}
+
+/*
+ * We're in laptop mode and we've just synced. The sync's writes will have
+ * caused another writeback to be scheduled by laptop_io_completion.
+ * Nothing needs to be written back anymore, so we unschedule the writeback.
+ */
+void laptop_sync_completion(void)
+{
+       del_timer(&laptop_mode_wb_timer);
+}
+
 /*
@@ -429,12 +487,7 @@ void __init page_writeback_init(void)
                vm_dirty_ratio *= correction;
                vm_dirty_ratio /= 100;
        }
-
-       init_timer(&wb_timer);
-       wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
-       wb_timer.data = 0;
-       wb_timer.function = wb_timer_fn;
-       add_timer(&wb_timer);
+       mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
        set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
 }
@@ -246,7 +246,8 @@ static void handle_write_error(struct address_space *mapping,
  * shrink_list returns the number of reclaimed pages
  */
 static int
-shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
+shrink_list(struct list_head *page_list, unsigned int gfp_mask,
+               int *nr_scanned, int do_writepage)
 {
        struct address_space *mapping;
        LIST_HEAD(ret_pages);
@@ -354,6 +355,8 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
                        goto keep_locked;
                if (!may_write_to_queue(mapping->backing_dev_info))
                        goto keep_locked;
+               if (laptop_mode && !do_writepage)
+                       goto keep_locked;
                if (clear_page_dirty_for_io(page)) {
                        int res;
                        struct writeback_control wbc = {
@@ -473,7 +476,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
  */
 static int
 shrink_cache(struct zone *zone, unsigned int gfp_mask,
-               int max_scan, int *total_scanned)
+               int max_scan, int *total_scanned, int do_writepage)
 {
        LIST_HEAD(page_list);
        struct pagevec pvec;
@@ -521,7 +524,8 @@ shrink_cache(struct zone *zone, unsigned int gfp_mask,
                        mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
                else
                        mod_page_state_zone(zone, pgscan_direct, nr_scan);
-               nr_freed = shrink_list(&page_list, gfp_mask, total_scanned);
+               nr_freed = shrink_list(&page_list, gfp_mask,
+                                       total_scanned, do_writepage);
                *total_scanned += nr_taken;
                if (current_is_kswapd())
                        mod_page_state(kswapd_steal, nr_freed);
@@ -735,7 +739,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
  */
 static int
 shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
-               int *total_scanned, struct page_state *ps)
+               int *total_scanned, struct page_state *ps, int do_writepage)
 {
        unsigned long ratio;
        int count;
@@ -764,7 +768,8 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
        count = atomic_read(&zone->nr_scan_inactive);
        if (count >= SWAP_CLUSTER_MAX) {
                atomic_set(&zone->nr_scan_inactive, 0);
-               return shrink_cache(zone, gfp_mask, count, total_scanned);
+               return shrink_cache(zone, gfp_mask, count,
+                                       total_scanned, do_writepage);
        }
        return 0;
 }
@@ -787,7 +792,7 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
  */
 static int
 shrink_caches(struct zone **zones, int priority, int *total_scanned,
-               int gfp_mask, struct page_state *ps)
+               int gfp_mask, struct page_state *ps, int do_writepage)
 {
        int ret = 0;
        int i;
@@ -803,7 +808,8 @@ shrink_caches(struct zone **zones, int priority, int *total_scanned,
                        continue;       /* Let kswapd poll it */
 
                max_scan = zone->nr_inactive >> priority;
-               ret += shrink_zone(zone, max_scan, gfp_mask, total_scanned, ps);
+               ret += shrink_zone(zone, max_scan, gfp_mask,
+                                       total_scanned, ps, do_writepage);
        }
        return ret;
 }
@@ -833,6 +839,8 @@ int try_to_free_pages(struct zone **zones,
        int nr_reclaimed = 0;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        int i;
+       unsigned long total_scanned = 0;
+       int do_writepage = 0;
 
        inc_page_state(allocstall);
@@ -840,13 +848,13 @@ int try_to_free_pages(struct zone **zones,
                zones[i]->temp_priority = DEF_PRIORITY;
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-               int total_scanned = 0;
+               int scanned = 0;
                struct page_state ps;
 
                get_page_state(&ps);
-               nr_reclaimed += shrink_caches(zones, priority, &total_scanned,
-                                               gfp_mask, &ps);
-               shrink_slab(total_scanned, gfp_mask);
+               nr_reclaimed += shrink_caches(zones, priority, &scanned,
+                                               gfp_mask, &ps, do_writepage);
+               shrink_slab(scanned, gfp_mask);
                if (reclaim_state) {
                        nr_reclaimed += reclaim_state->reclaimed_slab;
                        reclaim_state->reclaimed_slab = 0;
@@ -858,14 +866,20 @@ int try_to_free_pages(struct zone **zones,
                if (!(gfp_mask & __GFP_FS))
                        break;          /* Let the caller handle it */
                /*
-                * Try to write back as many pages as we just scanned. Not
-                * sure if that makes sense, but it's an attempt to avoid
-                * creating IO storms unnecessarily
+                * Try to write back as many pages as we just scanned. This
+                * tends to cause slow streaming writers to write data to the
+                * disk smoothly, at the dirtying rate, which is nice. But
+                * that's undesirable in laptop mode, where we *want* lumpy
+                * writeout. So in laptop mode, write out the whole world.
                 */
-               wakeup_bdflush(total_scanned);
+               total_scanned += scanned;
+               if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
+                       wakeup_bdflush(laptop_mode ? 0 : total_scanned);
+                       do_writepage = 1;
+               }
 
                /* Take a nap, wait for some writeback to complete */
-               if (total_scanned && priority < DEF_PRIORITY - 2)
+               if (scanned && priority < DEF_PRIORITY - 2)
                        blk_congestion_wait(WRITE, HZ/10);
        }
@@ -908,6 +922,8 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
        int i;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long total_scanned = 0;
+       unsigned long total_reclaimed = 0;
+       int do_writepage = 0;
 
        inc_page_state(pageoutrun);
@@ -969,16 +985,25 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
                        zone->temp_priority = priority;
                        max_scan = zone->nr_inactive >> priority;
                        reclaimed = shrink_zone(zone, max_scan, GFP_KERNEL,
-                                       &scanned, ps);
+                                       &scanned, ps, do_writepage);
                        total_scanned += scanned;
                        reclaim_state->reclaimed_slab = 0;
                        shrink_slab(scanned, GFP_KERNEL);
                        reclaimed += reclaim_state->reclaimed_slab;
+                       total_reclaimed += reclaimed;
                        to_free -= reclaimed;
                        if (zone->all_unreclaimable)
                                continue;
                        if (zone->pages_scanned > zone->present_pages * 2)
                                zone->all_unreclaimable = 1;
+                       /*
+                        * If we've done a decent amount of scanning and
+                        * the reclaim ratio is low, start doing writepage
+                        * even in laptop mode
+                        */
+                       if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+                           total_scanned > total_reclaimed+total_reclaimed/2)
+                               do_writepage = 1;
                }
                if (nr_pages && to_free > 0)
                        continue;       /* swsusp: need to do more work */
@@ -997,7 +1022,7 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
                zone->prev_priority = zone->temp_priority;
        }
 
-       return nr_pages - to_free;
+       return total_reclaimed;
 }
 
 /*