Commit e182d612 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] buffer_head takedown for big highmem machines

This patch addresses the excessive consumption of ZONE_NORMAL by
buffer_heads on highmem machines.  The algorithms which decide which
buffers to shoot down are fairly dumb, but they only cut in on machines
with large highmem:lowmem ratios and the code footprint is tiny.

The buffer.c change implements the buffer_head accounting - it sets the
upper limit on buffer_head memory occupancy to 10% of ZONE_NORMAL.
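
As a very rough worked example (the exact numbers depend on PAGE_SIZE and on
sizeof(struct buffer_head), which is on the order of a hundred bytes, so
figure ~40 bh's per page):

	nrpages = (nr_free_buffer_pages() * 10) / 100;
		/* e.g. ~880MB of lowmem -> ~22,500 pages allowed      */
	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
		/* ~22,500 pages * ~40 bh's -> a cap of ~900,000 bh's  */

So once the machine is carrying more than about 10% of ZONE_NORMAL's worth of
buffer_heads, buffer_heads_over_limit goes nonzero and the stripping described
below starts.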

A possible side-effect of this change is that the kernel will perform
more calls to get_block() to map pages to disk.  This will only be
observed when a file is being repeatedly overwritten - this is the only
case in which the "cached get_block result" in the buffers is useful.

I did quite some testing of this back in the delalloc ext2 days, and
was not able to come up with a test in which the cached get_block
result was measurably useful.  That's for ext2, which has a fast
get_block().

A desirable side effect of this patch is that the kernel will be able
to cache much more blockdev pagecache in ZONE_NORMAL, so there are more
ext2/3 indirect blocks in cache, so with some workloads, less I/O will
be performed.

In mpage_writepage(): if the number of buffer_heads is excessive then
buffers are stripped from pages as they are submitted for writeback.
This change is only useful for filesystems which are using the mpage
code.  That's ext2 and ext3-writeback and JFS.  An mpage patch for
reiserfs was floating about but seems to have got lost.
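
For reference, the writeback-side hook is tiny - it is just the
mpage_writepage() hunk below, reproduced here with explanatory comments:

	/*
	 * The buffers' dirty bits have just been cleared (the I/O itself
	 * is issued against the whole page via the bio).  If bh's are
	 * over the limit, try to detach and free this page's buffers now;
	 * try_to_free_buffers() simply fails if any of them are still
	 * dirty or locked.
	 */
	if (buffer_heads_over_limit)
		try_to_free_buffers(page);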

There is no need to strip buffers for reads because the mpage code does
not attach buffers for reads.

These are perhaps not the most appropriate buffer_heads to toss away.
Perhaps something smarter should be done to detect file overwriting, or
to toss the 'oldest' buffer_heads first.

In refill_inactive(): if the number of buffer_heads is excessive then
strip buffers from pages as they move onto the inactive list.  This
change is useful for all filesystems.  This approach is good because
pages which are being repeatedly overwritten will remain on the active
list and will retain their buffers, whereas pages which are not being
overwritten will be stripped.
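
The VM-side hook (the refill_inactive_zone() hunk below) boils down to the
following; the comments are mine:

	if (buffer_heads_over_limit) {
		/* don't hold the LRU spinlock while poking at the pages */
		spin_unlock_irq(&zone->lru_lock);
		/* trylocks each page, then runs try_to_release_page(page, 0) */
		pagevec_strip(&pvec);
		spin_lock_irq(&zone->lru_lock);
	}

pagevec_strip() skips any page it cannot trylock, so it never blocks on a
page which is busy elsewhere.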
parent ce92adf3
@@ -22,6 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/blkdev.h>
@@ -2555,9 +2556,44 @@ asmlinkage long sys_bdflush(int func, long data)
 static kmem_cache_t *bh_cachep;
 static mempool_t *bh_mempool;
 
+/*
+ * Once the number of bh's in the machine exceeds this level, we start
+ * stripping them in writeback.
+ */
+static int max_buffer_heads;
+
+int buffer_heads_over_limit;
+
+struct bh_accounting {
+	int nr;			/* Number of live bh's */
+	int ratelimit;		/* Limit cacheline bouncing */
+};
+
+static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
+
+static void recalc_bh_state(void)
+{
+	int i;
+	int tot = 0;
+
+	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
+		return;
+	__get_cpu_var(bh_accounting).ratelimit = 0;
+	for (i = 0; i < NR_CPUS; i++)
+		tot += per_cpu(bh_accounting, i).nr;
+	buffer_heads_over_limit = (tot > max_buffer_heads);
+}
+
 struct buffer_head *alloc_buffer_head(void)
 {
-	return mempool_alloc(bh_mempool, GFP_NOFS);
+	struct buffer_head *ret = mempool_alloc(bh_mempool, GFP_NOFS);
+	if (ret) {
+		preempt_disable();
+		__get_cpu_var(bh_accounting).nr++;
+		recalc_bh_state();
+		preempt_enable();
+	}
+	return ret;
 }
 EXPORT_SYMBOL(alloc_buffer_head);
@@ -2565,6 +2601,10 @@ void free_buffer_head(struct buffer_head *bh)
 {
 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
 	mempool_free(bh, bh_mempool);
+	preempt_disable();
+	__get_cpu_var(bh_accounting).nr--;
+	recalc_bh_state();
+	preempt_enable();
 }
 EXPORT_SYMBOL(free_buffer_head);
@@ -2595,6 +2635,7 @@ static void bh_mempool_free(void *element, void *pool_data)
 void __init buffer_init(void)
 {
 	int i;
+	int nrpages;
 
 	bh_cachep = kmem_cache_create("buffer_head",
 			sizeof(struct buffer_head), 0,
@@ -2603,4 +2644,10 @@ void __init buffer_init(void)
 			bh_mempool_free, NULL);
 	for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
 		init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
+
+	/*
+	 * Limit the bh occupancy to 10% of ZONE_NORMAL
+	 */
+	nrpages = (nr_free_buffer_pages() * 10) / 100;
+	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
 }
@@ -460,6 +460,9 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
 			clear_buffer_dirty(bh);
 			bh = bh->b_this_page;
 		} while (bh != head);
+
+		if (buffer_heads_over_limit)
+			try_to_free_buffers(page);
 	}
 
 	bvec = &bio->bi_io_vec[bio->bi_idx++];
...
@@ -167,6 +167,7 @@ void wakeup_bdflush(void);
 struct buffer_head *alloc_buffer_head(void);
 void free_buffer_head(struct buffer_head * bh);
 void FASTCALL(unlock_buffer(struct buffer_head *bh));
+extern int buffer_heads_over_limit;
 
 /*
  * Generic address_space_operations implementations for buffer_head-backed
...
@@ -20,6 +20,7 @@ void __pagevec_free(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 void lru_add_drain(void);
 void pagevec_deactivate_inactive(struct pagevec *pvec);
+void pagevec_strip(struct pagevec *pvec);
 
 static inline void pagevec_init(struct pagevec *pvec)
 {
...
@@ -20,6 +20,7 @@
 #include <linux/pagevec.h>
 #include <linux/init.h>
 #include <linux/mm_inline.h>
+#include <linux/buffer_head.h>
 #include <linux/prefetch.h>
 
 /* How many pages do we try to swap or page in/out together? */
@@ -222,6 +223,23 @@ void __pagevec_lru_add(struct pagevec *pvec)
 	pagevec_release(pvec);
 }
 
+/*
+ * Try to drop buffers from the pages in a pagevec
+ */
+void pagevec_strip(struct pagevec *pvec)
+{
+	int i;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+
+		if (PagePrivate(page) && !TestSetPageLocked(page)) {
+			try_to_release_page(page, 0);
+			unlock_page(page);
+		}
+	}
+}
+
 /*
  * Perform any setup for the swap system
  */
...
@@ -433,10 +433,17 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
 		list_move(&page->lru, &zone->inactive_list);
 		if (!pagevec_add(&pvec, page)) {
 			spin_unlock_irq(&zone->lru_lock);
+			if (buffer_heads_over_limit)
+				pagevec_strip(&pvec);
 			__pagevec_release(&pvec);
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
+	if (buffer_heads_over_limit) {
+		spin_unlock_irq(&zone->lru_lock);
+		pagevec_strip(&pvec);
+		spin_lock_irq(&zone->lru_lock);
+	}
 	while (!list_empty(&l_active)) {
 		page = list_entry(l_active.prev, struct page, lru);
 		prefetchw_prev_lru_page(page, &l_active, flags);
...