Commit 55b50278, authored by Andrew Morton and committed by Linus Torvalds

[PATCH] real-time enhanced page allocator and throttling

From: Robert Love <rml@tech9.net>

- Let real-time tasks dip further into the reserves than usual in
  __alloc_pages().  There are a lot of ways to special case this.  This
  patch just cuts z->pages_low in half, before doing the incremental min
  thing, for real-time tasks.  I do not do anything in the low memory slow
  path.  We can be a _lot_ more aggressive if we want.  Right now, we just
  give real-time tasks a little help.

- Never ever call balance_dirty_pages() on a real-time task.  Where and
  how exactly we handle this is up for debate.  We could, for example,
  special case real-time tasks inside balance_dirty_pages().  This would
  allow us to perform some of the work (say, waking up pdflush) but not
  other work (say, the active throttling).  As it stands now, we do the
  per-processor accounting in balance_dirty_pages_ratelimited() but we
  never call balance_dirty_pages().  Lots of approaches work.  What we want
  to do is never engage the real-time task in forced writeback.
parent 5fc4d839
......@@ -281,7 +281,9 @@ struct signal_struct {
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + 40)
#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
/*
* Some day this will be a full-fledged user tracking system..
*/
......
......@@ -84,7 +84,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *);
void page_writeback_init(void);
void balance_dirty_pages(struct address_space *mapping);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
......
......@@ -179,7 +179,6 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
/*
* Default context-switch locking:
......
......@@ -111,6 +111,7 @@ get_dirty_limits(struct page_state *ps, long *pbackground, long *pdirty)
int unmapped_ratio;
long background;
long dirty;
struct task_struct *tsk;
get_page_state(ps);
......@@ -129,7 +130,8 @@ get_dirty_limits(struct page_state *ps, long *pbackground, long *pdirty)
background = (background_ratio * total_pages) / 100;
dirty = (dirty_ratio * total_pages) / 100;
if (current->flags & PF_LESS_THROTTLE) {
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
dirty += dirty / 4;
}
......@@ -144,7 +146,7 @@ get_dirty_limits(struct page_state *ps, long *pbackground, long *pdirty)
* If we're over `background_thresh' then pdflush is woken to perform some
* writeout.
*/
void balance_dirty_pages(struct address_space *mapping)
static void balance_dirty_pages(struct address_space *mapping)
{
struct page_state ps;
long nr_reclaimable;
......@@ -219,6 +221,10 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
if (dirty_exceeded)
ratelimit = 8;
/*
* Check the rate limiting. Also, we do not want to throttle real-time
* tasks in balance_dirty_pages(). Period.
*/
if (get_cpu_var(ratelimits)++ >= ratelimit) {
__get_cpu_var(ratelimits) = 0;
put_cpu_var(ratelimits);
......
......@@ -520,7 +520,8 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
*
* Herein lies the mysterious "incremental min". That's the
*
* min += z->pages_low;
* local_low = z->pages_low;
* min += local_low;
*
* thing. The intent here is to provide additional protection to low zones for
* allocation requests which _could_ use higher zones. So a GFP_HIGHMEM
......@@ -538,10 +539,11 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
unsigned long min;
struct zone **zones, *classzone;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int i;
int cold;
int do_retry;
struct reclaim_state reclaim_state;
might_sleep_if(wait);
......@@ -558,8 +560,17 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
unsigned long local_low;
/*
* This is the fabled 'incremental min'. We let real-time tasks
* dip their real-time paws a little deeper into reserves.
*/
local_low = z->pages_low;
if (rt_task(p))
local_low >>= 1;
min += local_low;
min += z->pages_low;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold);
......@@ -582,6 +593,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
local_min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
local_min >>= 2;
if (rt_task(p))
local_min >>= 1;
min += local_min;
if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) {
......@@ -595,7 +608,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
/* here we're in the low on memory slow path */
rebalance:
if ((current->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
/* go through the zonelist yet again, ignoring mins */
for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i];
......@@ -611,14 +624,14 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
if (!wait)
goto nopage;
current->flags |= PF_MEMALLOC;
p->flags |= PF_MEMALLOC;
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
p->reclaim_state = &reclaim_state;
try_to_free_pages(classzone, gfp_mask, order);
current->reclaim_state = NULL;
current->flags &= ~PF_MEMALLOC;
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
/* go through the zonelist yet one more time */
min = 1UL << order;
......@@ -658,7 +671,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
if (!(gfp_mask & __GFP_NOWARN)) {
printk("%s: page allocation failure."
" order:%d, mode:0x%x\n",
current->comm, order, gfp_mask);
p->comm, order, gfp_mask);
}
return NULL;
got_pg:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment