[PATCH] real-time enhanced page allocator and throttling

From: Robert Love <rml@tech9.net>

- Let real-time tasks dip further into the reserves than usual in
  __alloc_pages(). There are a lot of ways to special case this. This patch
  just cuts z->pages_low in half, before doing the incremental min thing, for
  real-time tasks. I do not do anything in the low memory slow path. We can
  be a _lot_ more aggressive if we want. Right now, we just give real-time
  tasks a little help.

- Never ever call balance_dirty_pages() on a real-time task. Where and how
  exactly we handle this is up for debate. We could, for example, special
  case real-time tasks inside balance_dirty_pages(). This would allow us to
  perform some of the work (say, waking up pdflush) but not other work (say,
  the active throttling). As it stands now, we do the per-processor
  accounting in balance_dirty_pages_ratelimited() but we never call
  balance_dirty_pages(). Lots of approaches work. What we want to do is
  never engage the real-time task in forced writeback.

[PATCH] real-time enhanced page allocator and throttling
From: Robert Love <rml@tech9.net>

- Let real-time tasks dip further into the reserves than usual in
  __alloc_pages(). There are a lot of ways to special case this. This patch
  just cuts z->pages_low in half, before doing the incremental min thing, for
  real-time tasks. I do not do anything in the low memory slow path. We can
  be a _lot_ more aggressive if we want. Right now, we just give real-time
  tasks a little help.

- Never ever call balance_dirty_pages() on a real-time task. Where and how
  exactly we handle this is up for debate. We could, for example, special
  case real-time tasks inside balance_dirty_pages(). This would allow us to
  perform some of the work (say, waking up pdflush) but not other work (say,
  the active throttling). As it stands now, we do the per-processor
  accounting in balance_dirty_pages_ratelimited() but we never call
  balance_dirty_pages(). Lots of approaches work. What we want to do is
  never engage the real-time task in forced writeback.
55b50278 · Andrew Morton · Linus Torvalds · 5fc4d839 · 55b50278 · 55b50278
Commit 55b50278 authored Sep 21, 2003 by Andrew Morton Committed by Linus Torvalds Sep 21, 2003
5 changed files
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -281,7 +281,9 @@ struct signal_struct {
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO

 #define MAX_PRIO		(MAX_RT_PRIO + 40)
- 
+
+#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
+
 /*
 * Some day this will be a full-fledged user tracking system..
 */

--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -84,7 +84,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
 				      void __user *, size_t *);

 void page_writeback_init(void);
-void balance_dirty_pages(struct address_space *mapping);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -179,7 +179,6 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
-#define rt_task(p)		((p)->prio < MAX_RT_PRIO)

 /*
 * Default context-switch locking:

--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -111,6 +111,7 @@ get_dirty_limits(struct page_state *ps, long *pbackground, long *pdirty)
 	int unmapped_ratio;
 	long background;
 	long dirty;
+	struct task_struct *tsk;

 	get_page_state(ps);

@@ -129,7 +130,8 @@ get_dirty_limits(struct page_state *ps, long *pbackground, long *pdirty)

 	background = (background_ratio * total_pages) / 100;
 	dirty = (dirty_ratio * total_pages) / 100;
-	if (current->flags & PF_LESS_THROTTLE) {
+	tsk = current;
+	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
 		background += background / 4;
 		dirty += dirty / 4;
 	}
@@ -144,7 +146,7 @@ get_dirty_limits(struct page_state *ps, long *pbackground, long *pdirty)
 * If we're over `background_thresh' then pdflush is woken to perform some
 * writeout.
 */
-void balance_dirty_pages(struct address_space *mapping)
+static void balance_dirty_pages(struct address_space *mapping)
 {
 	struct page_state ps;
 	long nr_reclaimable;
@@ -219,6 +221,10 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	if (dirty_exceeded)
 		ratelimit = 8;

+	/*
+	 * Check the rate limiting. Also, we do not want to throttle real-time
+	 * tasks in balance_dirty_pages(). Period.
+	 */
 	if (get_cpu_var(ratelimits)++ >= ratelimit) {
 		__get_cpu_var(ratelimits) = 0;
 		put_cpu_var(ratelimits);

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -520,7 +520,8 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
 *
 * Herein lies the mysterious "incremental min".  That's the
 *
- *	min += z->pages_low;
+ *	local_low = z->pages_low;
+ *	min += local_low;
 *
 * thing.  The intent here is to provide additional protection to low zones for
 * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
@@ -538,10 +539,11 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	unsigned long min;
 	struct zone **zones, *classzone;
 	struct page *page;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
 	int i;
 	int cold;
 	int do_retry;
-	struct reclaim_state reclaim_state;

 	might_sleep_if(wait);

@@ -558,8 +560,17 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
+		unsigned long local_low;
+
+		/*
+		 * This is the fabled 'incremental min'. We let real-time tasks
+		 * dip their real-time paws a little deeper into reserves.
+		 */
+		local_low = z->pages_low;
+		if (rt_task(p))
+			local_low >>= 1;
+		min += local_low;

-		min += z->pages_low;
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
@@ -582,6 +593,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		local_min = z->pages_min;
 		if (gfp_mask & __GFP_HIGH)
 			local_min >>= 2;
+		if (rt_task(p))
+			local_min >>= 1;
 		min += local_min;
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
@@ -595,7 +608,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	/* here we're in the low on memory slow path */

 rebalance:
-	if ((current->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
+	if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *z = zones[i];
@@ -611,14 +624,14 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	if (!wait)
 		goto nopage;

-	current->flags |= PF_MEMALLOC;
+	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
-	current->reclaim_state = &reclaim_state;
+	p->reclaim_state = &reclaim_state;

 	try_to_free_pages(classzone, gfp_mask, order);

-	current->reclaim_state = NULL;
-	current->flags &= ~PF_MEMALLOC;
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;

 	/* go through the zonelist yet one more time */
 	min = 1UL << order;
@@ -658,7 +671,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	if (!(gfp_mask & __GFP_NOWARN)) {
 		printk("%s: page allocation failure."
 			" order:%d, mode:0x%x\n",
-			current->comm, order, gfp_mask);
+			p->comm, order, gfp_mask);
 	}
 	return NULL;
 got_pg: