Commit 75908778 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] implement __GFP_REPEAT, __GFP_NOFAIL, __GFP_NORETRY

This is a cleanup patch.

There are quite a lot of places in the kernel which will infinitely retry a
memory allocation.

Generally, they get it wrong.  Some do yield(), the semantics of which have
changed over time.  Some do schedule(), which can lock up if the caller is
SCHED_FIFO/RR.  Some do schedule_timeout(), etc.
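
For illustration, the open-coded pattern in question looks something like
this (a hypothetical caller, not a specific call site from the tree):

	void *p;

	do {
		p = (void *)__get_free_page(GFP_KERNEL);
		if (p == NULL)
			yield();	/* or schedule(), or schedule_timeout(1)... */
	} while (p == NULL);

Every call site picks its own way of waiting, and each can get it subtly
wrong.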

And often it is unnecessary, because the page allocator will do the retry
internally anyway.  But we cannot rely on that - this behaviour may change
(-aa and -rmap kernels do not do this, for instance).

So it is good to formalise and to centralise this operation.  If an
allocation specifies __GFP_NOFAIL then the page allocator must retry the
allocation infinitely.

The semantics of __GFP_REPEAT are "try harder".  The allocation _may_ fail
(the 2.4 -aa and -rmap VMs do not retry infinitely by default).

The semantics of __GFP_NOFAIL are "cannot fail".  It is a no-op in this VM,
but needs to be honoured (or the callers fixed up) if the VM is changed to
not retry infinitely by default.

The semantics of __GFP_NORETRY are "try once, don't loop".  This isn't used
at present (although perhaps it should be, in swapoff).  It is mainly for
completeness.
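
To make the intent concrete, here is a minimal sketch of how callers might
use the new modifiers (hypothetical callers, not part of this patch):

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/* Caller which cannot tolerate failure: the retry policy travels in
	 * the gfp_mask, and the page allocator does the waiting and looping. */
	static struct page *must_have_a_page(void)
	{
		return alloc_pages(GFP_KERNEL | __GFP_NOFAIL, 0);
	}

	/* Caller with a fallback path: try once, fail fast, don't loop. */
	static struct page *nice_to_have_a_page(void)
	{
		return alloc_pages(GFP_KERNEL | __GFP_NORETRY, 0);
	}

Either way the open-coded retry loop disappears from the caller.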
parent efbb77b2
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -11,13 +11,26 @@
 #define __GFP_DMA	0x01
 #define __GFP_HIGHMEM	0x02
 
-/* Action modifiers - doesn't change the zoning */
+/*
+ * Action modifiers - doesn't change the zoning
+ *
+ * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt
+ * _might_ fail.  This depends upon the particular VM implementation.
+ *
+ * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
+ * cannot handle allocation failures.
+ *
+ * __GFP_NORETRY: The VM implementation must not retry indefinitely.
+ */
 #define __GFP_WAIT	0x10	/* Can wait and reschedule? */
 #define __GFP_HIGH	0x20	/* Should access emergency pools? */
 #define __GFP_IO	0x40	/* Can start physical IO? */
 #define __GFP_FS	0x80	/* Can call down to low-level FS? */
 #define __GFP_COLD	0x100	/* Cache-cold page required */
 #define __GFP_NOWARN	0x200	/* Suppress page allocation failure warning */
+#define __GFP_REPEAT	0x400	/* Retry the allocation.  Might fail */
+#define __GFP_NOFAIL	0x800	/* Retry for ever.  Cannot fail */
+#define __GFP_NORETRY	0x1000	/* Do not retry.  Might fail */
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -22,7 +22,7 @@ typedef struct kmem_cache_s kmem_cache_t;
 #define SLAB_KERNEL		GFP_KERNEL
 #define SLAB_DMA		GFP_DMA
 
-#define SLAB_LEVEL_MASK		(__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|__GFP_COLD|__GFP_NOWARN)
+#define SLAB_LEVEL_MASK		(__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|__GFP_NORETRY)
 
 #define SLAB_NO_GROW		0x00001000UL	/* don't grow a cache */
 
 /* flags to pass to kmem_cache_create().
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -536,6 +536,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	struct page *page;
 	int i;
 	int cold;
+	int do_retry;
 
 	if (wait)
 		might_sleep();
@@ -626,10 +627,21 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	}
 
 	/*
-	 * Don't let big-order allocations loop.  Yield for kswapd, try again.
+	 * Don't let big-order allocations loop unless the caller explicitly
+	 * requests that.  Wait for some write requests to complete then retry.
+	 *
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
+	 * may not be true in other implementations.
 	 */
-	if (order <= 3) {
-		yield();
+	do_retry = 0;
+	if (!(gfp_mask & __GFP_NORETRY)) {
+		if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+			do_retry = 1;
+		if (gfp_mask & __GFP_NOFAIL)
+			do_retry = 1;
+	}
+	if (do_retry) {
+		blk_congestion_wait(WRITE, HZ/50);
 		goto rebalance;
 	}
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -805,8 +805,7 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
  * excessive rotation of the inactive list, which is _supposed_ to be an LRU,
  * yes?
  */
-int
-try_to_free_pages(struct zone *classzone,
+int try_to_free_pages(struct zone *classzone,
 		unsigned int gfp_mask, unsigned int order)
 {
 	int priority;
@@ -838,7 +837,7 @@ try_to_free_pages(struct zone *classzone,
 		blk_congestion_wait(WRITE, HZ/10);
 		shrink_slab(total_scanned, gfp_mask);
 	}
-	if (gfp_mask & __GFP_FS)
+	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
 		out_of_memory();
 	return 0;
 }