Commit 2b533038, authored by Nick Piggin, committed by Greg Kroah-Hartman

[PATCH] Fix buddy list race that could lead to page lru list corruptions

Rohit found an obscure bug causing buddy list corruption.

page_is_buddy is using a non-atomic test (PagePrivate && page_count == 0)
to determine whether or not a free page's buddy is itself free and in the
buddy lists.

Each of the conjuncts may be true at different times due to unrelated
conditions, so the non-atomic page_is_buddy test may find each conjunct to
be true even if they were never both true at the same time (i.e., the page
was not on the buddy lists).

Signed-off-by: Martin Bligh <mbligh@google.com>
Signed-off-by: Rohit Seth <rohitseth@google.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
parent add92b7a
...@@ -229,10 +229,9 @@ struct page { ...@@ -229,10 +229,9 @@ struct page {
unsigned long private; /* Mapping-private opaque data: unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads * usually used for buffer_heads
* if PagePrivate set; used for * if PagePrivate set; used for
* swp_entry_t if PageSwapCache. * swp_entry_t if PageSwapCache;
* When page is free, this
* indicates order in the buddy * indicates order in the buddy
* system. * system if PG_buddy is set.
*/ */
struct address_space *mapping; /* If low bit clear, points to struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL. * inode address_space, or NULL.
......
...@@ -74,7 +74,9 @@ ...@@ -74,7 +74,9 @@
#define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ #define PG_mappedtodisk 16 /* Has blocks allocated on-disk */
#define PG_reclaim 17 /* To be reclaimed asap */ #define PG_reclaim 17 /* To be reclaimed asap */
#define PG_nosave_free 18 /* Free, should not be written */ #define PG_nosave_free 18 /* Free, should not be written */
#define PG_uncached 19 /* Page has been mapped as uncached */ #define PG_buddy 19 /* Page is free, on buddy lists */
#define PG_uncached 20 /* Page has been mapped as uncached */
/* /*
* Global page accounting. One instance per CPU. Only unsigned longs are * Global page accounting. One instance per CPU. Only unsigned longs are
...@@ -319,6 +321,10 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta); ...@@ -319,6 +321,10 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
#define SetPageNosaveFree(page) set_bit(PG_nosave_free, &(page)->flags) #define SetPageNosaveFree(page) set_bit(PG_nosave_free, &(page)->flags)
#define ClearPageNosaveFree(page) clear_bit(PG_nosave_free, &(page)->flags) #define ClearPageNosaveFree(page) clear_bit(PG_nosave_free, &(page)->flags)
#define PageBuddy(page) test_bit(PG_buddy, &(page)->flags)
#define __SetPageBuddy(page) __set_bit(PG_buddy, &(page)->flags)
#define __ClearPageBuddy(page) __clear_bit(PG_buddy, &(page)->flags)
#define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags)
#define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
#define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
......
...@@ -153,7 +153,8 @@ static void bad_page(struct page *page) ...@@ -153,7 +153,8 @@ static void bad_page(struct page *page)
1 << PG_reclaim | 1 << PG_reclaim |
1 << PG_slab | 1 << PG_slab |
1 << PG_swapcache | 1 << PG_swapcache |
1 << PG_writeback ); 1 << PG_writeback |
1 << PG_buddy );
set_page_count(page, 0); set_page_count(page, 0);
reset_page_mapcount(page); reset_page_mapcount(page);
page->mapping = NULL; page->mapping = NULL;
...@@ -224,12 +225,12 @@ static inline unsigned long page_order(struct page *page) { ...@@ -224,12 +225,12 @@ static inline unsigned long page_order(struct page *page) {
static inline void set_page_order(struct page *page, int order) { static inline void set_page_order(struct page *page, int order) {
set_page_private(page, order); set_page_private(page, order);
__SetPagePrivate(page); __SetPageBuddy(page);
} }
static inline void rmv_page_order(struct page *page) static inline void rmv_page_order(struct page *page)
{ {
__ClearPagePrivate(page); __ClearPageBuddy(page);
set_page_private(page, 0); set_page_private(page, 0);
} }
...@@ -268,11 +269,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order) ...@@ -268,11 +269,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* This function checks whether a page is free && is the buddy * This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if * we can do coalesce a page and its buddy if
* (a) the buddy is not in a hole && * (a) the buddy is not in a hole &&
* (b) the buddy is free && * (b) the buddy is in the buddy system &&
* (c) the buddy is on the buddy system && * (c) a page and its buddy have the same order.
* (d) a page and its buddy have the same order. *
* for recording page's order, we use page_private(page) and PG_private. * For recording whether a page is in the buddy system, we use PG_buddy.
* Setting, clearing, and testing PG_buddy is serialized by zone->lock.
* *
* For recording page's order, we use page_private(page).
*/ */
static inline int page_is_buddy(struct page *page, int order) static inline int page_is_buddy(struct page *page, int order)
{ {
...@@ -281,10 +284,10 @@ static inline int page_is_buddy(struct page *page, int order) ...@@ -281,10 +284,10 @@ static inline int page_is_buddy(struct page *page, int order)
return 0; return 0;
#endif #endif
if (PagePrivate(page) && if (PageBuddy(page) && page_order(page) == order) {
(page_order(page) == order) && BUG_ON(page_count(page) != 0);
page_count(page) == 0)
return 1; return 1;
}
return 0; return 0;
} }
...@@ -301,7 +304,7 @@ static inline int page_is_buddy(struct page *page, int order) ...@@ -301,7 +304,7 @@ static inline int page_is_buddy(struct page *page, int order)
* as necessary, plus some accounting needed to play nicely with other * as necessary, plus some accounting needed to play nicely with other
* parts of the VM system. * parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous * At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PG_Private.Page's * free pages of length of (1 << order) and marked with PG_buddy. Page's
* order is recorded in page_private(page) field. * order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the * So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were * other. That is, if we allocate a small block, and both were
...@@ -364,7 +367,8 @@ static inline int free_pages_check(struct page *page) ...@@ -364,7 +367,8 @@ static inline int free_pages_check(struct page *page)
1 << PG_slab | 1 << PG_slab |
1 << PG_swapcache | 1 << PG_swapcache |
1 << PG_writeback | 1 << PG_writeback |
1 << PG_reserved )))) 1 << PG_reserved |
1 << PG_buddy ))))
bad_page(page); bad_page(page);
if (PageDirty(page)) if (PageDirty(page))
__ClearPageDirty(page); __ClearPageDirty(page);
...@@ -522,7 +526,8 @@ static int prep_new_page(struct page *page, int order) ...@@ -522,7 +526,8 @@ static int prep_new_page(struct page *page, int order)
1 << PG_slab | 1 << PG_slab |
1 << PG_swapcache | 1 << PG_swapcache |
1 << PG_writeback | 1 << PG_writeback |
1 << PG_reserved )))) 1 << PG_reserved |
1 << PG_buddy ))))
bad_page(page); bad_page(page);
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment