Commit 0c0e6195, authored by KAMEZAWA Hiroyuki, committed by Linus Torvalds

memory unplug: page offline

Logic.
 - Mark all pages in [start, end) with the isolated migration type.
   This makes every free page in the range unavailable for allocation.
 - Migrate all LRU pages in the range.
 - Test whether the refcount of every page in the range is zero.

Todo:
 - allocate migration destination page from better area.
 - confirm that a page with page_count(page) == 0 && PageReserved(page) is safe to be freed.
   (I don't like this kind of page, but...)
 - Find out pages which cannot be migrated.
 - more running tests.
 - Use reclaim for unplugging other memory type area.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent a5d76b54
...@@ -305,6 +305,9 @@ config HOTPLUG_CPU ...@@ -305,6 +305,9 @@ config HOTPLUG_CPU
config ARCH_ENABLE_MEMORY_HOTPLUG config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y def_bool y
# Always 'y' here. The generic MEMORY_HOTREMOVE option depends on this symbol.
config ARCH_ENABLE_MEMORY_HOTREMOVE
def_bool y
config SCHED_SMT config SCHED_SMT
bool "SMT scheduler support" bool "SMT scheduler support"
depends on SMP depends on SMP
......
...@@ -35,6 +35,7 @@ extern const char linux_proc_banner[]; ...@@ -35,6 +35,7 @@ extern const char linux_proc_banner[];
#define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1) #define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1)
#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) #define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) #define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
/* Non-zero when x is an exact multiple of a; a is first cast to x's type. */
#define IS_ALIGNED(x, a)	(!((x) % ((typeof(x))(a))))
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
......
...@@ -58,7 +58,10 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); ...@@ -58,7 +58,10 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
extern void online_page(struct page *page); extern void online_page(struct page *page);
/* VM interface that may be used by firmware interface */ /* VM interface that may be used by firmware interface */
extern int online_pages(unsigned long, unsigned long); extern int online_pages(unsigned long, unsigned long);
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * offline_pages(start_pfn, end_pfn, timeout): try to offline the pages in
 * [start_pfn, end_pfn). Returns 0 on success or a negative errno.
 */
extern int offline_pages(unsigned long, unsigned long, unsigned long);
/*
 * __offline_isolated_pages(start_pfn, end_pfn): take the already isolated
 * free pages in [start_pfn, end_pfn) off the free lists and mark them
 * Reserved.
 */
extern void __offline_isolated_pages(unsigned long, unsigned long);
#endif
/* reasonably generic interface to expand the physical pages in a zone */ /* reasonably generic interface to expand the physical pages in a zone */
extern int __add_pages(struct zone *zone, unsigned long start_pfn, extern int __add_pages(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages); unsigned long nr_pages);
......
...@@ -139,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE ...@@ -139,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE
def_bool y def_bool y
depends on SPARSEMEM && MEMORY_HOTPLUG depends on SPARSEMEM && MEMORY_HOTPLUG
# Memory hot remove is offered only when memory hotplug is enabled, the
# architecture sets ARCH_ENABLE_MEMORY_HOTREMOVE, and page migration
# (MIGRATION) is enabled.
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
# Heavily threaded applications may benefit from splitting the mm-wide # Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address # page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS. # space can be handled with less contention: split it at this NR_CPUS.
......
...@@ -23,6 +23,9 @@ ...@@ -23,6 +23,9 @@
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/ioport.h> #include <linux/ioport.h>
#include <linux/cpuset.h> #include <linux/cpuset.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
...@@ -302,3 +305,254 @@ int add_memory(int nid, u64 start, u64 size) ...@@ -302,3 +305,254 @@ int add_memory(int nid, u64 start, u64 size)
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(add_memory); EXPORT_SYMBOL_GPL(add_memory);
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Check that every page in the range [start_pfn, end_pfn) belongs to one
 * and the same zone.
 *
 * The range is sampled in MAX_ORDER_NR_PAGES steps: in each step the first
 * pfn accepted by pfn_valid_within() represents the whole chunk.
 *
 * Returns 1 when all sampled pages share a zone, 0 otherwise.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	struct zone *seen = NULL;
	unsigned long base;

	for (base = start_pfn; base < end_pfn; base += MAX_ORDER_NR_PAGES) {
		struct zone *cur;
		int off;

		/* This is just a CONFIG_HOLES_IN_ZONE check. */
		for (off = 0; off < MAX_ORDER_NR_PAGES; off++)
			if (pfn_valid_within(base + off))
				break;

		/* Nothing usable in this chunk, so nothing to compare. */
		if (off == MAX_ORDER_NR_PAGES)
			continue;

		cur = page_zone(pfn_to_page(base + off));
		if (seen && cur != seen)
			return 0;
		seen = cur;
	}

	return 1;
}
/*
 * Scanning pfns is much easier than scanning the LRU lists.
 * Walk the pfns in [start, end) and look for a page that is on an LRU.
 *
 * Returns the pfn of the first valid page with PageLRU set, or 0 when no
 * such page exists in the range (so a hit on pfn 0 cannot be told apart
 * from "nothing found").
 *
 * The return type is unsigned long, matching the pfn type: returning a pfn
 * through an int would truncate large pfns.
 */
unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;

	for (pfn = start; pfn < end; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (PageLRU(page))
			return pfn;
	}
	return 0;
}
/*
 * Allocation callback handed to migrate_pages() by do_migrate_range().
 * All three arguments are ignored; the destination is simply one freshly
 * allocated GFP_HIGHUSER_PAGECACHE page.
 */
static struct page *
hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
{
	struct page *newpage;

	/* This should be improoooooved!! */
	newpage = alloc_page(GFP_HIGHUSER_PAGECACHE);

	return newpage;
}
/* Upper bound on the number of pages isolated and migrated per call. */
#define NR_OFFLINE_AT_ONCE_PAGES	(256)

/*
 * Isolate up to NR_OFFLINE_AT_ONCE_PAGES in-use LRU pages from
 * [start_pfn, end_pfn) and migrate them elsewhere.
 *
 * Returns -EBUSY when a page that is still referenced could not be taken
 * off the LRU (anything already isolated is put back), 0 when there was
 * nothing to migrate, and otherwise whatever migrate_pages() returns.
 */
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	LIST_HEAD(source);
	unsigned long pfn;
	int budget = NR_OFFLINE_AT_ONCE_PAGES;
	int busy = 0;

	for (pfn = start_pfn; pfn < end_pfn && budget > 0; pfn++) {
		struct page *page;

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		/* Free pages can be skipped. */
		if (page_count(page) == 0)
			continue;

		/* Only pages that sit on an LRU can be dealt with. */
		if (isolate_lru_page(page, &source) == 0) {
			budget--;
			continue;
		}

		/*
		 * Because we don't hold the big zone->lock, the page may have
		 * been freed in the meantime: look at the refcount again.
		 */
		if (page_count(page))
			busy++;
#ifdef CONFIG_DEBUG_VM
		printk(KERN_INFO "removing from LRU failed"
				 " %lx/%d/%lx\n",
			pfn, page_count(page), page->flags);
#endif
	}

	if (busy) {
		if (!list_empty(&source))
			putback_lru_pages(&source);
		return -EBUSY;
	}

	if (list_empty(&source))
		return 0;

	/* this function returns # of failed pages */
	return migrate_pages(&source, hotremove_migrate_alloc, 0);
}
/*
 * walk_memory_resource() callback: remove the pages of one chunk from
 * free_area[] and mark them all as Reserved. @data is unused.
 * Always returns 0.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			  void *data)
{
	unsigned long end = start + nr_pages;

	__offline_isolated_pages(start, end);

	return 0;
}
/*
 * Run offline_isolated_pages_cb() over [start_pfn, end_pfn) by way of
 * walk_memory_resource(). The walker's return value is ignored.
 */
static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long nr_pages = end_pfn - start_pfn;

	walk_memory_resource(start_pfn, nr_pages, NULL,
			     offline_isolated_pages_cb);
}
/*
 * walk_memory_resource() callback: check that all pages of one chunk,
 * recorded as memory resource, are isolated.
 *
 * @data points to a long accumulator; when test_pages_isolated() returns 0
 * for the chunk, nr_pages is added to it. The result of
 * test_pages_isolated() is passed back unchanged.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	long *total = data;
	int ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);

	if (ret == 0)
		*total += (long)nr_pages;

	return ret;
}
/*
 * Count the pages in [start_pfn, end_pfn) that check_pages_isolated_cb()
 * accepted as isolated.
 *
 * Returns that count, or the (negative) value reported by
 * walk_memory_resource() when the walk fails.
 */
static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int err;

	err = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
				   check_pages_isolated_cb);

	return err < 0 ? (long)err : offlined;
}
extern void drain_all_local_pages(void);

/*
 * Try to offline all pages in [start_pfn, end_pfn).
 *
 * @start_pfn, @end_pfn: range to offline; both must be aligned to
 *                       pageblock_nr_pages and the range must lie within a
 *                       single zone.
 * @timeout:             added to jiffies to form the deadline for the
 *                       whole operation.
 *
 * The range is first marked isolated, then LRU pages are migrated away in
 * batches until none are left, and finally the isolated free pages are
 * pulled off the free lists and the zone/global page counters adjusted.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL  misaligned range or range spanning more than one zone
 *   -EAGAIN  the deadline passed
 *   -EINTR   a signal is pending for the caller
 *   -EBUSY   some pages in the range are still not isolated
 *   other    error from start_isolate_page_range(), or the error from
 *            do_migrate_range() once the retry budget is used up
 * On every failure after isolation started, the isolation is undone.
 */
int offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, expire;
	long offlined_pages;
	int ret, drain, retry_max;
	struct zone *zone;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   we assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;
	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn);
	if (ret)
		return ret;

	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	if (drain) {
		lru_add_drain_all();
		flush_scheduled_work();
		cond_resched();
		drain_all_local_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		/* Only hard errors (ret < 0) consume the retry budget. */
		if (ret < 0 && --retry_max == 0)
			goto failed_removal;
		/* Some pages failed to migrate: give others a chance to run. */
		if (ret)
			yield();
		drain = 1;
		goto repeat;
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	flush_scheduled_work();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_local_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/*
	 * Reset pagetype flags. The range was isolated above, so the reset
	 * must go through undo_isolate_page_range(), as on the failure path;
	 * calling start_isolate_page_range() a second time would leave the
	 * pageblocks marked isolated.
	 */
	undo_isolate_page_range(start_pfn, end_pfn);
	/* removal success */
	zone = page_zone(pfn_to_page(start_pfn));
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;
	num_physpages -= offlined_pages;
	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
		start_pfn, end_pfn);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn);
	return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
...@@ -4477,3 +4477,50 @@ void unset_migratetype_isolate(struct page *page) ...@@ -4477,3 +4477,50 @@ void unset_migratetype_isolate(struct page *page)
out: out:
spin_unlock_irqrestore(&zone->lock, flags); spin_unlock_irqrestore(&zone->lock, flags);
} }
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Take every free (buddy) page in [start_pfn, end_pfn) off the free lists
 * and mark each of its base pages Reserved.
 *
 * All pages in the range must be isolated before calling this: any valid
 * pfn reached by the walk that has a non-zero refcount or is not a buddy
 * page triggers BUG_ON().
 */
void
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	struct zone *zone;
	unsigned long flags;
	unsigned long pfn = start_pfn;

	/* find the first valid pfn */
	while (pfn < end_pfn && !pfn_valid(pfn))
		pfn++;
	if (pfn == end_pfn)
		return;

	/* The zone of that first valid page provides the lock for the walk. */
	zone = page_zone(pfn_to_page(pfn));
	spin_lock_irqsave(&zone->lock, flags);

	for (pfn = start_pfn; pfn < end_pfn; ) {
		struct page *head;
		int order, nr, i;

		if (!pfn_valid(pfn)) {
			pfn++;
			continue;
		}

		head = pfn_to_page(pfn);
		BUG_ON(page_count(head));
		BUG_ON(!PageBuddy(head));
		order = page_order(head);
		nr = 1 << order;
#ifdef CONFIG_DEBUG_VM
		printk(KERN_INFO "remove from free list %lx %d %lx\n",
			pfn, nr, end_pfn);
#endif
		/* Unlink the free block and fix up the free-page accounting. */
		list_del(&head->lru);
		rmv_page_order(head);
		zone->free_area[order].nr_free--;
		__mod_zone_page_state(zone, NR_FREE_PAGES,
				      - (1UL << order));

		for (i = 0; i < nr; i++)
			SetPageReserved(head + i);

		/* Step over the whole block that was just removed. */
		pfn += nr;
	}

	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment