Commit 0bd4b96d authored by David Hildenbrand, committed by Michael Ellerman

powernv/memtrace: don't abuse memory hot(un)plug infrastructure for memory allocations

Let's use alloc_contig_pages() for allocating memory and remove the
linear mapping manually via arch_remove_linear_mapping(). Mark all pages
PG_offline, such that they will definitely not get touched - e.g.,
when hibernating. When freeing memory, try to revert what we did.

The original idea was discussed in:
 https://lkml.kernel.org/r/48340e96-7e6b-736f-9e23-d3111b915b6e@redhat.com

This is similar to CONFIG_DEBUG_PAGEALLOC handling on other
architectures, whereby only single pages are unmapped from the linear
mapping. Let's mimic what memory hot(un)plug would do with the linear
mapping.

We now need MEMORY_HOTPLUG and CONTIG_ALLOC as dependencies. Add a TODO
that we want to use __GFP_ZERO for clearing once alloc_contig_pages()
understands that.

Tested in QEMU/TCG with 10 GiB of main memory:
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  105.903043][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  145.042493][ T1080] radix-mmu: Mapped 0x0000000080000000-0x00000000c0000000 with 64.0 KiB pages
  [  145.049019][ T1080] memtrace: Freed trace memory back on node 0
  [  145.333960][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x80000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  213.606916][ T1080] radix-mmu: Mapped 0x0000000080000000-0x00000000c0000000 with 64.0 KiB pages
  [  213.613855][ T1080] memtrace: Freed trace memory back on node 0
  [  214.185094][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
  [root@localhost ~]# echo 0x100000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  234.874872][ T1080] radix-mmu: Mapped 0x0000000080000000-0x0000000100000000 with 64.0 KiB pages
  [  234.886974][ T1080] memtrace: Freed trace memory back on node 0
  [  234.890153][ T1080] memtrace: Failed to allocate trace memory on node 0
  [root@localhost ~]# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
  [  259.490196][ T1080] memtrace: Allocated trace memory on node 0 at 0x0000000080000000

I also made sure allocated memory is properly zeroed.

Note 1: We currently won't be allocating from ZONE_MOVABLE - because our
	pages are not movable. However, as we don't run with any memory
	hot(un)plug mechanism around, we could make an exception to
	increase the chance of allocations succeeding.

Note 2: PG_reserved isn't sufficient. E.g., kernel_page_present(), used
	along with PG_reserved in hibernation code, will always return "true"
	on powerpc, resulting in the pages getting touched. PG_reserved is
	also too generic - e.g., it also indicates boot allocations.

Note 3: For now, we keep using memory_block_size_bytes() as minimum
	granularity.
Suggested-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20201111145322.15793-9-david@redhat.com
parent ca2c36ca
...@@ -27,11 +27,11 @@ config OPAL_PRD ...@@ -27,11 +27,11 @@ config OPAL_PRD
recovery diagnostics on OpenPower machines recovery diagnostics on OpenPower machines
config PPC_MEMTRACE config PPC_MEMTRACE
bool "Enable removal of RAM from kernel mappings for tracing" bool "Enable runtime allocation of RAM for tracing"
depends on PPC_POWERNV && MEMORY_HOTREMOVE depends on PPC_POWERNV && MEMORY_HOTPLUG && CONTIG_ALLOC
help help
Enabling this option allows for the removal of memory (RAM) Enabling this option allows for runtime allocation of memory (RAM)
from the kernel mappings to be used for hardware tracing. for hardware tracing.
config PPC_VAS config PPC_VAS
bool "IBM Virtual Accelerator Switchboard (VAS)" bool "IBM Virtual Accelerator Switchboard (VAS)"
......
...@@ -51,33 +51,12 @@ static const struct file_operations memtrace_fops = { ...@@ -51,33 +51,12 @@ static const struct file_operations memtrace_fops = {
.open = simple_open, .open = simple_open,
}; };
static int check_memblock_online(struct memory_block *mem, void *arg)
{
if (mem->state != MEM_ONLINE)
return -1;
return 0;
}
static int change_memblock_state(struct memory_block *mem, void *arg)
{
unsigned long state = (unsigned long)arg;
mem->state = state;
return 0;
}
static void memtrace_clear_range(unsigned long start_pfn, static void memtrace_clear_range(unsigned long start_pfn,
unsigned long nr_pages) unsigned long nr_pages)
{ {
unsigned long pfn; unsigned long pfn;
/* /* As HIGHMEM does not apply, use clear_page() directly. */
* As pages are offline, we cannot trust the memmap anymore. As HIGHMEM
* does not apply, avoid passing around "struct page" and use
* clear_page() instead directly.
*/
for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
if (IS_ALIGNED(pfn, PAGES_PER_SECTION)) if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
cond_resched(); cond_resched();
...@@ -85,72 +64,39 @@ static void memtrace_clear_range(unsigned long start_pfn, ...@@ -85,72 +64,39 @@ static void memtrace_clear_range(unsigned long start_pfn,
} }
} }
/* called with device_hotplug_lock held */
static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
{
const unsigned long start = PFN_PHYS(start_pfn);
const unsigned long size = PFN_PHYS(nr_pages);
if (walk_memory_blocks(start, size, NULL, check_memblock_online))
return false;
walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
change_memblock_state);
if (offline_pages(start_pfn, nr_pages)) {
walk_memory_blocks(start, size, (void *)MEM_ONLINE,
change_memblock_state);
return false;
}
walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
change_memblock_state);
return true;
}
static u64 memtrace_alloc_node(u32 nid, u64 size) static u64 memtrace_alloc_node(u32 nid, u64 size)
{ {
u64 start_pfn, end_pfn, nr_pages, pfn; const unsigned long nr_pages = PHYS_PFN(size);
u64 base_pfn; unsigned long pfn, start_pfn;
u64 bytes = memory_block_size_bytes(); struct page *page;
if (!node_spanned_pages(nid)) /*
* Trace memory needs to be aligned to the size, which is guaranteed
* by alloc_contig_pages().
*/
page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
__GFP_NOWARN, nid, NULL);
if (!page)
return 0; return 0;
start_pfn = page_to_pfn(page);
start_pfn = node_start_pfn(nid);
end_pfn = node_end_pfn(nid);
nr_pages = size >> PAGE_SHIFT;
/* Trace memory needs to be aligned to the size */
end_pfn = round_down(end_pfn - nr_pages, nr_pages);
lock_device_hotplug();
for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
/* /*
* Clear the range while we still have a linear * Clear the range while we still have a linear mapping.
* mapping. *
* TODO: use __GFP_ZERO with alloc_contig_pages() once supported.
*/ */
memtrace_clear_range(base_pfn, nr_pages); memtrace_clear_range(start_pfn, nr_pages);
/* /*
* Remove memory in memory block size chunks so that * Set pages PageOffline(), to indicate that nobody (e.g., hibernation,
* iomem resources are always split to the same size and * dumping, ...) should be touching these pages.
* we never try to remove memory that spans two iomem
* resources.
*/ */
end_pfn = base_pfn + nr_pages; for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) { __SetPageOffline(pfn_to_page(pfn));
__remove_memory(nid, pfn << PAGE_SHIFT, bytes);
}
unlock_device_hotplug();
return base_pfn << PAGE_SHIFT;
}
}
unlock_device_hotplug();
return 0; arch_remove_linear_mapping(PFN_PHYS(start_pfn), size);
return PFN_PHYS(start_pfn);
} }
static int memtrace_init_regions_runtime(u64 size) static int memtrace_init_regions_runtime(u64 size)
...@@ -220,16 +166,30 @@ static int memtrace_init_debugfs(void) ...@@ -220,16 +166,30 @@ static int memtrace_init_debugfs(void)
return ret; return ret;
} }
static int online_mem_block(struct memory_block *mem, void *arg) static int memtrace_free(int nid, u64 start, u64 size)
{ {
return device_online(&mem->dev); struct mhp_params params = { .pgprot = PAGE_KERNEL };
const unsigned long nr_pages = PHYS_PFN(size);
const unsigned long start_pfn = PHYS_PFN(start);
unsigned long pfn;
int ret;
ret = arch_create_linear_mapping(nid, start, size, &params);
if (ret)
return ret;
for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
__ClearPageOffline(pfn_to_page(pfn));
free_contig_range(start_pfn, nr_pages);
return 0;
} }
/* /*
* Iterate through the chunks of memory we have removed from the kernel * Iterate through the chunks of memory we allocated and attempt to expose
* and attempt to add them back to the kernel. * them back to the kernel.
*/ */
static int memtrace_online(void) static int memtrace_free_regions(void)
{ {
int i, ret = 0; int i, ret = 0;
struct memtrace_entry *ent; struct memtrace_entry *ent;
...@@ -237,7 +197,7 @@ static int memtrace_online(void) ...@@ -237,7 +197,7 @@ static int memtrace_online(void)
for (i = memtrace_array_nr - 1; i >= 0; i--) { for (i = memtrace_array_nr - 1; i >= 0; i--) {
ent = &memtrace_array[i]; ent = &memtrace_array[i];
/* We have onlined this chunk previously */ /* We have freed this chunk previously */
if (ent->nid == NUMA_NO_NODE) if (ent->nid == NUMA_NO_NODE)
continue; continue;
...@@ -247,30 +207,25 @@ static int memtrace_online(void) ...@@ -247,30 +207,25 @@ static int memtrace_online(void)
ent->mem = 0; ent->mem = 0;
} }
if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) { if (memtrace_free(ent->nid, ent->start, ent->size)) {
pr_err("Failed to add trace memory to node %d\n", pr_err("Failed to free trace memory on node %d\n",
ent->nid); ent->nid);
ret += 1; ret += 1;
continue; continue;
} }
lock_device_hotplug();
walk_memory_blocks(ent->start, ent->size, NULL,
online_mem_block);
unlock_device_hotplug();
/* /*
* Memory was added successfully so clean up references to it * Memory was freed successfully so clean up references to it
* so on reentry we can tell that this chunk was added. * so on reentry we can tell that this chunk was freed.
*/ */
debugfs_remove_recursive(ent->dir); debugfs_remove_recursive(ent->dir);
pr_info("Added trace memory back to node %d\n", ent->nid); pr_info("Freed trace memory back on node %d\n", ent->nid);
ent->size = ent->start = ent->nid = NUMA_NO_NODE; ent->size = ent->start = ent->nid = NUMA_NO_NODE;
} }
if (ret) if (ret)
return ret; return ret;
/* If all chunks of memory were added successfully, reset globals */ /* If all chunks of memory were freed successfully, reset globals */
kfree(memtrace_array); kfree(memtrace_array);
memtrace_array = NULL; memtrace_array = NULL;
memtrace_size = 0; memtrace_size = 0;
...@@ -295,18 +250,16 @@ static int memtrace_enable_set(void *data, u64 val) ...@@ -295,18 +250,16 @@ static int memtrace_enable_set(void *data, u64 val)
mutex_lock(&memtrace_mutex); mutex_lock(&memtrace_mutex);
/* Re-add/online previously removed/offlined memory */ /* Free all previously allocated memory. */
if (memtrace_size) { if (memtrace_size && memtrace_free_regions())
if (memtrace_online())
goto out_unlock; goto out_unlock;
}
if (!val) { if (!val) {
rc = 0; rc = 0;
goto out_unlock; goto out_unlock;
} }
/* Offline and remove memory */ /* Allocate memory. */
if (memtrace_init_regions_runtime(val)) if (memtrace_init_regions_runtime(val))
goto out_unlock; goto out_unlock;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment