Commit 6b500267 authored by David Rientjes, committed by Jiri Slaby

mm, pcp: allow restoring percpu_pagelist_fraction default

commit 7cd2b0a3 upstream.

Oleg reports a division by zero error on zero-length write() to the
percpu_pagelist_fraction sysctl:

    divide error: 0000 [#1] SMP DEBUG_PAGEALLOC
    CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19
    Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
    task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000
    RIP: 0010: percpu_pagelist_fraction_sysctl_handler+0x84/0x120
    RSP: 0018:ffff8800d87a3e78  EFLAGS: 00010246
    RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000
    RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010
    RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50
    R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060
    R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800
    FS:  00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
    CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0
    Call Trace:
      proc_sys_call_handler+0xb3/0xc0
      proc_sys_write+0x14/0x20
      vfs_write+0xba/0x1e0
      SyS_write+0x46/0xb0
      tracesys+0xe1/0xe6

However, if the percpu_pagelist_fraction sysctl is set by the user, it
is also impossible to restore it to the kernel default since the user
cannot write 0 to the sysctl.

This patch allows the user to write 0 to restore the default behavior.
It still requires a fraction equal to or larger than 8, however, as
stated by the documentation for sanity.  If a value in the range [1, 7]
is written, the sysctl will return EINVAL.

This successfully solves the divide by zero issue at the same time.

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
parent 78a97b52
...@@ -664,7 +664,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is ...@@ -664,7 +664,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is
set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
The initial value is zero. Kernel does not use this value at boot time to set The initial value is zero. Kernel does not use this value at boot time to set
the high water marks for each per cpu page list. the high water marks for each per cpu page list. If the user writes '0' to this
sysctl, it will revert to this default behavior.
============================================================== ==============================================================
......
...@@ -138,7 +138,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; ...@@ -138,7 +138,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535; static int maxolduid = 65535;
static int minolduid; static int minolduid;
static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX; static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP; static const int cap_last_cap = CAP_LAST_CAP;
...@@ -1287,7 +1286,7 @@ static struct ctl_table vm_table[] = { ...@@ -1287,7 +1286,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction), .maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644, .mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler, .proc_handler = percpu_pagelist_fraction_sysctl_handler,
.extra1 = &min_percpu_pagelist_fract, .extra1 = &zero,
}, },
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
{ {
......
...@@ -69,6 +69,7 @@ ...@@ -69,6 +69,7 @@
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock); static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node); DEFINE_PER_CPU(int, numa_node);
...@@ -4079,7 +4080,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) ...@@ -4079,7 +4080,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif #endif
static int __meminit zone_batchsize(struct zone *zone) static int zone_batchsize(struct zone *zone)
{ {
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
int batch; int batch;
...@@ -4195,8 +4196,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, ...@@ -4195,8 +4196,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
pageset_update(&p->pcp, high, batch); pageset_update(&p->pcp, high, batch);
} }
static void __meminit pageset_set_high_and_batch(struct zone *zone, static void pageset_set_high_and_batch(struct zone *zone,
struct per_cpu_pageset *pcp) struct per_cpu_pageset *pcp)
{ {
if (percpu_pagelist_fraction) if (percpu_pagelist_fraction)
pageset_set_high(pcp, pageset_set_high(pcp,
...@@ -5789,23 +5790,38 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ...@@ -5789,23 +5790,38 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos) void __user *buffer, size_t *length, loff_t *ppos)
{ {
struct zone *zone; struct zone *zone;
unsigned int cpu; int old_percpu_pagelist_fraction;
int ret; int ret;
mutex_lock(&pcp_batch_high_lock);
old_percpu_pagelist_fraction = percpu_pagelist_fraction;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos); ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || (ret < 0)) if (!write || ret < 0)
return ret; goto out;
/* Sanity checking to avoid pcp imbalance */
if (percpu_pagelist_fraction &&
percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
percpu_pagelist_fraction = old_percpu_pagelist_fraction;
ret = -EINVAL;
goto out;
}
/* No change? */
if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
goto out;
mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) { for_each_populated_zone(zone) {
unsigned long high; unsigned int cpu;
high = zone->managed_pages / percpu_pagelist_fraction;
for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
pageset_set_high(per_cpu_ptr(zone->pageset, cpu), pageset_set_high_and_batch(zone,
high); per_cpu_ptr(zone->pageset, cpu));
} }
out:
mutex_unlock(&pcp_batch_high_lock); mutex_unlock(&pcp_batch_high_lock);
return 0; return ret;
} }
int hashdist = HASHDIST_DEFAULT; int hashdist = HASHDIST_DEFAULT;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment