Commit 48046e09 authored by Jann Horn, committed by Greg Kroah-Hartman

sched/fair: Don't free p->numa_faults with concurrent readers

commit 16d51a59 upstream.

When going through execve(), zero out the NUMA fault statistics instead of
freeing them.

During execve, the task is reachable through procfs and the scheduler. A
concurrent /proc/*/sched reader can read data from a freed ->numa_faults
allocation (confirmed by KASAN) and write it back to userspace.
I believe that it would also be possible for a use-after-free read to occur
through a race between a NUMA fault and execve(): task_numa_fault() can
lead to task_numa_compare(), which invokes task_weight() on the currently
running task of a different CPU.

Another way to fix this would be to make ->numa_faults RCU-managed or add
extra locking, but it seems easier to wipe the NUMA fault statistics on
execve.
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Fixes: 82727018 ("sched/numa: Call task_numa_free() from do_execve()")
Link: https://lkml.kernel.org/r/20190716152047.14424-1-jannh@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 02cdc166
...@@ -1826,7 +1826,7 @@ static int __do_execve_file(int fd, struct filename *filename, ...@@ -1826,7 +1826,7 @@ static int __do_execve_file(int fd, struct filename *filename,
membarrier_execve(current); membarrier_execve(current);
rseq_execve(current); rseq_execve(current);
acct_update_integrals(current); acct_update_integrals(current);
task_numa_free(current); task_numa_free(current, false);
free_bprm(bprm); free_bprm(bprm);
kfree(pathbuf); kfree(pathbuf);
if (filename) if (filename)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
extern void task_numa_fault(int last_node, int node, int pages, int flags); extern void task_numa_fault(int last_node, int node, int pages, int flags);
extern pid_t task_numa_group_id(struct task_struct *p); extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled); extern void set_numabalancing_state(bool enabled);
extern void task_numa_free(struct task_struct *p); extern void task_numa_free(struct task_struct *p, bool final);
extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
int src_nid, int dst_cpu); int src_nid, int dst_cpu);
#else #else
...@@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(struct task_struct *p) ...@@ -34,7 +34,7 @@ static inline pid_t task_numa_group_id(struct task_struct *p)
static inline void set_numabalancing_state(bool enabled) static inline void set_numabalancing_state(bool enabled)
{ {
} }
static inline void task_numa_free(struct task_struct *p) static inline void task_numa_free(struct task_struct *p, bool final)
{ {
} }
static inline bool should_numa_migrate_memory(struct task_struct *p, static inline bool should_numa_migrate_memory(struct task_struct *p,
......
...@@ -679,7 +679,7 @@ void __put_task_struct(struct task_struct *tsk) ...@@ -679,7 +679,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(tsk == current); WARN_ON(tsk == current);
cgroup_free(tsk); cgroup_free(tsk);
task_numa_free(tsk); task_numa_free(tsk, true);
security_task_free(tsk); security_task_free(tsk);
exit_creds(tsk); exit_creds(tsk);
delayacct_tsk_free(tsk); delayacct_tsk_free(tsk);
......
...@@ -2345,13 +2345,23 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, ...@@ -2345,13 +2345,23 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
return; return;
} }
void task_numa_free(struct task_struct *p) /*
* Get rid of NUMA statistics associated with a task (either current or dead).
* If @final is set, the task is dead and has reached refcount zero, so we can
* safely free all relevant data structures. Otherwise, there might be
* concurrent reads from places like load balancing and procfs, and we should
* reset the data back to default state without freeing ->numa_faults.
*/
void task_numa_free(struct task_struct *p, bool final)
{ {
struct numa_group *grp = p->numa_group; struct numa_group *grp = p->numa_group;
void *numa_faults = p->numa_faults; unsigned long *numa_faults = p->numa_faults;
unsigned long flags; unsigned long flags;
int i; int i;
if (!numa_faults)
return;
if (grp) { if (grp) {
spin_lock_irqsave(&grp->lock, flags); spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
...@@ -2364,8 +2374,14 @@ void task_numa_free(struct task_struct *p) ...@@ -2364,8 +2374,14 @@ void task_numa_free(struct task_struct *p)
put_numa_group(grp); put_numa_group(grp);
} }
if (final) {
p->numa_faults = NULL; p->numa_faults = NULL;
kfree(numa_faults); kfree(numa_faults);
} else {
p->total_numa_faults = 0;
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
numa_faults[i] = 0;
}
} }
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment